/*
 * Copyright (c) 2003-2008 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * 1) ramesh is looking into how to replace taking a reference on
 *    the user's map (vm_map_reference()) since it is believed that
 *    would not hold the process for us.
 * 2) david is looking into a way for us to set the priority of the
 *    worker threads to match that of the user's thread when the
 *    async IO was queued.
 */

/*
 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
 */
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file_internal.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/vnode_internal.h>
#include <sys/malloc.h>
#include <sys/mount_internal.h>
#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/sysctl.h>
#include <sys/unistd.h>

#include <sys/aio_kern.h>
#include <sys/sysproto.h>

#include <machine/limits.h>

#include <mach/mach_types.h>
#include <kern/kern_types.h>
#include <kern/zalloc.h>
#include <kern/task.h>
#include <kern/sched_prim.h>

#include <vm/vm_map.h>

#include <libkern/OSAtomic.h>

#include <sys/kdebug.h>
#define AIO_work_queued                 1
#define AIO_worker_wake                 2
#define AIO_completion_sig              3
#define AIO_completion_cleanup_wait     4
#define AIO_completion_cleanup_wake     5
#define AIO_completion_suspend_wake     6
#define AIO_fsync_delay                 7

#define AIO_cancel_async_workq          11
#define AIO_cancel_sync_workq           12
#define AIO_cancel_activeq              13
#define AIO_cancel_doneq                14

#define AIO_error_val                   61
#define AIO_error_activeq               62
#define AIO_error_workq                 63

#define AIO_return_val                  71
#define AIO_return_activeq              72
#define AIO_return_workq                73

#define AIO_exit_sleep                  91
#define AIO_close                       100
#define AIO_close_sleep                 101
#define AIO_suspend                     110
#define AIO_suspend_sleep               111
#define AIO_worker_thread               120

#define KERNEL_DEBUG                    KERNEL_DEBUG_CONSTANT
/*
 * aio requests queue up on the aio_async_workq or lio_sync_workq (for
 * lio_listio LIO_WAIT).  Requests then move to the per process aio_activeq
 * (proc.aio_activeq) when one of our worker threads starts the IO.
 * And finally, requests move to the per process aio_doneq (proc.aio_doneq)
 * when the IO request completes.  The request remains on aio_doneq until
 * the user process calls aio_return or the process exits; either way, that
 * is our trigger to release aio resources.
 */
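/*
 * Illustrative summary (a sketch, not compiled; it restates the flow described
 * above in terms of the enqueue/worker/return paths later in this file):
 *
 *     aio_read()/aio_write()/aio_fsync()/lio_listio()
 *         -> entry created and placed on an aio_async_workq and the
 *            per-process active list (proc.aio_activeq)
 *     aio_work_thread() / aio_get_some_work()
 *         -> entry pulled off the work queue and the IO is performed
 *     IO completes
 *         -> entry moved to proc.aio_doneq and notifications delivered
 *     aio_return() or process exit/exec/close
 *         -> entry freed and the counts dropped
 */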
typedef struct aio_workq {
    TAILQ_HEAD(, aio_workq_entry)   aioq_entries;
    int                             aioq_count;
    lck_mtx_t                       aioq_mtx;
    wait_queue_t                    aioq_waitq;
} *aio_workq_t;

#define AIO_NUM_WORK_QUEUES 1
struct aio_anchor_cb
{
    volatile int32_t    aio_inflight_count;     /* entries that have been taken from a workq */
    volatile int32_t    aio_done_count;         /* entries on all done queues (proc.aio_doneq) */
    volatile int32_t    aio_total_count;        /* total extant entries */

    /* Hash table of queues here */
    struct aio_workq    aio_async_workqs[AIO_NUM_WORK_QUEUES];
};
typedef struct aio_anchor_cb aio_anchor_cb;

struct aio_lio_context
{
    int     io_waiter;
    int     io_issued;
    int     io_completed;
};
typedef struct aio_lio_context aio_lio_context;
/*
 * Notes on aio sleep / wake channels.
 * We currently pick a couple of fields within the proc structure that will
 * give us sleep channels that do not collide with any other kernel routines.
 * At this time, for binary compatibility reasons, we cannot create new proc fields.
 */
#define AIO_SUSPEND_SLEEP_CHAN  p_aio_active_count
#define AIO_CLEANUP_SLEEP_CHAN  p_aio_total_count

#define ASSERT_AIO_FROM_PROC(aiop, theproc)     \
    if ((aiop)->procp != (theproc)) {           \
        panic("AIO on a proc list that does not belong to that proc.\n"); \
    }
/*
 * LOCAL PROTOTYPES
 */
static void             aio_proc_lock(proc_t procp);
static void             aio_proc_lock_spin(proc_t procp);
static void             aio_proc_unlock(proc_t procp);
static lck_mtx_t*       aio_proc_mutex(proc_t procp);
static void             aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp);
static void             aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp);
static int              aio_get_process_count(proc_t procp);
static int              aio_active_requests_for_process(proc_t procp);
static int              aio_proc_active_requests_for_file(proc_t procp, int fd);
static boolean_t        is_already_queued(proc_t procp, user_addr_t aiocbp);
static boolean_t        should_cancel(aio_workq_entry *entryp, user_addr_t aiocbp, int fd);

static void             aio_entry_lock(aio_workq_entry *entryp);
static void             aio_entry_lock_spin(aio_workq_entry *entryp);
static aio_workq_t      aio_entry_workq(aio_workq_entry *entryp);
static lck_mtx_t*       aio_entry_mutex(__unused aio_workq_entry *entryp);
static void             aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
static void             aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
static void             aio_entry_ref_locked(aio_workq_entry *entryp);
static void             aio_entry_unref_locked(aio_workq_entry *entryp);
static void             aio_entry_ref(aio_workq_entry *entryp);
static void             aio_entry_unref(aio_workq_entry *entryp);
static void             aio_entry_update_for_cancel(aio_workq_entry *entryp, boolean_t cancelled,
                                int wait_for_completion, boolean_t disable_notification);
static int              aio_entry_try_workq_remove(aio_workq_entry *entryp);
static boolean_t        aio_delay_fsync_request( aio_workq_entry *entryp );
static int              aio_free_request(aio_workq_entry *entryp);

static void             aio_workq_init(aio_workq_t wq);
static void             aio_workq_lock_spin(aio_workq_t wq);
static void             aio_workq_unlock(aio_workq_t wq);
static lck_mtx_t*       aio_workq_mutex(aio_workq_t wq);

static void             aio_work_thread( void );
static aio_workq_entry  *aio_get_some_work( void );

static int              aio_get_all_queues_count( void );
static int              aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO);
static int              aio_validate( aio_workq_entry *entryp );
static int              aio_increment_total_count(void);
static int              aio_decrement_total_count(void);

static int              do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp, int wait_for_completion, boolean_t disable_notification);
static void             do_aio_completion( aio_workq_entry *entryp );
static int              do_aio_fsync( aio_workq_entry *entryp );
static int              do_aio_read( aio_workq_entry *entryp );
static int              do_aio_write( aio_workq_entry *entryp );
static void             do_munge_aiocb_user32_to_user( struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
static void             do_munge_aiocb_user64_to_user( struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
static int              lio_create_entry(proc_t procp,
                                         user_addr_t aiocbp,
                                         void *group_tag,
                                         aio_workq_entry **entrypp );
static aio_workq_entry  *aio_create_queue_entry(proc_t procp,
                                         user_addr_t aiocbp,
                                         void *group_tag,
                                         int kindOfIO);
static user_addr_t      *aio_copy_in_list(proc_t procp, user_addr_t aiocblist, int nent);
static void             free_lio_context(aio_lio_context *context);
static void             aio_enqueue_work( proc_t procp, aio_workq_entry *entryp, int proc_locked);

#define ASSERT_AIO_PROC_LOCK_OWNED(p)   lck_mtx_assert(aio_proc_mutex((p)), LCK_MTX_ASSERT_OWNED)
#define ASSERT_AIO_WORKQ_LOCK_OWNED(q)  lck_mtx_assert(aio_workq_mutex((q)), LCK_MTX_ASSERT_OWNED)
#define ASSERT_AIO_ENTRY_LOCK_OWNED(e)  lck_mtx_assert(aio_entry_mutex((e)), LCK_MTX_ASSERT_OWNED)
/*
 * EXTERNAL PROTOTYPES
 */

/* in ...bsd/kern/sys_generic.c */
extern int dofileread(vfs_context_t ctx, struct fileproc *fp,
                      user_addr_t bufp, user_size_t nbyte,
                      off_t offset, int flags, user_ssize_t *retval);
extern int dofilewrite(vfs_context_t ctx, struct fileproc *fp,
                       user_addr_t bufp, user_size_t nbyte, off_t offset,
                       int flags, user_ssize_t *retval);
static uint32_t lio_contexts_alloced = 0;
/*
 * aio external global variables.
 */
extern int aio_max_requests;                /* AIO_MAX - configurable */
extern int aio_max_requests_per_process;    /* AIO_PROCESS_MAX - configurable */
extern int aio_worker_threads;              /* AIO_THREAD_COUNT - configurable */


/*
 * aio static variables.
 */
static aio_anchor_cb    aio_anchor;
static lck_grp_t        *aio_proc_lock_grp;
static lck_grp_t        *aio_entry_lock_grp;
static lck_grp_t        *aio_queue_lock_grp;
static lck_attr_t       *aio_lock_attr;
static lck_grp_attr_t   *aio_lock_grp_attr;
static struct zone      *aio_workq_zonep;
static lck_mtx_t        aio_entry_mtx;
static lck_mtx_t        aio_proc_mtx;
static void
aio_entry_lock(__unused aio_workq_entry *entryp)
{
    lck_mtx_lock(&aio_entry_mtx);
}

static void
aio_entry_lock_spin(__unused aio_workq_entry *entryp)
{
    lck_mtx_lock_spin(&aio_entry_mtx);
}

static void
aio_entry_unlock(__unused aio_workq_entry *entryp)
{
    lck_mtx_unlock(&aio_entry_mtx);
}

/* Hash */
static aio_workq_t
aio_entry_workq(__unused aio_workq_entry *entryp)
{
    return &aio_anchor.aio_async_workqs[0];
}

static lck_mtx_t*
aio_entry_mutex(__unused aio_workq_entry *entryp)
{
    return &aio_entry_mtx;
}
static void
aio_workq_init(aio_workq_t wq)
{
    TAILQ_INIT(&wq->aioq_entries);
    wq->aioq_count = 0;
    lck_mtx_init(&wq->aioq_mtx, aio_queue_lock_grp, aio_lock_attr);
    wq->aioq_waitq = wait_queue_alloc(SYNC_POLICY_FIFO);
}
/*
 * Can be passed a queue which is locked spin.
 */
static void
aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
{
    ASSERT_AIO_WORKQ_LOCK_OWNED(queue);

    if (entryp->aio_workq_link.tqe_prev == NULL) {
        panic("Trying to remove an entry from a work queue, but it is not on a queue\n");
    }

    TAILQ_REMOVE(&queue->aioq_entries, entryp, aio_workq_link);
    queue->aioq_count--;
    entryp->aio_workq_link.tqe_prev = NULL; /* Not on a workq */

    if (queue->aioq_count < 0) {
        panic("Negative count on a queue.\n");
    }
}
static void
aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
{
    ASSERT_AIO_WORKQ_LOCK_OWNED(queue);

    TAILQ_INSERT_TAIL(&queue->aioq_entries, entryp, aio_workq_link);
    queue->aioq_count++;
    if (queue->aioq_count < 0) {
        panic("Negative count on a queue.\n");
    }
}
static void
aio_proc_lock(proc_t procp)
{
    lck_mtx_lock(aio_proc_mutex(procp));
}

static void
aio_proc_lock_spin(proc_t procp)
{
    lck_mtx_lock_spin(aio_proc_mutex(procp));
}
static void
aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp)
{
    ASSERT_AIO_PROC_LOCK_OWNED(procp);

    TAILQ_REMOVE(&procp->p_aio_activeq, entryp, aio_proc_link);
    TAILQ_INSERT_TAIL(&procp->p_aio_doneq, entryp, aio_proc_link);
    procp->p_aio_active_count--;
    OSIncrementAtomic(&aio_anchor.aio_done_count);
}
static void
aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp)
{
    TAILQ_REMOVE(&procp->p_aio_doneq, entryp, aio_proc_link);
    OSDecrementAtomic(&aio_anchor.aio_done_count);
    aio_decrement_total_count();
    procp->p_aio_total_count--;
}
static void
aio_proc_unlock(proc_t procp)
{
    lck_mtx_unlock(aio_proc_mutex(procp));
}

static lck_mtx_t*
aio_proc_mutex(proc_t procp)
{
    return &procp->p_mlock;
}
static void
aio_entry_ref_locked(aio_workq_entry *entryp)
{
    ASSERT_AIO_ENTRY_LOCK_OWNED(entryp);

    if (entryp->aio_refcount < 0) {
        panic("AIO workq entry with a negative refcount.\n");
    }
    entryp->aio_refcount++;
}
/* Return 1 if you've freed it */
static void
aio_entry_unref_locked(aio_workq_entry *entryp)
{
    ASSERT_AIO_ENTRY_LOCK_OWNED(entryp);

    entryp->aio_refcount--;
    if (entryp->aio_refcount < 0) {
        panic("AIO workq entry with a negative refcount.\n");
    }
}
static void
aio_entry_ref(aio_workq_entry *entryp)
{
    aio_entry_lock_spin(entryp);
    aio_entry_ref_locked(entryp);
    aio_entry_unlock(entryp);
}
static void
aio_entry_unref(aio_workq_entry *entryp)
{
    aio_entry_lock_spin(entryp);
    aio_entry_unref_locked(entryp);

    if ((entryp->aio_refcount == 0) && ((entryp->flags & AIO_DO_FREE) != 0)) {
        aio_entry_unlock(entryp);
        aio_free_request(entryp);
    } else {
        aio_entry_unlock(entryp);
    }

    return;
}
static void
aio_entry_update_for_cancel(aio_workq_entry *entryp, boolean_t cancelled, int wait_for_completion, boolean_t disable_notification)
{
    aio_entry_lock_spin(entryp);

    if (cancelled) {
        aio_entry_ref_locked(entryp);
        entryp->errorval = ECANCELED;
        entryp->returnval = -1;
    }

    if (wait_for_completion) {
        entryp->flags |= wait_for_completion; /* flag for special completion processing */
    }

    if (disable_notification) {
        entryp->flags |= AIO_DISABLE; /* Don't want a signal */
    }

    aio_entry_unlock(entryp);
}
static int
aio_entry_try_workq_remove(aio_workq_entry *entryp)
{
    /* Can only be cancelled if it's still on a work queue */
    if (entryp->aio_workq_link.tqe_prev != NULL) {
        aio_workq_t queue;

        /* Will have to check again under the lock */
        queue = aio_entry_workq(entryp);
        aio_workq_lock_spin(queue);
        if (entryp->aio_workq_link.tqe_prev != NULL) {
            aio_workq_remove_entry_locked(queue, entryp);
            aio_workq_unlock(queue);
            return 1;
        } else {
            aio_workq_unlock(queue);
        }
    }

    return 0;
}
static void
aio_workq_lock_spin(aio_workq_t wq)
{
    lck_mtx_lock_spin(aio_workq_mutex(wq));
}

static void
aio_workq_unlock(aio_workq_t wq)
{
    lck_mtx_unlock(aio_workq_mutex(wq));
}

static lck_mtx_t*
aio_workq_mutex(aio_workq_t wq)
{
    return &wq->aioq_mtx;
}
/*
 * aio_cancel - attempt to cancel one or more async IO requests currently
 * outstanding against file descriptor uap->fd.  If uap->aiocbp is not
 * NULL then only one specific IO is cancelled (if possible).  If uap->aiocbp
 * is NULL then all outstanding async IO requests for the given file
 * descriptor are cancelled (if possible).
 */
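/*
 * Illustrative user-space usage (a sketch, not part of this file; assumes
 * <aio.h>, an already-submitted request in "cb", and elided error handling):
 *
 *     int r = aio_cancel(cb.aio_fildes, &cb);
 *     if (r == AIO_CANCELED)            // pulled off the work queue in time
 *         (void) aio_return(&cb);       // still needed to release resources
 *     else if (r == AIO_NOTCANCELED)    // already in flight; wait for it
 *         while (aio_error(&cb) == EINPROGRESS)
 *             ;
 *     else if (r == AIO_ALLDONE)        // already completed
 *         (void) aio_return(&cb);
 */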
int
aio_cancel(proc_t p, struct aio_cancel_args *uap, int *retval)
{
    struct user_aiocb   my_aiocb;
    int                 result;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START,
                  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    /* quick check to see if there are any async IO requests queued up */
    if (aio_get_all_queues_count() < 1) {
        result = 0;
        *retval = AIO_ALLDONE;
        goto ExitRoutine;
    }

    *retval = -1;
    if ( uap->aiocbp != USER_ADDR_NULL ) {
        if ( proc_is64bit(p) ) {
            struct user64_aiocb aiocb64;

            result = copyin( uap->aiocbp, &aiocb64, sizeof(aiocb64) );
            if ( result == 0 )
                do_munge_aiocb_user64_to_user(&aiocb64, &my_aiocb);
        } else {
            struct user32_aiocb aiocb32;

            result = copyin( uap->aiocbp, &aiocb32, sizeof(aiocb32) );
            if ( result == 0 )
                do_munge_aiocb_user32_to_user( &aiocb32, &my_aiocb );
        }

        if ( result != 0 ) {
            result = EAGAIN;
            goto ExitRoutine;
        }

        /* NOTE - POSIX standard says a mismatch between the file */
        /* descriptor passed in and the file descriptor embedded in */
        /* the aiocb causes unspecified results.  We return EBADF in */
        /* that situation. */
        if ( uap->fd != my_aiocb.aio_fildes ) {
            result = EBADF;
            goto ExitRoutine;
        }
    }

    aio_proc_lock(p);
    result = do_aio_cancel_locked( p, uap->fd, uap->aiocbp, 0, FALSE );
    ASSERT_AIO_PROC_LOCK_OWNED(p);
    aio_proc_unlock(p);

    if ( result != -1 ) {
        *retval = result;
        result = 0;
        goto ExitRoutine;
    }

    result = EBADF;

ExitRoutine:
    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END,
                  (int)p, (int)uap->aiocbp, result, 0, 0 );

    return( result );

} /* aio_cancel */
/*
 * _aio_close - internal function used to clean up async IO requests for
 * a file descriptor that is closing.
 */
__private_extern__ void
_aio_close(proc_t p, int fd)
{
    int     error;

    /* quick check to see if there are any async IO requests queued up */
    if (aio_get_all_queues_count() < 1) {
        return;
    }

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_START,
                  (int)p, fd, 0, 0, 0 );

    /* cancel all async IO requests on our todo queues for this file descriptor */
    aio_proc_lock(p);
    error = do_aio_cancel_locked( p, fd, 0, AIO_CLOSE_WAIT, FALSE );
    ASSERT_AIO_PROC_LOCK_OWNED(p);
    if ( error == AIO_NOTCANCELED ) {
        /*
         * AIO_NOTCANCELED is returned when we find an aio request for this process
         * and file descriptor on the active async IO queue.  Active requests cannot
         * be cancelled so we must wait for them to complete.  We will get a special
         * wake up call on our channel used to sleep for ALL active requests to
         * complete.  This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
         * when we must wait for all active aio requests.
         */

        KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) | DBG_FUNC_NONE,
                      (int)p, fd, 0, 0, 0 );

        while (aio_proc_active_requests_for_file(p, fd) > 0) {
            msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO | PDROP, "aio_close", 0 );
        }
    } else {
        aio_proc_unlock(p);
    }

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_END,
                  (int)p, fd, 0, 0, 0 );

    return;

} /* _aio_close */
/*
 * aio_error - return the error status associated with the async IO
 * request referred to by uap->aiocbp.  The error status is the errno
 * value that would be set by the corresponding IO request (read, write,
 * fdatasync, or sync).
 */
int
aio_error(proc_t p, struct aio_error_args *uap, int *retval)
{
    aio_workq_entry     *entryp;
    int                 error;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_START,
                  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    /* see if there are any aios to check */
    if (aio_get_all_queues_count() < 1) {
        return( EINVAL );
    }

    aio_proc_lock(p);

    /* look for a match on our queue of async IO requests that have completed */
    TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link ) {
        if ( entryp->uaiocbp == uap->aiocbp ) {
            ASSERT_AIO_FROM_PROC(entryp, p);

            aio_entry_lock_spin(entryp);
            *retval = entryp->errorval;
            error = 0;
            aio_entry_unlock(entryp);
            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) | DBG_FUNC_NONE,
                          (int)p, (int)uap->aiocbp, *retval, 0, 0 );
            goto ExitRoutine;
        }
    }

    /* look for a match on our queue of active async IO requests */
    TAILQ_FOREACH( entryp, &p->p_aio_activeq, aio_proc_link ) {
        if ( entryp->uaiocbp == uap->aiocbp ) {
            ASSERT_AIO_FROM_PROC(entryp, p);
            *retval = EINPROGRESS;
            error = 0;
            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) | DBG_FUNC_NONE,
                          (int)p, (int)uap->aiocbp, *retval, 0, 0 );
            goto ExitRoutine;
        }
    }

    error = EINVAL;

ExitRoutine:
    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_END,
                  (int)p, (int)uap->aiocbp, error, 0, 0 );
    aio_proc_unlock(p);

    return( error );

} /* aio_error */
/*
 * aio_fsync - asynchronously force all IO operations associated
 * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
 * queued at the time of the call to the synchronized completion state.
 * NOTE - we do not support op O_DSYNC at this point since we do not support the
 * fdatasync() call.
 */
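/*
 * Illustrative user-space usage (a sketch, not part of this file; assumes
 * <aio.h>, an open descriptor "fd", and elided error handling):
 *
 *     struct aiocb cb;
 *     memset(&cb, 0, sizeof(cb));
 *     cb.aio_fildes = fd;
 *     if (aio_fsync(O_SYNC, &cb) == 0) {   // 0 or O_SYNC requests a full fsync here
 *         while (aio_error(&cb) == EINPROGRESS)
 *             ;                            // or wait with aio_suspend()
 *         (void) aio_return(&cb);
 *     }
 */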
int
aio_fsync(proc_t p, struct aio_fsync_args *uap, int *retval)
{
    int     error;
    int     fsync_kind;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_START,
                  (int)p, (int)uap->aiocbp, uap->op, 0, 0 );

    *retval = 0;
    /* 0 := O_SYNC for binary backward compatibility with Panther */
    if (uap->op == O_SYNC || uap->op == 0)
        fsync_kind = AIO_FSYNC;
    else if ( uap->op == O_DSYNC )
        fsync_kind = AIO_DSYNC;
    else {
        *retval = -1;
        error = EINVAL;
        goto ExitRoutine;
    }

    error = aio_queue_async_request( p, uap->aiocbp, fsync_kind );
    if ( error != 0 )
        *retval = -1;

ExitRoutine:
    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_END,
                  (int)p, (int)uap->aiocbp, error, 0, 0 );

    return( error );

} /* aio_fsync */
/* aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
 * file descriptor (uap->aiocbp->aio_fildes) into the buffer
 * (uap->aiocbp->aio_buf).
 */
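/*
 * Illustrative user-space usage (a sketch, not part of this file; assumes
 * <aio.h>, an open descriptor "fd", a buffer "buf", and elided error handling):
 *
 *     struct aiocb cb;
 *     memset(&cb, 0, sizeof(cb));
 *     cb.aio_fildes = fd;
 *     cb.aio_buf    = buf;
 *     cb.aio_nbytes = sizeof(buf);
 *     cb.aio_offset = 0;
 *     if (aio_read(&cb) == 0) {
 *         while (aio_error(&cb) == EINPROGRESS)
 *             ;                                  // polling; aio_suspend() avoids spinning
 *         ssize_t nread = aio_return(&cb);       // also releases kernel resources
 *     }
 */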
int
aio_read(proc_t p, struct aio_read_args *uap, int *retval)
{
    int     error;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_START,
                  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    *retval = 0;

    error = aio_queue_async_request( p, uap->aiocbp, AIO_READ );
    if ( error != 0 )
        *retval = -1;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_END,
                  (int)p, (int)uap->aiocbp, error, 0, 0 );

    return( error );

} /* aio_read */
/*
 * aio_return - return the return status associated with the async IO
 * request referred to by uap->aiocbp.  The return status is the value
 * that would be returned by the corresponding IO request (read, write,
 * fdatasync, or sync).  This is where we release kernel resources
 * held for the async IO call associated with the given aiocb pointer.
 */
int
aio_return(proc_t p, struct aio_return_args *uap, user_ssize_t *retval)
{
    aio_workq_entry     *entryp;
    int                 error;
    boolean_t           proc_lock_held = FALSE;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_START,
                  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    /* See if there are any entries to check */
    if (aio_get_all_queues_count() < 1) {
        error = EINVAL;
        goto ExitRoutine;
    }

    aio_proc_lock(p);
    proc_lock_held = TRUE;
    *retval = 0;

    /* look for a match on our queue of async IO requests that have completed */
    TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link ) {
        ASSERT_AIO_FROM_PROC(entryp, p);
        if ( entryp->uaiocbp == uap->aiocbp ) {
            /* Done and valid for aio_return(), pull it off the list */
            aio_proc_remove_done_locked(p, entryp);

            /* Drop the proc lock, but keep the entry locked */
            aio_entry_lock(entryp);
            aio_proc_unlock(p);
            proc_lock_held = FALSE;

            *retval = entryp->returnval;
            error = 0;

            /* No references and off all lists, safe to free */
            if (entryp->aio_refcount == 0) {
                aio_entry_unlock(entryp);
                aio_free_request(entryp);
            } else {
                /* Whoever has the refcount will have to free it */
                entryp->flags |= AIO_DO_FREE;
                aio_entry_unlock(entryp);
            }

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) | DBG_FUNC_NONE,
                          (int)p, (int)uap->aiocbp, *retval, 0, 0 );
            goto ExitRoutine;
        }
    }

    /* look for a match on our queue of active async IO requests */
    TAILQ_FOREACH( entryp, &p->p_aio_activeq, aio_proc_link ) {
        ASSERT_AIO_FROM_PROC(entryp, p);
        if ( entryp->uaiocbp == uap->aiocbp ) {
            error = EINPROGRESS;
            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) | DBG_FUNC_NONE,
                          (int)p, (int)uap->aiocbp, *retval, 0, 0 );
            goto ExitRoutine;
        }
    }

    error = EINVAL;

ExitRoutine:
    if (proc_lock_held)
        aio_proc_unlock(p);
    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_END,
                  (int)p, (int)uap->aiocbp, error, 0, 0 );

    return( error );

} /* aio_return */
/*
 * _aio_exec - internal function used to clean up async IO requests for
 * a process that is going away due to exec().  We cancel any async IOs
 * we can and wait for those already active.  We also disable signaling
 * for cancelled or active aio requests that complete.
 * This routine MAY block!
 */
__private_extern__ void
_aio_exec(proc_t p)
{
    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_START,
                  (int)p, 0, 0, 0, 0 );

    _aio_exit( p );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_END,
                  (int)p, 0, 0, 0, 0 );

    return;

} /* _aio_exec */
/*
 * _aio_exit - internal function used to clean up async IO requests for
 * a process that is terminating (via exit() or exec() ).  We cancel any async IOs
 * we can and wait for those already active.  We also disable signaling
 * for cancelled or active aio requests that complete.  This routine MAY block!
 */
__private_extern__ void
_aio_exit(proc_t p)
{
    int                 error;
    aio_workq_entry     *entryp;

    /* quick check to see if there are any async IO requests queued up */
    if (aio_get_all_queues_count() < 1) {
        return;
    }

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_START,
                  (int)p, 0, 0, 0, 0 );

    aio_proc_lock(p);

    /*
     * cancel async IO requests on the todo work queue and wait for those
     * already active to complete.
     */
    error = do_aio_cancel_locked( p, 0, 0, AIO_EXIT_WAIT, TRUE );
    ASSERT_AIO_PROC_LOCK_OWNED(p);
    if ( error == AIO_NOTCANCELED ) {
        /*
         * AIO_NOTCANCELED is returned when we find an aio request for this process
         * on the active async IO queue.  Active requests cannot be cancelled so we
         * must wait for them to complete.  We will get a special wake up call on
         * our channel used to sleep for ALL active requests to complete.  This sleep
         * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
         * active aio requests.
         */

        KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) | DBG_FUNC_NONE,
                      (int)p, 0, 0, 0, 0 );

        while (p->p_aio_active_count != 0) {
            msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_exit", 0 );
        }
    }

    if (p->p_aio_active_count != 0) {
        panic("Exiting process has %d active AIOs after cancellation has completed.\n", p->p_aio_active_count);
    }

    /* release all aio resources used by this process */
    entryp = TAILQ_FIRST( &p->p_aio_doneq );
    while ( entryp != NULL ) {
        ASSERT_AIO_FROM_PROC(entryp, p);
        aio_workq_entry     *next_entryp;

        next_entryp = TAILQ_NEXT( entryp, aio_proc_link );
        aio_proc_remove_done_locked(p, entryp);

        /* we cannot free requests that are still completing */
        aio_entry_lock_spin(entryp);
        if (entryp->aio_refcount == 0) {
            aio_proc_unlock(p);
            aio_entry_unlock(entryp);
            aio_free_request(entryp);

            /* need to start over since aio_doneq may have been */
            /* changed while we were away.  */
            aio_proc_lock(p);
            entryp = TAILQ_FIRST( &p->p_aio_doneq );
            continue;
        } else {
            /* whoever has the reference will have to do the free */
            entryp->flags |= AIO_DO_FREE;
        }

        aio_entry_unlock(entryp);
        entryp = next_entryp;
    }

    aio_proc_unlock(p);

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END,
                  (int)p, 0, 0, 0, 0 );
    return;

} /* _aio_exit */
static boolean_t
should_cancel(aio_workq_entry *entryp, user_addr_t aiocbp, int fd)
{
    if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
         (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
         (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
        return TRUE;
    }

    return FALSE;
}
/*
 * do_aio_cancel_locked - cancel async IO requests (if possible).  We get called by
 * aio_cancel, close, and at exit.
 * There are three modes of operation: 1) cancel all async IOs for a process -
 * fd is 0 and aiocbp is NULL 2) cancel all async IOs for a file descriptor - fd
 * is > 0 and aiocbp is NULL 3) cancel one async IO associated with the given
 * aiocbp.
 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
 * target async IO requests, and AIO_ALLDONE if all target async IO requests
 * were already complete.
 * WARNING - do not dereference aiocbp in this routine, it may point to user
 * land data that has not been copied in (when called from aio_cancel() )
 *
 * Called with proc locked, and returns the same way.
 */
static int
do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp,
    int wait_for_completion, boolean_t disable_notification)
{
    ASSERT_AIO_PROC_LOCK_OWNED(p);

    aio_workq_entry     *entryp;
    int                 result;

    result = -1;

    /* look for a match on our queue of async todo work. */
    entryp = TAILQ_FIRST(&p->p_aio_activeq);
    while ( entryp != NULL ) {
        ASSERT_AIO_FROM_PROC(entryp, p);
        aio_workq_entry     *next_entryp;

        next_entryp = TAILQ_NEXT( entryp, aio_proc_link );
        if (!should_cancel(entryp, aiocbp, fd)) {
            entryp = next_entryp;
            continue;
        }

        /* Can only be cancelled if it's still on a work queue */
        if (aio_entry_try_workq_remove(entryp) != 0) {
            /* Have removed from workq. Update entry state and take a ref */
            aio_entry_update_for_cancel(entryp, TRUE, 0, disable_notification);

            /* Put on the proc done queue and update counts, then unlock the proc */
            aio_proc_move_done_locked(p, entryp);
            aio_proc_unlock(p);

            /* Now it's officially cancelled.  Do the completion */
            result = AIO_CANCELED;
            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) | DBG_FUNC_NONE,
                          (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
            do_aio_completion(entryp);

            /* This will free if the aio_return() has already happened ... */
            aio_entry_unref(entryp);
            aio_proc_lock(p);

            if ( aiocbp != USER_ADDR_NULL ) {
                return( result );
            }

            /*
             * Restart from the head of the proc active queue since it
             * may have been changed while we were away doing completion
             * processing.
             *
             * Note that if we found an uncancellable AIO before, we will
             * either find it again or discover that it's been completed,
             * so resetting the result will not cause us to return success
             * despite outstanding AIOs.
             */
            entryp = TAILQ_FIRST(&p->p_aio_activeq);
            result = -1; /* As if beginning anew */
        } else {
            /*
             * It's been taken off the active queue already, i.e. is in flight.
             * All we can do is ask for notification.
             */
            result = AIO_NOTCANCELED;

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE,
                          (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

            /* Mark for waiting and such; will not take a ref if "cancelled" arg is FALSE */
            aio_entry_update_for_cancel(entryp, FALSE, wait_for_completion, disable_notification);

            if ( aiocbp != USER_ADDR_NULL ) {
                return( result );
            }
            entryp = next_entryp;
        }
    } /* while... */

    /*
     * if we didn't find any matches on the todo or active queues then look for a
     * match on our queue of async IO requests that have completed and if found
     * return AIO_ALLDONE result.
     *
     * Proc AIO lock is still held.
     */
    if ( result == -1 ) {
        TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
            ASSERT_AIO_FROM_PROC(entryp, p);
            if (should_cancel(entryp, aiocbp, fd)) {
                result = AIO_ALLDONE;
                KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE,
                              (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

                if ( aiocbp != USER_ADDR_NULL ) {
                    return( result );
                }
            }
        }
    }

    return( result );

} /* do_aio_cancel_locked */
/*
 * aio_suspend - suspend the calling thread until at least one of the async
 * IO operations referenced by uap->aiocblist has completed, until a signal
 * interrupts the function, or uap->timeoutp time interval (optional) has
 * passed.
 * Returns 0 if one or more async IOs have completed else -1 and errno is
 * set appropriately - EAGAIN if the timeout elapses or EINTR if an interrupt
 * occurs.
 */
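/*
 * Illustrative user-space usage (a sketch, not part of this file; assumes
 * <aio.h>, <errno.h>, and an already-submitted request in "cb"):
 *
 *     const struct aiocb *list[1] = { &cb };
 *     struct timespec ts = { 5, 0 };             // wait at most 5 seconds
 *     if (aio_suspend(list, 1, &ts) == 0)
 *         (void) aio_return(&cb);                // at least one request completed
 *     else if (errno == EAGAIN)
 *         ;                                      // timed out
 *     else if (errno == EINTR)
 *         ;                                      // interrupted by a signal
 */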
int
aio_suspend(proc_t p, struct aio_suspend_args *uap, int *retval)
{
    __pthread_testcancel(1);
    return(aio_suspend_nocancel(p, (struct aio_suspend_nocancel_args *)uap, retval));
}
int
aio_suspend_nocancel(proc_t p, struct aio_suspend_nocancel_args *uap, int *retval)
{
    int                     error;
    int                     i, count;
    uint64_t                abstime;
    struct user_timespec    ts;
    aio_workq_entry         *entryp;
    user_addr_t             *aiocbpp;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START,
                  (int)p, uap->nent, 0, 0, 0 );

    *retval = -1;
    abstime = 0;
    aiocbpp = NULL;

    count = aio_get_all_queues_count( );
    if ( count < 1 ) {
        error = EINVAL;
        goto ExitThisRoutine;
    }

    if ( uap->nent < 1 || uap->nent > aio_max_requests_per_process ) {
        error = EINVAL;
        goto ExitThisRoutine;
    }

    if ( uap->timeoutp != USER_ADDR_NULL ) {
        if ( proc_is64bit(p) ) {
            struct user64_timespec temp;
            error = copyin( uap->timeoutp, &temp, sizeof(temp) );
            if ( error == 0 ) {
                ts.tv_sec = temp.tv_sec;
                ts.tv_nsec = temp.tv_nsec;
            }
        } else {
            struct user32_timespec temp;
            error = copyin( uap->timeoutp, &temp, sizeof(temp) );
            if ( error == 0 ) {
                ts.tv_sec = temp.tv_sec;
                ts.tv_nsec = temp.tv_nsec;
            }
        }
        if ( error != 0 ) {
            error = EAGAIN;
            goto ExitThisRoutine;
        }

        if ( ts.tv_sec < 0 || ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000 ) {
            error = EINVAL;
            goto ExitThisRoutine;
        }

        nanoseconds_to_absolutetime( (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
                                     &abstime );
        clock_absolutetime_interval_to_deadline( abstime, &abstime );
    }

    aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent);
    if ( aiocbpp == NULL ) {
        error = EAGAIN;
        goto ExitThisRoutine;
    }

    /* check list of aio requests to see if any have completed */
check_for_our_aiocbp:
    aio_proc_lock_spin(p);
    for ( i = 0; i < uap->nent; i++ ) {
        user_addr_t     aiocbp;

        /* NULL elements are legal so check for 'em */
        aiocbp = *(aiocbpp + i);
        if ( aiocbp == USER_ADDR_NULL )
            continue;

        /* return immediately if any aio request in the list is done */
        TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link ) {
            ASSERT_AIO_FROM_PROC(entryp, p);
            if ( entryp->uaiocbp == aiocbp ) {
                aio_proc_unlock(p);
                *retval = 0;
                error = 0;
                goto ExitThisRoutine;
            }
        }
    } /* for ( ; i < uap->nent; ) */

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE,
                  (int)p, uap->nent, 0, 0, 0 );

    /*
     * wait for an async IO to complete or a signal fires or timeout expires.
     * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
     * interrupts us.  If an async IO completes before a signal fires or our
     * timeout expires, we get a wakeup call from aio_work_thread().
     */

    error = msleep1(&p->AIO_SUSPEND_SLEEP_CHAN, aio_proc_mutex(p), PCATCH | PWAIT | PDROP, "aio_suspend", abstime); /* XXX better priority? */
    if ( error == 0 ) {
        /*
         * got our wakeup call from aio_work_thread().
         * Since we can get a wakeup on this channel from another thread in the
         * same process we head back up to make sure this is for the correct aiocbp.
         * If it is the correct aiocbp we will return from where we do the check
         * (see entryp->uaiocbp == aiocbp after check_for_our_aiocbp label)
         * else we will fall out and just sleep again.
         */
        goto check_for_our_aiocbp;
    }
    else if ( error == EWOULDBLOCK ) {
        /* our timeout expired */
        error = EAGAIN;
    }
    else {
        /* we were interrupted */
        error = EINTR;
    }

ExitThisRoutine:
    if ( aiocbpp != NULL )
        FREE( aiocbpp, M_TEMP );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_END,
                  (int)p, uap->nent, error, 0, 0 );

    return( error );

} /* aio_suspend */
/* aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
 * file descriptor (uap->aiocbp->aio_fildes) from the buffer
 * (uap->aiocbp->aio_buf).
 */
int
aio_write(proc_t p, struct aio_write_args *uap, int *retval)
{
    int     error;

    *retval = 0;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_START,
                  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    error = aio_queue_async_request( p, uap->aiocbp, AIO_WRITE );
    if ( error != 0 )
        *retval = -1;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_END,
                  (int)p, (int)uap->aiocbp, error, 0, 0 );

    return( error );

} /* aio_write */
static user_addr_t *
aio_copy_in_list(proc_t procp, user_addr_t aiocblist, int nent)
{
    user_addr_t     *aiocbpp;
    int             i, result;

    /* we reserve enough space for largest possible pointer size */
    MALLOC( aiocbpp, user_addr_t *, (nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK );
    if ( aiocbpp == NULL )
        goto err;

    /* copyin our aiocb pointers from list */
    result = copyin( aiocblist, aiocbpp,
                     proc_is64bit(procp) ? (nent * sizeof(user64_addr_t))
                                         : (nent * sizeof(user32_addr_t)) );
    if ( result ) {
        FREE( aiocbpp, M_TEMP );
        aiocbpp = NULL;
        goto err;
    }

    /*
     * We depend on a list of user_addr_t's so we need to
     * munge and expand when these pointers came from a
     * 32-bit process.
     */
    if ( !proc_is64bit(procp) ) {
        /* copy from last to first to deal with overlap */
        user32_addr_t   *my_ptrp = ((user32_addr_t *)aiocbpp) + (nent - 1);
        user_addr_t     *my_addrp = aiocbpp + (nent - 1);

        for (i = 0; i < nent; i++, my_ptrp--, my_addrp--) {
            *my_addrp = (user_addr_t) (*my_ptrp);
        }
    }

err:
    return (aiocbpp);
}
static int
aio_copy_in_sigev(proc_t procp, user_addr_t sigp, struct user_sigevent *sigev)
{
    int     result = 0;

    if (sigp == USER_ADDR_NULL)
        goto out;

    /*
     * We need to munge aio_sigevent since it contains pointers.
     * Since we do not know if sigev_value is an int or a ptr we do
     * NOT cast the ptr to a user_addr_t.  This means if we send
     * this info back to user space we need to remember sigev_value
     * was not expanded for the 32-bit case.
     *
     * Notes:  This does NOT affect us since we don't support
     *         sigev_value yet in the aio context.
     */
    if ( proc_is64bit(procp) ) {
        struct user64_sigevent sigevent64;

        result = copyin( sigp, &sigevent64, sizeof(sigevent64) );
        if ( result == 0 ) {
            sigev->sigev_notify = sigevent64.sigev_notify;
            sigev->sigev_signo = sigevent64.sigev_signo;
            sigev->sigev_value.size_equivalent.sival_int = sigevent64.sigev_value.size_equivalent.sival_int;
            sigev->sigev_notify_function = sigevent64.sigev_notify_function;
            sigev->sigev_notify_attributes = sigevent64.sigev_notify_attributes;
        }
    } else {
        struct user32_sigevent sigevent32;

        result = copyin( sigp, &sigevent32, sizeof(sigevent32) );
        if ( result == 0 ) {
            sigev->sigev_notify = sigevent32.sigev_notify;
            sigev->sigev_signo = sigevent32.sigev_signo;
            sigev->sigev_value.size_equivalent.sival_int = sigevent32.sigev_value.sival_int;
            sigev->sigev_notify_function = CAST_USER_ADDR_T(sigevent32.sigev_notify_function);
            sigev->sigev_notify_attributes = CAST_USER_ADDR_T(sigevent32.sigev_notify_attributes);
        }
    }

    if ( result != 0 ) {
        result = EAGAIN;
    }

out:
    return (result);
}
/*
 * Queue up the entry on the aio asynchronous work queue in priority order
 * based on the relative priority of the request.  We calculate the relative
 * priority using the nice value of the caller and the value of the request
 * (aio_reqprio).
 *
 * Parameters:  procp           Process queueing the I/O
 *              entryp          The work queue entry being queued
 *
 * Returns:     (void)          No failure modes
 *
 * Notes:       This function is used for both lio_listio and aio
 *
 *      XXX:    At some point, we may have to consider thread priority
 *              rather than process priority, but we don't maintain the
 *              adjusted priority for threads the POSIX way.
 *
 * Called with proc locked.
 */
static void
aio_enqueue_work( proc_t procp, aio_workq_entry *entryp, int proc_locked)
{
#if 0
    aio_workq_entry     *my_entryp; /* used for insertion sort */
#endif /* 0 */
    aio_workq_t queue = aio_entry_workq(entryp);

    if (proc_locked == 0) {
        aio_proc_lock(procp);
    }

    ASSERT_AIO_PROC_LOCK_OWNED(procp);

    /* Onto proc queue */
    TAILQ_INSERT_TAIL(&procp->p_aio_activeq, entryp, aio_proc_link);
    procp->p_aio_active_count++;
    procp->p_aio_total_count++;

    /* And work queue */
    aio_workq_lock_spin(queue);
    aio_workq_add_entry_locked(queue, entryp);
    wait_queue_wakeup_one(queue->aioq_waitq, queue, THREAD_AWAKENED);
    aio_workq_unlock(queue);

    if (proc_locked == 0) {
        aio_proc_unlock(procp);
    }

#if 0
    /*
     * (1)  The nice value is in the range PRIO_MIN..PRIO_MAX [-20..20]
     * (2)  The normalized nice value is in the range 0..((2 * NZERO) - 1)
     *      which is [0..39], with 0 not being used.  In nice values, the
     *      lower the nice value, the higher the priority.
     * (3)  The normalized scheduling priority is the highest nice value
     *      minus the current nice value.  In I/O scheduling priority, the
     *      higher the value the lower the priority, so it is the inverse
     *      of the nice value (the higher the number, the higher the I/O
     *      priority).
     * (4)  From the normalized scheduling priority, we subtract the
     *      request priority to get the request priority value number;
     *      this means that requests are only capable of depressing their
     *      priority relative to other requests.
     */
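    /*
     * Worked example (illustrative; it just applies the formula below with
     * NZERO == 20): a caller at the default nice value (p_nice == 0) starts
     * at ((2 * 20) - 1) - 0 == 39; an aiocb asking for aio_reqprio == 5 is
     * then depressed to 39 - 5 == 34.  A request can only lower, never
     * raise, its effective priority relative to other requests.
     */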
    entryp->priority = (((2 * NZERO) - 1) - procp->p_nice);

    /* only permit depressing the priority */
    if (entryp->aiocb.aio_reqprio < 0)
        entryp->aiocb.aio_reqprio = 0;
    if (entryp->aiocb.aio_reqprio > 0) {
        entryp->priority -= entryp->aiocb.aio_reqprio;
        if (entryp->priority < 0)
            entryp->priority = 0;
    }

    /* Insertion sort the entry; lowest ->priority to highest */
    TAILQ_FOREACH(my_entryp, &aio_anchor.aio_async_workq, aio_workq_link) {
        if ( entryp->priority <= my_entryp->priority) {
            TAILQ_INSERT_BEFORE(my_entryp, entryp, aio_workq_link);
            break;
        }
    }
    if (my_entryp == NULL)
        TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
#endif /* 0 */
} /* aio_enqueue_work */
/*
 * lio_listio - initiate a list of IO requests.  We process the list of
 * aiocbs either synchronously (mode == LIO_WAIT) or asynchronously
 * (mode == LIO_NOWAIT).
 *
 * The caller gets error and return status for each aiocb in the list
 * via aio_error and aio_return.  We must keep completed requests until
 * released by the aio_return call.
 */
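/*
 * Illustrative user-space usage (a sketch, not part of this file; assumes
 * <aio.h>, two prepared aiocbs with file descriptor, buffer, and offset set,
 * and elided error handling):
 *
 *     cb_read.aio_lio_opcode  = LIO_READ;
 *     cb_write.aio_lio_opcode = LIO_WRITE;
 *     struct aiocb *list[2] = { &cb_read, &cb_write };
 *     if (lio_listio(LIO_WAIT, list, 2, NULL) == 0) {
 *         // both requests have completed; harvest the results
 *         (void) aio_return(&cb_read);
 *         (void) aio_return(&cb_write);
 *     }
 */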
int
lio_listio(proc_t p, struct lio_listio_args *uap, int *retval)
{
    int                     i;
    int                     call_result;
    int                     result;
    int                     old_count;
    aio_workq_entry         **entryp_listp;
    user_addr_t             *aiocbpp;
    struct user_sigevent    aiosigev;
    aio_lio_context         *lio_context;
    boolean_t               free_context = FALSE;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START,
                  (int)p, uap->nent, uap->mode, 0, 0 );

    entryp_listp = NULL;
    lio_context = NULL;
    aiocbpp = NULL;
    call_result = -1;
    *retval = -1;
    if ( !(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT) ) {
        call_result = EINVAL;
        goto ExitRoutine;
    }

    if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
        call_result = EINVAL;
        goto ExitRoutine;
    }

    /*
     * allocate a list of aio_workq_entry pointers that we will use
     * to queue up all our requests at once while holding our lock.
     */
    MALLOC( entryp_listp, void *, (uap->nent * sizeof(aio_workq_entry *)), M_TEMP, M_WAITOK );
    if ( entryp_listp == NULL ) {
        call_result = EAGAIN;
        goto ExitRoutine;
    }

    MALLOC( lio_context, aio_lio_context*, sizeof(aio_lio_context), M_TEMP, M_WAITOK );
    if ( lio_context == NULL ) {
        call_result = EAGAIN;
        goto ExitRoutine;
    }

    OSIncrementAtomic(&lio_contexts_alloced);

    bzero(lio_context, sizeof(aio_lio_context));

    aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent);
    if ( aiocbpp == NULL ) {
        call_result = EAGAIN;
        goto ExitRoutine;
    }

    /*
     * Use sigevent passed in to lio_listio for each of our calls, but
     * only do completion notification after the last request completes.
     */
    bzero(&aiosigev, sizeof(aiosigev));
    /* Only copy in a sigev if the user supplied one */
    if (uap->sigp != USER_ADDR_NULL) {
        call_result = aio_copy_in_sigev(p, uap->sigp, &aiosigev);
        if ( call_result )
            goto ExitRoutine;
    }

    /* process list of aio requests */
    lio_context->io_issued = uap->nent;
    lio_context->io_waiter = uap->mode == LIO_WAIT ? 1 : 0; /* Should it be freed by last AIO */
    for ( i = 0; i < uap->nent; i++ ) {
        user_addr_t my_aiocbp;
        aio_workq_entry     *entryp;

        *(entryp_listp + i) = NULL;
        my_aiocbp = *(aiocbpp + i);

        /* NULL elements are legal so check for 'em */
        if ( my_aiocbp == USER_ADDR_NULL ) {
            aio_proc_lock_spin(p);
            lio_context->io_issued--;
            aio_proc_unlock(p);
            continue;
        }

        /*
         * We use lio_context to mark IO requests for delayed completion
         * processing which means we wait until all IO requests in the
         * group have completed before we either return to the caller
         * when mode is LIO_WAIT or signal user when mode is LIO_NOWAIT.
         *
         * We use the address of the lio_context for this, since it is
         * unique in the address space.
         */
        result = lio_create_entry( p, my_aiocbp, lio_context, (entryp_listp + i) );
        if ( result != 0 && call_result == -1 )
            call_result = result;

        /* NULL elements are legal so check for 'em */
        entryp = *(entryp_listp + i);
        if ( entryp == NULL ) {
            aio_proc_lock_spin(p);
            lio_context->io_issued--;
            aio_proc_unlock(p);
            continue;
        }

        if ( uap->mode == LIO_NOWAIT ) {
            /* Set signal handler, if any */
            entryp->aiocb.aio_sigevent = aiosigev;
        } else {
            /* flag that this thread blocks pending completion */
            entryp->flags |= AIO_LIO_NOTIFY;
        }

        /* check our aio limits to throttle bad or rude user land behavior */
        old_count = aio_increment_total_count();

        aio_proc_lock_spin(p);
        if ( old_count >= aio_max_requests ||
             aio_get_process_count( entryp->procp ) >= aio_max_requests_per_process ||
             is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {

            lio_context->io_issued--;
            aio_proc_unlock(p);

            aio_decrement_total_count();

            if ( call_result == -1 )
                call_result = EAGAIN;
            aio_free_request(entryp);
            entryp_listp[i] = NULL;
            continue;
        }

        lck_mtx_convert_spin(aio_proc_mutex(p));
        aio_enqueue_work(p, entryp, 1);
        aio_proc_unlock(p);

        KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
                      (int)p, (int)entryp->uaiocbp, 0, 0, 0 );
    }

    if ( uap->mode == LIO_WAIT ) {
        aio_proc_lock_spin(p);
        while (lio_context->io_completed < lio_context->io_issued) {
            result = msleep(lio_context, aio_proc_mutex(p), PCATCH | PRIBIO | PSPIN, "lio_listio", 0);

            /* If we were interrupted, fail out (even if all finished) */
            if (result != 0) {
                call_result = EINTR;
                lio_context->io_waiter = 0;
                break;
            }
        }

        /* If all IOs have finished must free it */
        if (lio_context->io_completed == lio_context->io_issued) {
            free_context = TRUE;
        }

        aio_proc_unlock(p);
    }

    /* call_result == -1 means we had no trouble queueing up requests */
    if ( call_result == -1 ) {
        call_result = 0;
        *retval = 0;
    }

ExitRoutine:
    if ( entryp_listp != NULL )
        FREE( entryp_listp, M_TEMP );
    if ( aiocbpp != NULL )
        FREE( aiocbpp, M_TEMP );
    if ((lio_context != NULL) && ((lio_context->io_issued == 0) || (free_context == TRUE))) {
        free_lio_context(lio_context);
    }

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END,
                  (int)p, call_result, 0, 0, 0 );

    return( call_result );

} /* lio_listio */
/*
 * aio worker thread.  this is where all the real work gets done.
 * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
 * after new work is queued up.
 */
static void
aio_work_thread( void )
{
    aio_workq_entry     *entryp;
    int                 error;
    vm_map_t            currentmap;
    vm_map_t            oldmap = VM_MAP_NULL;
    task_t              oldaiotask = TASK_NULL;
    struct uthread      *uthreadp = NULL;

    for ( ;; ) {
        /*
         * returns with the entry ref'ed.
         * sleeps until work is available.
         */
        entryp = aio_get_some_work();

        KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START,
                      (int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0 );

        /*
         * Assume the target's address space identity for the duration
         * of the IO.  Note: don't need to have the entryp locked,
         * because the proc and map don't change until it's freed.
         */
        currentmap = get_task_map( (current_proc())->task );
        if ( currentmap != entryp->aio_map ) {
            uthreadp = (struct uthread *) get_bsdthread_info(current_thread());
            oldaiotask = uthreadp->uu_aio_task;
            uthreadp->uu_aio_task = entryp->procp->task;
            oldmap = vm_map_switch( entryp->aio_map );
        }

        if ( (entryp->flags & AIO_READ) != 0 ) {
            error = do_aio_read( entryp );
        }
        else if ( (entryp->flags & AIO_WRITE) != 0 ) {
            error = do_aio_write( entryp );
        }
        else if ( (entryp->flags & (AIO_FSYNC | AIO_DSYNC)) != 0 ) {
            error = do_aio_fsync( entryp );
        }
        else {
            printf( "%s - unknown aio request - flags 0x%02X \n",
                    __FUNCTION__, entryp->flags );
            error = EINVAL;
        }

        /* Restore old map */
        if ( currentmap != entryp->aio_map ) {
            (void) vm_map_switch( oldmap );
            uthreadp->uu_aio_task = oldaiotask;
        }

        KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END,
                      (int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval,
                      entryp->returnval, 0 );

        /* Set the result of the operation */
        aio_entry_lock_spin(entryp);
        entryp->errorval = error;
        aio_entry_unlock(entryp);

        /* we're done with the IO request so pop it off the active queue and */
        /* push it on the done queue */
        aio_proc_lock(entryp->procp);
        aio_proc_move_done_locked(entryp->procp, entryp);
        aio_proc_unlock(entryp->procp);

        OSDecrementAtomic(&aio_anchor.aio_inflight_count);

        /* remove our reference to the user land map. */
        if ( VM_MAP_NULL != entryp->aio_map ) {
            vm_map_t    my_map;

            my_map = entryp->aio_map;
            entryp->aio_map = VM_MAP_NULL;
            vm_map_deallocate( my_map );
        }

        /* Provide notifications */
        do_aio_completion( entryp );

        /* Will free if needed */
        aio_entry_unref(entryp);

    } /* for ( ;; ) */

    /* NOT REACHED */

} /* aio_work_thread */
/*
 * aio_get_some_work - get the next async IO request that is ready to be executed.
 * aio_fsync complicates matters a bit since we cannot do the fsync until all async
 * IO requests at the time the aio_fsync call came in have completed.
 * NOTE - AIO_LOCK must be held by caller
 */
static aio_workq_entry *
aio_get_some_work( void )
{
    aio_workq_entry     *entryp = NULL;
    aio_workq_t         queue = NULL;

    /* Just one queue for the moment.  In the future there will be many. */
    queue = &aio_anchor.aio_async_workqs[0];
    aio_workq_lock_spin(queue);
    if (queue->aioq_count == 0) {
        goto nowork;
    }

    /*
     * Hold the queue lock.
     *
     * pop some work off the work queue and add to our active queue
     * Always start with the queue lock held.
     */
    for ( ;; ) {
        /*
         * Pull off of work queue.  Once it's off, it can't be cancelled,
         * so we can take our ref once we drop the queue lock.
         */
        entryp = TAILQ_FIRST(&queue->aioq_entries);

        /*
         * If there's no work or only fsyncs that need delay, go to sleep
         * and then start anew from aio_work_thread
         */
        if (entryp == NULL) {
            goto nowork;
        }

        aio_workq_remove_entry_locked(queue, entryp);

        aio_workq_unlock(queue);

        /*
         * Check if it's an fsync that must be delayed.  No need to lock the entry;
         * that flag would have been set at initialization.
         */
        if ( (entryp->flags & AIO_FSYNC) != 0 ) {
            /*
             * Check for unfinished operations on the same file
             * in this proc's queue.
             */
            aio_proc_lock_spin(entryp->procp);
            if ( aio_delay_fsync_request( entryp ) ) {
                /* It needs to be delayed.  Put it back on the end of the work queue */
                KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) | DBG_FUNC_NONE,
                              (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

                aio_proc_unlock(entryp->procp);

                aio_workq_lock_spin(queue);
                aio_workq_add_entry_locked(queue, entryp);
                continue;
            }
            aio_proc_unlock(entryp->procp);
        }

        break;
    }

    aio_entry_ref(entryp);

    OSIncrementAtomic(&aio_anchor.aio_inflight_count);
    return( entryp );

nowork:
    /* We will wake up when someone enqueues something */
    wait_queue_assert_wait(queue->aioq_waitq, queue, THREAD_UNINT, 0);
    aio_workq_unlock(queue);
    thread_block( (thread_continue_t)aio_work_thread );

    /* NOT REACHED */
    return NULL;
}
/*
 * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed.
 * A big, simple hammer: only send it off if it's the most recently filed IO which has
 * not been completed.
 */
static boolean_t
aio_delay_fsync_request( aio_workq_entry *entryp )
{
    if (entryp == TAILQ_FIRST(&entryp->procp->p_aio_activeq)) {
        return FALSE;
    }

    return TRUE;
} /* aio_delay_fsync_request */
static aio_workq_entry *
aio_create_queue_entry(proc_t procp, user_addr_t aiocbp, void *group_tag, int kindOfIO)
{
    aio_workq_entry     *entryp;
    int                 result = 0;

    entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
    if ( entryp == NULL ) {
        result = EAGAIN;
        goto error_exit;
    }

    bzero( entryp, sizeof(*entryp) );

    /* fill in the rest of the aio_workq_entry */
    entryp->procp = procp;
    entryp->uaiocbp = aiocbp;
    entryp->flags |= kindOfIO;
    entryp->group_tag = group_tag;
    entryp->aio_map = VM_MAP_NULL;
    entryp->aio_refcount = 0;

    if ( proc_is64bit(procp) ) {
        struct user64_aiocb aiocb64;

        result = copyin( aiocbp, &aiocb64, sizeof(aiocb64) );
        if ( result == 0 )
            do_munge_aiocb_user64_to_user(&aiocb64, &entryp->aiocb);
    } else {
        struct user32_aiocb aiocb32;

        result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
        if ( result == 0 )
            do_munge_aiocb_user32_to_user( &aiocb32, &entryp->aiocb );
    }

    if ( result != 0 ) {
        result = EAGAIN;
        goto error_exit;
    }

    /* get a reference to the user land map in order to keep it around */
    entryp->aio_map = get_task_map( procp->task );
    vm_map_reference( entryp->aio_map );

    /* do some more validation on the aiocb and embedded file descriptor */
    result = aio_validate( entryp );

error_exit:
    if ( result && entryp != NULL ) {
        zfree( aio_workq_zonep, entryp );
        entryp = NULL;
    }

    return ( entryp );
}
/*
 * aio_queue_async_request - queue up an async IO request on our work queue then
 * wake up one of our worker threads to do the actual work.  We get a reference
 * to our caller's user land map in order to keep it around while we are
 * processing the request.
 */
static int
aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO)
{
    aio_workq_entry     *entryp;
    int                 result;
    int                 old_count;

    old_count = aio_increment_total_count();
    if (old_count >= aio_max_requests) {
        result = EAGAIN;
        goto error_noalloc;
    }

    entryp = aio_create_queue_entry( procp, aiocbp, 0, kindOfIO );
    if ( entryp == NULL ) {
        result = EAGAIN;
        goto error_noalloc;
    }

    aio_proc_lock_spin(procp);

    if ( is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
        result = EAGAIN;
        goto error_exit;
    }

    /* check our aio limits to throttle bad or rude user land behavior */
    if (aio_get_process_count( procp ) >= aio_max_requests_per_process) {
        printf("aio_queue_async_request(): too many in flight for proc: %d.\n", procp->p_aio_total_count);
        result = EAGAIN;
        goto error_exit;
    }

    /* Add the IO to proc and work queues, wake up threads as appropriate */
    lck_mtx_convert_spin(aio_proc_mutex(procp));
    aio_enqueue_work(procp, entryp, 1);

    aio_proc_unlock(procp);

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
                  (int)procp, (int)aiocbp, 0, 0, 0 );

    return( 0 );

error_exit:
    /*
     * This entry has not been queued up so no worries about
     * unlocked state and aio_map
     */
    aio_proc_unlock(procp);
    aio_free_request(entryp);

error_noalloc:
    aio_decrement_total_count();

    return( result );

} /* aio_queue_async_request */
/*
 * lio_create_entry
 *
 * Allocate an aio_workq_entry and fill it in.  If all goes well return 0
 * and pass the aio_workq_entry pointer back to our caller.
 *
 * Parameters:  procp           The process making the request
 *              aiocbp          The aio context buffer pointer
 *              group_tag       The group tag used to indicate a
 *                              group of operations has completed
 *              entrypp         Pointer to the pointer to receive the
 *                              address of the created aio_workq_entry
 *
 * Returns:     0               Successfully created
 *              EAGAIN          Try again (usually resource shortage)
 *
 * Notes:       We get a reference to our caller's user land map in order
 *              to keep it around while we are processing the request.
 *
 *              lio_listio calls behave differently at completion: they do
 *              completion notification when all async IO requests have
 *              completed.  We use group_tag to tag IO requests that behave
 *              in the delay notification manner.
 *
 *              All synchronous operations are considered to not have a
 *              signal routine associated with them (sigp == USER_ADDR_NULL).
 */
static int
lio_create_entry(proc_t procp, user_addr_t aiocbp, void *group_tag,
                 aio_workq_entry **entrypp)
{
    aio_workq_entry     *entryp;
    int                 result;

    entryp = aio_create_queue_entry( procp, aiocbp, group_tag, AIO_LIO );
    if ( entryp == NULL ) {
        result = EAGAIN;
        goto error_exit;
    }

    /*
     * Look for lio_listio LIO_NOP requests and ignore them; this is
     * not really an error, but we need to free our aio_workq_entry.
     */
    if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
        result = 0;
        goto error_exit;
    }

    *entrypp = entryp;
    return( 0 );

error_exit:
    if ( entryp != NULL ) {
        /*
         * This entry has not been queued up so no worries about
         * unlocked state and aio_map
         */
        aio_free_request(entryp);
    }

    return( result );

} /* lio_create_entry */
/*
 * aio_free_request - remove our reference on the user land map and
 * free the work queue entry resources.  The entry is off all lists
 * and has zero refcount, so no one can have a pointer to it.
 */
static int
aio_free_request(aio_workq_entry *entryp)
{
    /* remove our reference to the user land map. */
    if ( VM_MAP_NULL != entryp->aio_map ) {
        vm_map_deallocate(entryp->aio_map);
    }

    entryp->aio_refcount = -1; /* A bit of poisoning in case of bad refcounting. */

    zfree( aio_workq_zonep, entryp );

    return( 0 );

} /* aio_free_request */
/*
 * aio_validate - validate the aiocb passed in by one of the aio syscalls.
 */
static int
aio_validate( aio_workq_entry *entryp )
{
    struct fileproc     *fp;
    int                 flag;
    int                 result;

    result = 0;

    if ( (entryp->flags & AIO_LIO) != 0 ) {
        if ( entryp->aiocb.aio_lio_opcode == LIO_READ )
            entryp->flags |= AIO_READ;
        else if ( entryp->aiocb.aio_lio_opcode == LIO_WRITE )
            entryp->flags |= AIO_WRITE;
        else if ( entryp->aiocb.aio_lio_opcode == LIO_NOP )
            return( 0 );
        else
            return( EINVAL );
    }

    flag = FREAD;
    if ( (entryp->flags & (AIO_WRITE | AIO_FSYNC | AIO_DSYNC)) != 0 ) {
        flag = FWRITE;
    }

    if ( (entryp->flags & (AIO_READ | AIO_WRITE)) != 0 ) {
        if ( entryp->aiocb.aio_nbytes > INT_MAX       ||
             entryp->aiocb.aio_buf == USER_ADDR_NULL  ||
             entryp->aiocb.aio_offset < 0 )
            return( EINVAL );
    }

    /*
     * validate aiocb.aio_sigevent.  at this point we only support
     * sigev_notify equal to SIGEV_SIGNAL or SIGEV_NONE.  this means
     * sigev_value, sigev_notify_function, and sigev_notify_attributes
     * are ignored, since SIGEV_THREAD is unsupported.  This is consistent
     * with no [RTS] (RealTime Signal) option group support.
     */
    switch ( entryp->aiocb.aio_sigevent.sigev_notify ) {
    case SIGEV_SIGNAL:
        {
        int     signum;

        /* make sure we have a valid signal number */
        signum = entryp->aiocb.aio_sigevent.sigev_signo;
        if ( signum <= 0 || signum >= NSIG ||
             signum == SIGKILL || signum == SIGSTOP )
            return (EINVAL);
        }
        break;

    case SIGEV_NONE:
        break;

    case SIGEV_THREAD:
        /* Unsupported [RTS] */

    default:
        return (EINVAL);
    }

    /* validate the file descriptor and that the file was opened
     * for the appropriate read / write access.
     */
    proc_fdlock(entryp->procp);

    result = fp_lookup( entryp->procp, entryp->aiocb.aio_fildes, &fp, 1 );
    if ( result == 0 ) {
        if ( (fp->f_fglob->fg_flag & flag) == 0 ) {
            /* we don't have read or write access */
            result = EBADF;
        }
        else if ( fp->f_fglob->fg_type != DTYPE_VNODE ) {
            /* this is not a file */
            result = ESPIPE;
        } else
            fp->f_flags |= FP_AIOISSUED;

        fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 1);
    }
    else {
        result = EBADF;
    }

    proc_fdunlock(entryp->procp);

    return( result );

} /* aio_validate */
static int
aio_increment_total_count()
{
    return OSIncrementAtomic(&aio_anchor.aio_total_count);
}

static int
aio_decrement_total_count()
{
    int old = OSDecrementAtomic(&aio_anchor.aio_total_count);
    if (old <= 0) {
        panic("Negative total AIO count!\n");
    }

    return old;
}

static int
aio_get_process_count(proc_t procp)
{
    return procp->p_aio_total_count;

} /* aio_get_process_count */

static int
aio_get_all_queues_count( void )
{
    return aio_anchor.aio_total_count;

} /* aio_get_all_queues_count */
/*
 * do_aio_completion.  Handle async IO completion.
 */
static void
do_aio_completion( aio_workq_entry *entryp )
{
	boolean_t		lastLioCompleted = FALSE;
	aio_lio_context		*lio_context = NULL;
	int			waiter = 0;

	lio_context = (aio_lio_context *)entryp->group_tag;

	if (lio_context != NULL) {

		aio_proc_lock_spin(entryp->procp);

		/* Account for this I/O completing. */
		lio_context->io_completed++;

		/* Are we done with this lio context? */
		if (lio_context->io_issued == lio_context->io_completed) {
			lastLioCompleted = TRUE;
		}

		waiter = lio_context->io_waiter;

		/* explicit wakeup of lio_listio() waiting in LIO_WAIT */
		if ((entryp->flags & AIO_LIO_NOTIFY) && (lastLioCompleted) && (waiter != 0)) {
			/* wake up the waiter */
			wakeup(lio_context);
		}

		aio_proc_unlock(entryp->procp);
	}

	if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
	     (entryp->flags & AIO_DISABLE) == 0 ) {

		boolean_t	performSignal = FALSE;

		if (lio_context == NULL) {
			performSignal = TRUE;
		}
		else {
			/*
			 * If this was the last request in the group and a signal
			 * is desired, send one.
			 */
			performSignal = lastLioCompleted;
		}

		if (performSignal) {

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) | DBG_FUNC_NONE,
				(int)entryp->procp, (int)entryp->uaiocbp,
				entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 );

			psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );
		}
	}

	if ((entryp->flags & AIO_EXIT_WAIT) && (entryp->flags & AIO_CLOSE_WAIT)) {
		panic("Close and exit flags set at the same time\n");
	}

	/*
	 * need to handle case where a process is trying to exit, exec, or
	 * close and is currently waiting for active aio requests to complete.
	 * If AIO_CLEANUP_WAIT is set then we need to look to see if there are any
	 * other requests in the active queue for this process.  If there are
	 * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel.
	 * If there are some still active then do nothing - we only want to
	 * wakeup when all active aio requests for the process are complete.
	 *
	 * Don't need to lock the entry or proc to check the cleanup flag.  It can
	 * only be set for cancellation while the entryp is still on a proc list;
	 * now it's off, so that flag is already set if it's going to be.
	 */
	if ( (entryp->flags & AIO_EXIT_WAIT) != 0 ) {
		int		active_requests;

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
			(int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

		aio_proc_lock_spin(entryp->procp);
		active_requests = aio_active_requests_for_process( entryp->procp );
		if ( active_requests < 1 ) {
			/*
			 * no active aio requests for this process, continue exiting.  In this
			 * case, there should be no one else waiting on the proc in AIO...
			 */
			wakeup_one((caddr_t)&entryp->procp->AIO_CLEANUP_SLEEP_CHAN);
			aio_proc_unlock(entryp->procp);

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
				(int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
		} else {
			aio_proc_unlock(entryp->procp);
		}
	}

	if ( (entryp->flags & AIO_CLOSE_WAIT) != 0 ) {
		int		active_requests;

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
			(int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

		aio_proc_lock_spin(entryp->procp);
		active_requests = aio_proc_active_requests_for_file( entryp->procp, entryp->aiocb.aio_fildes );
		if ( active_requests < 1 ) {
			/* Can't wakeup_one(); multiple closes might be in progress. */
			wakeup(&entryp->procp->AIO_CLEANUP_SLEEP_CHAN);
			aio_proc_unlock(entryp->procp);

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
				(int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
		} else {
			aio_proc_unlock(entryp->procp);
		}
	}

	/*
	 * A thread in aio_suspend() wants to know about completed IOs.  If it checked
	 * the done list before we moved our AIO there, then it already asserted its
	 * wait, and we can wake it up without holding the lock.  If it checked the
	 * list after we did our move, then it has already seen the AIO that we moved.
	 * Either way, we can do our wakeup without holding the lock.
	 */
	wakeup( (caddr_t) &entryp->procp->AIO_SUSPEND_SLEEP_CHAN );
	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE,
		(int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

	/*
	 * free the LIO context if the last lio completed and no thread is
	 * waiting for it.
	 */
	if (lastLioCompleted && (waiter == 0))
		free_lio_context(lio_context);

} /* do_aio_completion */
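
/*
 * Example (illustrative sketch only): the user-space waiter that the
 * AIO_SUSPEND_SLEEP_CHAN wakeup above serves -- a thread blocked in
 * aio_suspend() until a request completes, which then collects the result
 * with aio_error()/aio_return().  Standard POSIX <aio.h>; error handling
 * omitted.
 */
#if 0	/* user-space illustration; not compiled as part of the kernel */
#include <aio.h>
#include <errno.h>

static ssize_t
wait_for_one(struct aiocb *cbp)
{
	const struct aiocb	*list[1] = { cbp };

	/* blocks until do_aio_completion() wakes the suspend channel */
	while (aio_error(cbp) == EINPROGRESS)
		(void)aio_suspend(list, 1, NULL);

	/* reap the request; after this the kernel entry is released */
	return (aio_return(cbp));
}
#endif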
static int
do_aio_read( aio_workq_entry *entryp )
{
	struct fileproc		*fp;
	int			error;
	struct vfs_context	context;

	if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp, 0)) )
		return(error);
	if ( (fp->f_fglob->fg_flag & FREAD) == 0 ) {
		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
		return(EBADF);
	}

	/*
	 * Needs vfs_context_t from vfs_context_create() in entryp!
	 */
	context.vc_thread = proc_thread(entryp->procp);	/* XXX */
	context.vc_ucred = fp->f_fglob->fg_cred;

	error = dofileread(&context, fp,
			entryp->aiocb.aio_buf,
			entryp->aiocb.aio_nbytes,
			entryp->aiocb.aio_offset, FOF_OFFSET,
			&entryp->returnval);
	fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);

	return( error );

} /* do_aio_read */
static int
do_aio_write( aio_workq_entry *entryp )
{
	struct fileproc		*fp;
	int			error;
	int			flags;
	struct vfs_context	context;

	if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp, 0)) )
		return(error);
	if ( (fp->f_fglob->fg_flag & FWRITE) == 0 ) {
		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
		return(EBADF);
	}

	flags = FOF_PCRED;
	if ( (fp->f_fglob->fg_flag & O_APPEND) == 0 ) {
		flags |= FOF_OFFSET;
	}

	/*
	 * Needs vfs_context_t from vfs_context_create() in entryp!
	 */
	context.vc_thread = proc_thread(entryp->procp);	/* XXX */
	context.vc_ucred = fp->f_fglob->fg_cred;

	/* NB: tell dofilewrite the offset, and to use the proc cred */
	error = dofilewrite(&context,
			fp,
			entryp->aiocb.aio_buf,
			entryp->aiocb.aio_nbytes,
			entryp->aiocb.aio_offset,
			flags,
			&entryp->returnval);

	fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);

	return( error );

} /* do_aio_write */
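
/*
 * Example (illustrative sketch only): the user-visible side of the O_APPEND
 * handling above -- when the descriptor was opened with O_APPEND, FOF_OFFSET
 * is not passed down, so the async write appends to end-of-file and
 * aio_offset is effectively ignored; otherwise the write lands at aio_offset.
 * Standard POSIX <aio.h>; error handling omitted.
 */
#if 0	/* user-space illustration; not compiled as part of the kernel */
#include <aio.h>
#include <string.h>

static int
queue_append_write(int append_fd, struct aiocb *cbp, char *msg, size_t len)
{
	memset(cbp, 0, sizeof(*cbp));
	cbp->aio_fildes = append_fd;		/* opened with O_WRONLY | O_APPEND */
	cbp->aio_buf = msg;
	cbp->aio_nbytes = len;
	cbp->aio_offset = 0;			/* ignored for O_APPEND descriptors */

	return (aio_write(cbp));
}
#endif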
/*
 * aio_active_requests_for_process - return number of active async IO
 * requests for the given process.
 */
static int
aio_active_requests_for_process(proc_t procp)
{
	return( procp->p_aio_active_count );

} /* aio_active_requests_for_process */
/*
 * Called with the proc locked.
 */
static int
aio_proc_active_requests_for_file(proc_t procp, int fd)
{
	int			count = 0;
	aio_workq_entry		*entryp;

	TAILQ_FOREACH(entryp, &procp->p_aio_activeq, aio_proc_link) {
		if (entryp->aiocb.aio_fildes == fd) {
			count++;
		}
	}

	return count;

} /* aio_proc_active_requests_for_file */
static int
do_aio_fsync( aio_workq_entry *entryp )
{
	struct vfs_context	context;
	struct vnode		*vp;
	struct fileproc		*fp;
	int			sync_flag;
	int			error;

	/*
	 * We are never called unless either AIO_FSYNC or AIO_DSYNC are set.
	 *
	 * If AIO_DSYNC is set, we can tell the lower layers that it is OK
	 * to mark for update the metadata not strictly necessary for data
	 * retrieval, rather than forcing it to disk.
	 *
	 * If AIO_FSYNC is set, we also have to wait until metadata not strictly
	 * necessary for data retrieval (e.g. atime, mtime, ctime) has been
	 * committed to stable storage.
	 *
	 * Metadata necessary for data retrieval must be committed to stable
	 * storage in either case (file length, etc.).
	 */
	if (entryp->flags & AIO_FSYNC)
		sync_flag = MNT_WAIT;
	else
		sync_flag = MNT_DWAIT;

	error = fp_getfvp( entryp->procp, entryp->aiocb.aio_fildes, &fp, &vp);
	if ( error == 0 ) {
		if ( (error = vnode_getwithref(vp)) ) {
			fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
			entryp->returnval = -1;
			return(error);
		}
		context.vc_thread = current_thread();
		context.vc_ucred = fp->f_fglob->fg_cred;

		error = VNOP_FSYNC( vp, sync_flag, &context);

		(void)vnode_put(vp);

		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
	}
	if ( error != 0 )
		entryp->returnval = -1;

	return( error );

} /* do_aio_fsync */
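
/*
 * Example (illustrative sketch only): the user-space requests that reach the
 * code above.  aio_fsync(O_SYNC, ...) arrives with AIO_FSYNC set and maps to
 * MNT_WAIT (all metadata flushed); O_DSYNC, where the aio_fsync() entry point
 * accepts it (POSIX specifies both, support varies by release), maps to
 * AIO_DSYNC / MNT_DWAIT.  Standard POSIX <aio.h>; error handling omitted.
 */
#if 0	/* user-space illustration; not compiled as part of the kernel */
#include <aio.h>
#include <fcntl.h>
#include <string.h>

static int
sync_file(int fd, struct aiocb *cbp, int data_only)
{
	memset(cbp, 0, sizeof(*cbp));
	cbp->aio_fildes = fd;

	/* O_SYNC -> AIO_FSYNC/MNT_WAIT; O_DSYNC (if accepted) -> AIO_DSYNC/MNT_DWAIT */
	return (aio_fsync(data_only ? O_DSYNC : O_SYNC, cbp));
}
#endif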
/*
 * is_already_queued - runs through our queues to see if the given
 * aiocbp / process is there.  Returns TRUE if there is a match
 * on any of our aio queues.
 *
 * Called with proc aio lock held (can be held spin)
 */
static boolean_t
is_already_queued(proc_t procp,
		  user_addr_t aiocbp)
{
	aio_workq_entry		*entryp;
	boolean_t		result;

	result = FALSE;

	/* look for matches on our queue of async IO requests that have completed */
	TAILQ_FOREACH( entryp, &procp->p_aio_doneq, aio_proc_link ) {
		if ( aiocbp == entryp->uaiocbp ) {
			result = TRUE;
			goto ExitThisRoutine;
		}
	}

	/* look for matches on our queue of active async IO requests */
	TAILQ_FOREACH( entryp, &procp->p_aio_activeq, aio_proc_link ) {
		if ( aiocbp == entryp->uaiocbp ) {
			result = TRUE;
			goto ExitThisRoutine;
		}
	}

ExitThisRoutine:
	return( result );

} /* is_already_queued */
static void
free_lio_context(aio_lio_context *context)
{
	OSDecrementAtomic(&lio_contexts_alloced);

	FREE( context, M_TEMP );

} /* free_lio_context */
/*
 * aio initialization
 */
__private_extern__ void
aio_init( void )
{
	int	i;

	aio_lock_grp_attr = lck_grp_attr_alloc_init();
	aio_proc_lock_grp = lck_grp_alloc_init("aio_proc", aio_lock_grp_attr);
	aio_entry_lock_grp = lck_grp_alloc_init("aio_entry", aio_lock_grp_attr);
	aio_queue_lock_grp = lck_grp_alloc_init("aio_queue", aio_lock_grp_attr);
	aio_lock_attr = lck_attr_alloc_init();

	lck_mtx_init(&aio_entry_mtx, aio_entry_lock_grp, aio_lock_attr);
	lck_mtx_init(&aio_proc_mtx, aio_proc_lock_grp, aio_lock_attr);

	aio_anchor.aio_inflight_count = 0;
	aio_anchor.aio_done_count = 0;
	aio_anchor.aio_total_count = 0;
	aio_anchor.aio_num_workqs = AIO_NUM_WORK_QUEUES;

	for (i = 0; i < AIO_NUM_WORK_QUEUES; i++) {
		aio_workq_init(&aio_anchor.aio_async_workqs[i]);
	}

	i = sizeof( aio_workq_entry );
	aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );

	_aio_create_worker_threads( aio_worker_threads );

} /* aio_init */
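
/*
 * Example (illustrative sketch only): inspecting the tunables consumed above
 * (aio_max_requests, the per-process limit, and aio_worker_threads) from user
 * space, assuming they are exported as the kern.aiomax, kern.aioprocmax and
 * kern.aiothreads sysctls as on typical releases.
 */
#if 0	/* user-space illustration; not compiled as part of the kernel */
#include <sys/sysctl.h>
#include <stdio.h>

static void
show_aio_limits(void)
{
	int	val;
	size_t	len = sizeof(val);

	if (sysctlbyname("kern.aiomax", &val, &len, NULL, 0) == 0)
		printf("system-wide aio limit: %d\n", val);
	len = sizeof(val);
	if (sysctlbyname("kern.aioprocmax", &val, &len, NULL, 0) == 0)
		printf("per-process aio limit: %d\n", val);
	len = sizeof(val);
	if (sysctlbyname("kern.aiothreads", &val, &len, NULL, 0) == 0)
		printf("aio worker threads:    %d\n", val);
}
#endif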
/*
 * aio worker threads created here.
 */
__private_extern__ void
_aio_create_worker_threads( int num )
{
	int	i;

	/* create some worker threads to handle the async IO requests */
	for ( i = 0; i < num; i++ ) {
		thread_t	myThread;

		if ( KERN_SUCCESS != kernel_thread_start((thread_continue_t)aio_work_thread, NULL, &myThread) ) {
			printf( "%s - failed to create a work thread \n", __FUNCTION__ );
		}
		else
			thread_deallocate(myThread);
	}

	return;

} /* _aio_create_worker_threads */
/*
 * Return the current activation utask
 */
task_t
get_aiotask(void)
{
	return ((struct uthread *)get_bsdthread_info(current_thread()))->uu_aio_task;
}
/*
 * In the case of an aiocb from a 32-bit process we need to expand some
 * longs and pointers to the correct sizes in order to let downstream code
 * always work on the same type of aiocb (in our case that is a user_aiocb).
 */
static void
do_munge_aiocb_user32_to_user( struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
{
	the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
	the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
	the_user_aiocbp->aio_buf = CAST_USER_ADDR_T(my_aiocbp->aio_buf);
	the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
	the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
	the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;

	/* special case here.  since we do not know if sigev_value is an */
	/* int or a ptr we do NOT cast the ptr to a user_addr_t.  This */
	/* means if we send this info back to user space we need to remember */
	/* sigev_value was not expanded for the 32-bit case. */
	/* NOTE - this does NOT affect us since we don't support sigev_value */
	/* yet in the aio context. */

	the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
	the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
	the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
		my_aiocbp->aio_sigevent.sigev_value.sival_int;
	the_user_aiocbp->aio_sigevent.sigev_notify_function =
		CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function);
	the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
		CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_attributes);
}
/*
 * Similar for a 64-bit user process, so that we don't need to satisfy
 * the alignment constraints of the original user64_aiocb.
 */
static void
do_munge_aiocb_user64_to_user( struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
{
	the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
	the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
	the_user_aiocbp->aio_buf = my_aiocbp->aio_buf;
	the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
	the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
	the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;

	the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
	the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
	the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
		my_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int;
	the_user_aiocbp->aio_sigevent.sigev_notify_function =
		my_aiocbp->aio_sigevent.sigev_notify_function;
	the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
		my_aiocbp->aio_sigevent.sigev_notify_attributes;
}