/*
 * Copyright (c) 2003-2014 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * todo:
 * 1) ramesh is looking into how to replace taking a reference on
 *    the user's map (vm_map_reference()) since it is believed that
 *    would not hold the process for us.
 * 2) david is looking into a way for us to set the priority of the
 *    worker threads to match that of the user's thread when the
 *    async IO was queued.
 */

/*
 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
 */
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file_internal.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/vnode_internal.h>
#include <sys/malloc.h>
#include <sys/mount_internal.h>
#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/sysctl.h>
#include <sys/unistd.h>

#include <sys/aio_kern.h>
#include <sys/sysproto.h>

#include <machine/limits.h>

#include <mach/mach_types.h>
#include <kern/kern_types.h>
#include <kern/zalloc.h>
#include <kern/task.h>
#include <kern/sched_prim.h>

#include <vm/vm_map.h>

#include <libkern/OSAtomic.h>

#include <sys/kdebug.h>
#define AIO_work_queued			1
#define AIO_worker_wake			2
#define AIO_completion_sig		3
#define AIO_completion_cleanup_wait	4
#define AIO_completion_cleanup_wake	5
#define AIO_completion_suspend_wake	6
#define AIO_fsync_delay			7
#define AIO_cancel_async_workq		11
#define AIO_cancel_sync_workq		12
#define AIO_cancel_activeq		13
#define AIO_cancel_doneq		14
#define AIO_error_val			61
#define AIO_error_activeq		62
#define AIO_error_workq			63
#define AIO_return_val			71
#define AIO_return_activeq		72
#define AIO_return_workq		73
#define AIO_exit_sleep			91
#define AIO_close			100
#define AIO_close_sleep			101
#define AIO_suspend			110
#define AIO_suspend_sleep		111
#define AIO_worker_thread		120

#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
/*
 * aio requests queue up on the aio_async_workq or lio_sync_workq (for
 * lio_listio LIO_WAIT).  Requests then move to the per process aio_activeq
 * (proc.aio_activeq) when one of our worker threads starts the IO.
 * And finally, requests move to the per process aio_doneq (proc.aio_doneq)
 * when the IO request completes.  The request remains on aio_doneq until
 * the user process calls aio_return or the process exits, either way that is
 * our trigger to release aio resources.
 */
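/*
 * Summary of the request lifecycle described above (editorial sketch; the
 * queue and function names are the ones used in this file):
 *
 *	aio_async_workq / lio_sync_workq	queued, not yet picked up
 *		|
 *		v  (worker thread dequeues it, see aio_get_some_work())
 *	proc.p_aio_activeq			IO in flight
 *		|
 *		v  (do_aio_completion())
 *	proc.p_aio_doneq			waiting for aio_return() / exit
 */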
typedef struct aio_workq {
	TAILQ_HEAD(, aio_workq_entry)	aioq_entries;
	int				aioq_count;
	lck_mtx_t			aioq_mtx;
	wait_queue_t			aioq_waitq;
} *aio_workq_t;

#define AIO_NUM_WORK_QUEUES 1
struct aio_anchor_cb
{
	volatile int32_t	aio_inflight_count;	/* entries that have been taken from a workq */
	volatile int32_t	aio_done_count;		/* entries on all done queues (proc.aio_doneq) */
	volatile int32_t	aio_total_count;	/* total extant entries */

	/* Hash table of queues here */
	struct aio_workq	aio_async_workqs[AIO_NUM_WORK_QUEUES];
};
typedef struct aio_anchor_cb aio_anchor_cb;

struct aio_lio_context
{
	int		io_waiter;
	int		io_issued;
	int		io_completed;
};
typedef struct aio_lio_context aio_lio_context;
/*
 * Notes on aio sleep / wake channels.
 * We currently pick a couple of fields within the proc structure that give us
 * sleep channels that do not collide with any other kernel routines.
 * At this time, for binary compatibility reasons, we cannot create new proc fields.
 */
#define AIO_SUSPEND_SLEEP_CHAN	p_aio_active_count
#define AIO_CLEANUP_SLEEP_CHAN	p_aio_total_count
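/*
 * Editorial note (not from the original source): these macros name proc
 * fields only so that their *addresses* can serve as unique wait channels;
 * the field values are never interpreted by the sleep/wakeup machinery.
 * A typical pairing in this file looks like:
 *
 *	msleep(&p->AIO_SUSPEND_SLEEP_CHAN, aio_proc_mutex(p), PCATCH | PWAIT, "aio_suspend", 0);
 *	...
 *	wakeup(&p->AIO_SUSPEND_SLEEP_CHAN);	// from the completion path
 */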
#define ASSERT_AIO_FROM_PROC(aiop, theproc)	\
	if ((aiop)->procp != (theproc)) {	\
		panic("AIO on a proc list that does not belong to that proc.\n"); \
	}

static void		aio_proc_lock(proc_t procp);
static void		aio_proc_lock_spin(proc_t procp);
static void		aio_proc_unlock(proc_t procp);
static lck_mtx_t*	aio_proc_mutex(proc_t procp);
static void		aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp);
static void		aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp);
static int		aio_get_process_count(proc_t procp);
static int		aio_active_requests_for_process(proc_t procp);
static int		aio_proc_active_requests_for_file(proc_t procp, int fd);
static boolean_t	is_already_queued(proc_t procp, user_addr_t aiocbp);
static boolean_t	should_cancel(aio_workq_entry *entryp, user_addr_t aiocbp, int fd);

static void		aio_entry_lock(aio_workq_entry *entryp);
static void		aio_entry_lock_spin(aio_workq_entry *entryp);
static aio_workq_t	aio_entry_workq(aio_workq_entry *entryp);
static lck_mtx_t*	aio_entry_mutex(__unused aio_workq_entry *entryp);
static void		aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
static void		aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
static void		aio_entry_ref_locked(aio_workq_entry *entryp);
static void		aio_entry_unref_locked(aio_workq_entry *entryp);
static void		aio_entry_ref(aio_workq_entry *entryp);
static void		aio_entry_unref(aio_workq_entry *entryp);
static void		aio_entry_update_for_cancel(aio_workq_entry *entryp, boolean_t cancelled,
				int wait_for_completion, boolean_t disable_notification);
static int		aio_entry_try_workq_remove(aio_workq_entry *entryp);
static boolean_t	aio_delay_fsync_request( aio_workq_entry *entryp );
static int		aio_free_request(aio_workq_entry *entryp);

static void		aio_workq_init(aio_workq_t wq);
static void		aio_workq_lock_spin(aio_workq_t wq);
static void		aio_workq_unlock(aio_workq_t wq);
static lck_mtx_t*	aio_workq_mutex(aio_workq_t wq);

static void		aio_work_thread( void );
static aio_workq_entry	*aio_get_some_work( void );

static int		aio_get_all_queues_count( void );
static int		aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO );
static int		aio_validate( aio_workq_entry *entryp );
static int		aio_increment_total_count(void);
static int		aio_decrement_total_count(void);

static int		do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp, int wait_for_completion, boolean_t disable_notification);
static void		do_aio_completion( aio_workq_entry *entryp );
static int		do_aio_fsync( aio_workq_entry *entryp );
static int		do_aio_read( aio_workq_entry *entryp );
static int		do_aio_write( aio_workq_entry *entryp );
static void		do_munge_aiocb_user32_to_user( struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
static void		do_munge_aiocb_user64_to_user( struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
static int		lio_create_entry(proc_t procp,
					 user_addr_t aiocbp,
					 void *group_tag,
					 aio_workq_entry **entrypp );
static aio_workq_entry	*aio_create_queue_entry(proc_t procp,
					user_addr_t aiocbp,
					void *group_tag,
					int kindOfIO);
static user_addr_t	*aio_copy_in_list(proc_t procp, user_addr_t aiocblist, int nent);
static void		free_lio_context(aio_lio_context* context);
static void		aio_enqueue_work( proc_t procp, aio_workq_entry *entryp, int proc_locked);

#define ASSERT_AIO_PROC_LOCK_OWNED(p)	lck_mtx_assert(aio_proc_mutex((p)), LCK_MTX_ASSERT_OWNED)
#define ASSERT_AIO_WORKQ_LOCK_OWNED(q)	lck_mtx_assert(aio_workq_mutex((q)), LCK_MTX_ASSERT_OWNED)
#define ASSERT_AIO_ENTRY_LOCK_OWNED(e)	lck_mtx_assert(aio_entry_mutex((e)), LCK_MTX_ASSERT_OWNED)
/*
 * EXTERNAL PROTOTYPES
 */

/* in ...bsd/kern/sys_generic.c */
extern int dofileread(vfs_context_t ctx, struct fileproc *fp,
			user_addr_t bufp, user_size_t nbyte,
			off_t offset, int flags, user_ssize_t *retval );
extern int dofilewrite(vfs_context_t ctx, struct fileproc *fp,
			user_addr_t bufp, user_size_t nbyte, off_t offset,
			int flags, user_ssize_t *retval );

static uint32_t lio_contexts_alloced = 0;

/*
 * aio external global variables.
 */
extern int aio_max_requests;			/* AIO_MAX - configurable */
extern int aio_max_requests_per_process;	/* AIO_PROCESS_MAX - configurable */
extern int aio_worker_threads;			/* AIO_THREAD_COUNT - configurable */


/*
 * aio static variables.
 */
static aio_anchor_cb	aio_anchor;
static lck_grp_t	*aio_proc_lock_grp;
static lck_grp_t	*aio_entry_lock_grp;
static lck_grp_t	*aio_queue_lock_grp;
static lck_attr_t	*aio_lock_attr;
static lck_grp_attr_t	*aio_lock_grp_attr;
static struct zone	*aio_workq_zonep;
static lck_mtx_t	aio_entry_mtx;
static lck_mtx_t	aio_proc_mtx;
static void
aio_entry_lock(__unused aio_workq_entry *entryp)
{
	lck_mtx_lock(&aio_entry_mtx);
}

static void
aio_entry_lock_spin(__unused aio_workq_entry *entryp)
{
	lck_mtx_lock_spin(&aio_entry_mtx);
}

void
aio_entry_unlock(__unused aio_workq_entry *entryp)
{
	lck_mtx_unlock(&aio_entry_mtx);
}

static aio_workq_t
aio_entry_workq(__unused aio_workq_entry *entryp)
{
	return &aio_anchor.aio_async_workqs[0];
}

static lck_mtx_t*
aio_entry_mutex(__unused aio_workq_entry *entryp)
{
	return &aio_entry_mtx;
}

static void
aio_workq_init(aio_workq_t wq)
{
	TAILQ_INIT(&wq->aioq_entries);
	lck_mtx_init(&wq->aioq_mtx, aio_queue_lock_grp, aio_lock_attr);
	wq->aioq_waitq = wait_queue_alloc(SYNC_POLICY_FIFO);
}


/*
 * Can be passed a queue which is locked spin.
 */
static void
aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
{
	ASSERT_AIO_WORKQ_LOCK_OWNED(queue);

	if (entryp->aio_workq_link.tqe_prev == NULL) {
		panic("Trying to remove an entry from a work queue, but it is not on a queue\n");
	}

	TAILQ_REMOVE(&queue->aioq_entries, entryp, aio_workq_link);
	entryp->aio_workq_link.tqe_prev = NULL; /* Not on a workq */

	if (queue->aioq_count < 0) {
		panic("Negative count on a queue.\n");
	}
}

static void
aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
{
	ASSERT_AIO_WORKQ_LOCK_OWNED(queue);

	TAILQ_INSERT_TAIL(&queue->aioq_entries, entryp, aio_workq_link);
	if (queue->aioq_count < 0) {
		panic("Negative count on a queue.\n");
	}
}
static void
aio_proc_lock(proc_t procp)
{
	lck_mtx_lock(aio_proc_mutex(procp));
}

static void
aio_proc_lock_spin(proc_t procp)
{
	lck_mtx_lock_spin(aio_proc_mutex(procp));
}

static void
aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp)
{
	ASSERT_AIO_PROC_LOCK_OWNED(procp);

	TAILQ_REMOVE(&procp->p_aio_activeq, entryp, aio_proc_link );
	TAILQ_INSERT_TAIL( &procp->p_aio_doneq, entryp, aio_proc_link);
	procp->p_aio_active_count--;
	OSIncrementAtomic(&aio_anchor.aio_done_count);
}

static void
aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp)
{
	TAILQ_REMOVE(&procp->p_aio_doneq, entryp, aio_proc_link);
	OSDecrementAtomic(&aio_anchor.aio_done_count);
	aio_decrement_total_count();
	procp->p_aio_total_count--;
}

static void
aio_proc_unlock(proc_t procp)
{
	lck_mtx_unlock(aio_proc_mutex(procp));
}

static lck_mtx_t*
aio_proc_mutex(proc_t procp)
{
	return &procp->p_mlock;
}
static void
aio_entry_ref_locked(aio_workq_entry *entryp)
{
	ASSERT_AIO_ENTRY_LOCK_OWNED(entryp);

	if (entryp->aio_refcount < 0) {
		panic("AIO workq entry with a negative refcount.\n");
	}
	entryp->aio_refcount++;
}


/* Return 1 if you've freed it */
static void
aio_entry_unref_locked(aio_workq_entry *entryp)
{
	ASSERT_AIO_ENTRY_LOCK_OWNED(entryp);

	entryp->aio_refcount--;
	if (entryp->aio_refcount < 0) {
		panic("AIO workq entry with a negative refcount.\n");
	}
}

static void
aio_entry_ref(aio_workq_entry *entryp)
{
	aio_entry_lock_spin(entryp);
	aio_entry_ref_locked(entryp);
	aio_entry_unlock(entryp);
}

static void
aio_entry_unref(aio_workq_entry *entryp)
{
	aio_entry_lock_spin(entryp);
	aio_entry_unref_locked(entryp);

	if ((entryp->aio_refcount == 0) && ((entryp->flags & AIO_DO_FREE) != 0)) {
		aio_entry_unlock(entryp);
		aio_free_request(entryp);
	} else {
		aio_entry_unlock(entryp);
	}

	return;
}

static void
aio_entry_update_for_cancel(aio_workq_entry *entryp, boolean_t cancelled,
		int wait_for_completion, boolean_t disable_notification)
{
	aio_entry_lock_spin(entryp);

	if (cancelled) {
		aio_entry_ref_locked(entryp);
		entryp->errorval = ECANCELED;
		entryp->returnval = -1;
	}

	if ( wait_for_completion ) {
		entryp->flags |= wait_for_completion; /* flag for special completion processing */
	}

	if ( disable_notification ) {
		entryp->flags |= AIO_DISABLE; /* Don't want a signal */
	}

	aio_entry_unlock(entryp);
}
static int
aio_entry_try_workq_remove(aio_workq_entry *entryp)
{
	/* Can only be cancelled if it's still on a work queue */
	if (entryp->aio_workq_link.tqe_prev != NULL) {
		aio_workq_t queue;

		/* Will have to check again under the lock */
		queue = aio_entry_workq(entryp);
		aio_workq_lock_spin(queue);
		if (entryp->aio_workq_link.tqe_prev != NULL) {
			aio_workq_remove_entry_locked(queue, entryp);
			aio_workq_unlock(queue);
			return 1;
		} else {
			aio_workq_unlock(queue);
		}
	}

	return 0;
}

static void
aio_workq_lock_spin(aio_workq_t wq)
{
	lck_mtx_lock_spin(aio_workq_mutex(wq));
}

static void
aio_workq_unlock(aio_workq_t wq)
{
	lck_mtx_unlock(aio_workq_mutex(wq));
}

static lck_mtx_t*
aio_workq_mutex(aio_workq_t wq)
{
	return &wq->aioq_mtx;
}
/*
 * aio_cancel - attempt to cancel one or more async IO requests currently
 * outstanding against file descriptor uap->fd.  If uap->aiocbp is not
 * NULL then only one specific IO is cancelled (if possible).  If uap->aiocbp
 * is NULL then all outstanding async IO requests for the given file
 * descriptor are cancelled (if possible).
 */
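/*
 * A minimal user-space sketch of how this path is typically driven
 * (editorial illustration, not part of this file; "cb" is a previously
 * submitted struct aiocb on descriptor fd):
 *
 *	int r = aio_cancel(fd, &cb);
 *	if (r == AIO_CANCELED)    { ... pulled off the work queue, never ran ... }
 *	if (r == AIO_NOTCANCELED) { ... already in flight; poll aio_error(&cb) ... }
 *	if (r == AIO_ALLDONE)     { ... already complete; reap with aio_return(&cb) ... }
 */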
int
aio_cancel(proc_t p, struct aio_cancel_args *uap, int *retval )
{
	struct user_aiocb	my_aiocb;
	int			result;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START,
		      (int)p, (int)uap->aiocbp, 0, 0, 0 );

	/* quick check to see if there are any async IO requests queued up */
	if (aio_get_all_queues_count() < 1) {
		*retval = AIO_ALLDONE;
		/* ... */
	}

	if ( uap->aiocbp != USER_ADDR_NULL ) {
		if ( proc_is64bit(p) ) {
			struct user64_aiocb aiocb64;

			result = copyin( uap->aiocbp, &aiocb64, sizeof(aiocb64) );
			/* ... */
			do_munge_aiocb_user64_to_user(&aiocb64, &my_aiocb);
		} else {
			struct user32_aiocb aiocb32;

			result = copyin( uap->aiocbp, &aiocb32, sizeof(aiocb32) );
			/* ... */
			do_munge_aiocb_user32_to_user( &aiocb32, &my_aiocb );
		}

		/* NOTE - POSIX standard says a mismatch between the file */
		/* descriptor passed in and the file descriptor embedded in */
		/* the aiocb causes unspecified results.  We return EBADF in */
		/* that situation. */
		if ( uap->fd != my_aiocb.aio_fildes ) {
			/* ... */
		}
	}

	/* ... */
	result = do_aio_cancel_locked( p, uap->fd, uap->aiocbp, 0, FALSE );
	ASSERT_AIO_PROC_LOCK_OWNED(p);
	/* ... */

	if ( result != -1 ) {
		/* ... */
	}

	/* ... */

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END,
		      (int)p, (int)uap->aiocbp, result, 0, 0 );

	/* ... */
}
/*
 * _aio_close - internal function used to clean up async IO requests for
 * a file descriptor that is closing.
 */
__private_extern__ void
_aio_close(proc_t p, int fd )
{
	int		error;

	/* quick check to see if there are any async IO requests queued up */
	if (aio_get_all_queues_count() < 1) {
		return;
	}

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_START,
		      (int)p, fd, 0, 0, 0 );

	/* cancel all async IO requests on our todo queues for this file descriptor */
	aio_proc_lock(p);
	error = do_aio_cancel_locked( p, fd, 0, AIO_CLOSE_WAIT, FALSE );
	ASSERT_AIO_PROC_LOCK_OWNED(p);
	if ( error == AIO_NOTCANCELED ) {
		/*
		 * AIO_NOTCANCELED is returned when we find an aio request for this process
		 * and file descriptor on the active async IO queue.  Active requests cannot
		 * be cancelled so we must wait for them to complete.  We will get a special
		 * wake up call on our channel used to sleep for ALL active requests to
		 * complete.  This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
		 * when we must wait for all active aio requests.
		 */

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) | DBG_FUNC_NONE,
			      (int)p, fd, 0, 0, 0 );

		while (aio_proc_active_requests_for_file(p, fd) > 0) {
			msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_close", 0 );
		}
	}

	aio_proc_unlock(p);

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_END,
		      (int)p, fd, 0, 0, 0 );

	return;
}
/*
 * aio_error - return the error status associated with the async IO
 * request referred to by uap->aiocbp.  The error status is the errno
 * value that would be set by the corresponding IO request (read, write,
 * fdatasync, or sync).
 */
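/*
 * Editorial sketch (not from this file): user space typically polls this
 * after submission until the request leaves the EINPROGRESS state:
 *
 *	int err;
 *	while ((err = aio_error(&cb)) == EINPROGRESS)
 *		usleep(1000);			// or block in aio_suspend()
 *	ssize_t n = aio_return(&cb);		// reap exactly once
 */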
int
aio_error(proc_t p, struct aio_error_args *uap, int *retval )
{
	aio_workq_entry	*entryp;
	int		error;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_START,
		      (int)p, (int)uap->aiocbp, 0, 0, 0 );

	/* see if there are any aios to check */
	if (aio_get_all_queues_count() < 1) {
		/* ... */
	}

	/* ... */

	/* look for a match on our queue of async IO requests that have completed */
	TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
		if ( entryp->uaiocbp == uap->aiocbp ) {
			ASSERT_AIO_FROM_PROC(entryp, p);

			aio_entry_lock_spin(entryp);
			*retval = entryp->errorval;
			/* ... */
			aio_entry_unlock(entryp);
			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) | DBG_FUNC_NONE,
				      (int)p, (int)uap->aiocbp, *retval, 0, 0 );
			/* ... */
		}
	}

	/* look for a match on our queue of active async IO requests */
	TAILQ_FOREACH( entryp, &p->p_aio_activeq, aio_proc_link) {
		if ( entryp->uaiocbp == uap->aiocbp ) {
			ASSERT_AIO_FROM_PROC(entryp, p);
			*retval = EINPROGRESS;
			/* ... */
			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) | DBG_FUNC_NONE,
				      (int)p, (int)uap->aiocbp, *retval, 0, 0 );
			/* ... */
		}
	}

	/* ... */

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_END,
		      (int)p, (int)uap->aiocbp, error, 0, 0 );

	/* ... */
}
/*
 * aio_fsync - asynchronously force all IO operations associated
 * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
 * queued at the time of the call to the synchronized completion state.
 * NOTE - we do not support op O_DSYNC at this point since we do not support the
 * fdatasync() call.
 */
int
aio_fsync(proc_t p, struct aio_fsync_args *uap, int *retval )
{
	int		error;
	int		fsync_kind;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_START,
		      (int)p, (int)uap->aiocbp, uap->op, 0, 0 );

	/* 0 := O_SYNC for binary backward compatibility with Panther */
	if (uap->op == O_SYNC || uap->op == 0)
		fsync_kind = AIO_FSYNC;
	else if ( uap->op == O_DSYNC )
		fsync_kind = AIO_DSYNC;
	else {
		/* ... */
	}

	error = aio_queue_async_request( p, uap->aiocbp, fsync_kind );
	/* ... */

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_END,
		      (int)p, (int)uap->aiocbp, error, 0, 0 );

	return( error );
}
/*
 * aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
 * file descriptor (uap->aiocbp->aio_fildes) into the buffer
 * (uap->aiocbp->aio_buf).
 */
int
aio_read(proc_t p, struct aio_read_args *uap, int *retval )
{
	int		error;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_START,
		      (int)p, (int)uap->aiocbp, 0, 0, 0 );

	error = aio_queue_async_request( p, uap->aiocbp, AIO_READ );
	/* ... */

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_END,
		      (int)p, (int)uap->aiocbp, error, 0, 0 );

	return( error );
}
/*
 * aio_return - return the return status associated with the async IO
 * request referred to by uap->aiocbp.  The return status is the value
 * that would be returned by the corresponding IO request (read, write,
 * fdatasync, or sync).  This is where we release kernel resources
 * held for the async IO call associated with the given aiocb pointer.
 */
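/*
 * Editorial note (illustrative, not from this file): aio_return() is the
 * reaping step; for a completed aiocb it both reports the byte count and
 * frees the kernel entry, so it is valid exactly once per request.
 * Typical user-space ordering:
 *
 *	if (aio_error(&cb) != EINPROGRESS) {
 *		ssize_t n = aio_return(&cb);	// second call for &cb is an error
 *	}
 */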
int
aio_return(proc_t p, struct aio_return_args *uap, user_ssize_t *retval )
{
	aio_workq_entry	*entryp;
	int		error;
	boolean_t	proc_lock_held = FALSE;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_START,
		      (int)p, (int)uap->aiocbp, 0, 0, 0 );

	/* See if there are any entries to check */
	if (aio_get_all_queues_count() < 1) {
		/* ... */
	}

	aio_proc_lock(p);
	proc_lock_held = TRUE;

	/* look for a match on our queue of async IO requests that have completed */
	TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
		ASSERT_AIO_FROM_PROC(entryp, p);
		if ( entryp->uaiocbp == uap->aiocbp ) {
			/* Done and valid for aio_return(), pull it off the list */
			aio_proc_remove_done_locked(p, entryp);

			/* Drop the proc lock, but keep the entry locked */
			aio_entry_lock(entryp);
			aio_proc_unlock(p);
			proc_lock_held = FALSE;

			*retval = entryp->returnval;
			error = 0;

			/* No references and off all lists, safe to free */
			if (entryp->aio_refcount == 0) {
				aio_entry_unlock(entryp);
				aio_free_request(entryp);
			} else {
				/* Whoever has the refcount will have to free it */
				entryp->flags |= AIO_DO_FREE;
				aio_entry_unlock(entryp);
			}

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) | DBG_FUNC_NONE,
				      (int)p, (int)uap->aiocbp, *retval, 0, 0 );
			goto ExitRoutine;
		}
	}

	/* look for a match on our queue of active async IO requests */
	TAILQ_FOREACH( entryp, &p->p_aio_activeq, aio_proc_link) {
		ASSERT_AIO_FROM_PROC(entryp, p);
		if ( entryp->uaiocbp == uap->aiocbp ) {
			error = EINPROGRESS;
			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) | DBG_FUNC_NONE,
				      (int)p, (int)uap->aiocbp, *retval, 0, 0 );
			goto ExitRoutine;
		}
	}

	/* ... */

ExitRoutine:
	if (proc_lock_held)
		aio_proc_unlock(p);

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_END,
		      (int)p, (int)uap->aiocbp, error, 0, 0 );

	return( error );
}
/*
 * _aio_exec - internal function used to clean up async IO requests for
 * a process that is going away due to exec().  We cancel any async IOs
 * we can and wait for those already active.  We also disable signaling
 * for cancelled or active aio requests that complete.
 * This routine MAY block!
 */
__private_extern__ void
_aio_exec(proc_t p )
{
	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_START,
		      (int)p, 0, 0, 0, 0 );

	/* ... */

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_END,
		      (int)p, 0, 0, 0, 0 );

	return;
}
/*
 * _aio_exit - internal function used to clean up async IO requests for
 * a process that is terminating (via exit() or exec()).  We cancel any async IOs
 * we can and wait for those already active.  We also disable signaling
 * for cancelled or active aio requests that complete.  This routine MAY block!
 */
__private_extern__ void
_aio_exit(proc_t p )
{
	int			error;
	aio_workq_entry		*entryp;

	/* quick check to see if there are any async IO requests queued up */
	if (aio_get_all_queues_count() < 1) {
		return;
	}

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_START,
		      (int)p, 0, 0, 0, 0 );

	aio_proc_lock(p);

	/*
	 * cancel async IO requests on the todo work queue and wait for those
	 * already active to complete.
	 */
	error = do_aio_cancel_locked( p, 0, 0, AIO_EXIT_WAIT, TRUE );
	ASSERT_AIO_PROC_LOCK_OWNED(p);
	if ( error == AIO_NOTCANCELED ) {
		/*
		 * AIO_NOTCANCELED is returned when we find an aio request for this process
		 * on the active async IO queue.  Active requests cannot be cancelled so we
		 * must wait for them to complete.  We will get a special wake up call on
		 * our channel used to sleep for ALL active requests to complete.  This sleep
		 * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
		 * active aio requests.
		 */

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) | DBG_FUNC_NONE,
			      (int)p, 0, 0, 0, 0 );

		while (p->p_aio_active_count != 0) {
			msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_exit", 0 );
		}
	}

	if (p->p_aio_active_count != 0) {
		panic("Exiting process has %d active AIOs after cancellation has completed.\n", p->p_aio_active_count);
	}

	/* release all aio resources used by this process */
	entryp = TAILQ_FIRST( &p->p_aio_doneq );
	while ( entryp != NULL ) {
		ASSERT_AIO_FROM_PROC(entryp, p);
		aio_workq_entry		*next_entryp;

		next_entryp = TAILQ_NEXT( entryp, aio_proc_link);
		aio_proc_remove_done_locked(p, entryp);

		/* we cannot free requests that are still completing */
		aio_entry_lock_spin(entryp);
		if (entryp->aio_refcount == 0) {
			aio_proc_unlock(p);
			aio_entry_unlock(entryp);
			aio_free_request(entryp);

			/* need to start over since aio_doneq may have been */
			/* changed while we were away.  */
			aio_proc_lock(p);
			entryp = TAILQ_FIRST( &p->p_aio_doneq );
			continue;
		} else {
			/* whoever has the reference will have to do the free */
			entryp->flags |= AIO_DO_FREE;
		}

		aio_entry_unlock(entryp);
		entryp = next_entryp;
	}

	aio_proc_unlock(p);

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END,
		      (int)p, 0, 0, 0, 0 );

	return;
}
static boolean_t
should_cancel(aio_workq_entry *entryp, user_addr_t aiocbp, int fd)
{
	if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
			(aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
			(aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
		return TRUE;
	}

	return FALSE;
}
/*
 * do_aio_cancel_locked - cancel async IO requests (if possible).  We get called by
 * aio_cancel, close, and at exit.
 * There are three modes of operation: 1) cancel all async IOs for a process -
 * fd is 0 and aiocbp is NULL 2) cancel all async IOs for a file descriptor - fd
 * is > 0 and aiocbp is NULL 3) cancel one async IO associated with the given
 * aiocbp.
 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
 * target async IO requests, and AIO_ALLDONE if all target async IO requests
 * were already complete.
 * WARNING - do not dereference aiocbp in this routine, it may point to user
 * land data that has not been copied in (when called from aio_cancel())
 *
 * Called with proc locked, and returns the same way.
 */
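/*
 * Quick reference for the three modes described above (editorial summary of
 * how the callers in this file invoke it):
 *
 *	caller			fd	aiocbp		meaning
 *	_aio_exit/_aio_exec	0	NULL		cancel everything for the proc
 *	_aio_close		fd	NULL		cancel everything on that fd
 *	aio_cancel		fd	user aiocbp	cancel one specific request
 */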
static int
do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp,
	int wait_for_completion, boolean_t disable_notification )
{
	ASSERT_AIO_PROC_LOCK_OWNED(p);

	aio_workq_entry	*entryp;
	int		result;

	result = -1;

	/* look for a match on our queue of async todo work. */
	entryp = TAILQ_FIRST(&p->p_aio_activeq);
	while ( entryp != NULL ) {
		ASSERT_AIO_FROM_PROC(entryp, p);
		aio_workq_entry	*next_entryp;

		next_entryp = TAILQ_NEXT( entryp, aio_proc_link);
		if (!should_cancel(entryp, aiocbp, fd)) {
			entryp = next_entryp;
			continue;
		}

		/* Can only be cancelled if it's still on a work queue */
		if (aio_entry_try_workq_remove(entryp) != 0) {
			/* Have removed from workq. Update entry state and take a ref */
			aio_entry_update_for_cancel(entryp, TRUE, 0, disable_notification);

			/* Put on the proc done queue and update counts, then unlock the proc */
			aio_proc_move_done_locked(p, entryp);
			aio_proc_unlock(p);

			/* Now it's officially cancelled.  Do the completion */
			result = AIO_CANCELED;
			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) | DBG_FUNC_NONE,
				      (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
			do_aio_completion(entryp);

			/* This will free if the aio_return() has already happened ... */
			aio_entry_unref(entryp);
			aio_proc_lock(p);

			if ( aiocbp != USER_ADDR_NULL ) {
				return( result );
			}

			/*
			 * Restart from the head of the proc active queue since it
			 * may have been changed while we were away doing completion
			 * processing.
			 *
			 * Note that if we found an uncancellable AIO before, we will
			 * either find it again or discover that it's been completed,
			 * so resetting the result will not cause us to return success
			 * despite outstanding AIOs.
			 */
			entryp = TAILQ_FIRST(&p->p_aio_activeq);
			result = -1; /* As if beginning anew */
		} else {
			/*
			 * It's been taken off the active queue already, i.e. is in flight.
			 * All we can do is ask for notification.
			 */
			result = AIO_NOTCANCELED;

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE,
				      (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

			/* Mark for waiting and such; will not take a ref if "cancelled" arg is FALSE */
			aio_entry_update_for_cancel(entryp, FALSE, wait_for_completion, disable_notification);

			if ( aiocbp != USER_ADDR_NULL ) {
				return( result );
			}
			entryp = next_entryp;
		}
	}

	/*
	 * if we didn't find any matches on the todo or active queues then look for a
	 * match on our queue of async IO requests that have completed and if found
	 * return AIO_ALLDONE result.
	 *
	 * Proc AIO lock is still held.
	 */
	if ( result == -1 ) {
		TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
			ASSERT_AIO_FROM_PROC(entryp, p);
			if (should_cancel(entryp, aiocbp, fd)) {
				result = AIO_ALLDONE;
				KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE,
					      (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

				if ( aiocbp != USER_ADDR_NULL ) {
					return( result );
				}
			}
		}
	}

	return( result );

} /* do_aio_cancel_locked */
/*
 * aio_suspend - suspend the calling thread until at least one of the async
 * IO operations referenced by uap->aiocblist has completed, until a signal
 * interrupts the function, or uap->timeoutp time interval (optional) has
 * elapsed.
 * Returns 0 if one or more async IOs have completed else -1 and errno is
 * set appropriately - EAGAIN if timeout elapses or EINTR if an interrupt
 * occurs.
 */
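/*
 * Editorial user-space sketch (not from this file): the typical caller
 * builds a list of outstanding aiocb pointers and blocks with an optional
 * timeout:
 *
 *	const struct aiocb *list[2] = { &cb_a, &cb_b };
 *	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 *	if (aio_suspend(list, 2, &ts) == 0) {
 *		// at least one of cb_a / cb_b is no longer EINPROGRESS
 *	}
 */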
int
aio_suspend(proc_t p, struct aio_suspend_args *uap, int *retval )
{
	__pthread_testcancel(1);
	return(aio_suspend_nocancel(p, (struct aio_suspend_nocancel_args *)uap, retval));
}


int
aio_suspend_nocancel(proc_t p, struct aio_suspend_nocancel_args *uap, int *retval )
{
	int			error;
	int			i, count;
	uint64_t		abstime;
	struct user_timespec	ts;
	aio_workq_entry		*entryp;
	user_addr_t		*aiocbpp;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START,
		      (int)p, uap->nent, 0, 0, 0 );

	*retval = -1;
	abstime = 0;
	aiocbpp = NULL;

	count = aio_get_all_queues_count( );
	if ( count < 1 ) {
		error = EINVAL;
		goto ExitThisRoutine;
	}

	if ( uap->nent < 1 || uap->nent > aio_max_requests_per_process ) {
		error = EINVAL;
		goto ExitThisRoutine;
	}

	if ( uap->timeoutp != USER_ADDR_NULL ) {
		if ( proc_is64bit(p) ) {
			struct user64_timespec temp;
			error = copyin( uap->timeoutp, &temp, sizeof(temp) );
			if ( error == 0 ) {
				ts.tv_sec = temp.tv_sec;
				ts.tv_nsec = temp.tv_nsec;
			}
		}
		else {
			struct user32_timespec temp;
			error = copyin( uap->timeoutp, &temp, sizeof(temp) );
			if ( error == 0 ) {
				ts.tv_sec = temp.tv_sec;
				ts.tv_nsec = temp.tv_nsec;
			}
		}
		if ( error != 0 ) {
			error = EAGAIN;
			goto ExitThisRoutine;
		}

		if ( ts.tv_sec < 0 || ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000 ) {
			error = EINVAL;
			goto ExitThisRoutine;
		}

		nanoseconds_to_absolutetime( (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
					     &abstime );
		clock_absolutetime_interval_to_deadline( abstime, &abstime );
	}

	aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent);
	if ( aiocbpp == NULL ) {
		error = EAGAIN;
		goto ExitThisRoutine;
	}

	/* check list of aio requests to see if any have completed */
check_for_our_aiocbp:
	aio_proc_lock_spin(p);
	for ( i = 0; i < uap->nent; i++ ) {
		user_addr_t	aiocbp;

		/* NULL elements are legal so check for 'em */
		aiocbp = *(aiocbpp + i);
		if ( aiocbp == USER_ADDR_NULL )
			continue;

		/* return immediately if any aio request in the list is done */
		TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
			ASSERT_AIO_FROM_PROC(entryp, p);
			if ( entryp->uaiocbp == aiocbp ) {
				aio_proc_unlock(p);
				*retval = 0;
				error = 0;
				goto ExitThisRoutine;
			}
		}
	} /* for ( ; i < uap->nent; ) */

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE,
		      (int)p, uap->nent, 0, 0, 0 );

	/*
	 * wait for an async IO to complete or a signal fires or timeout expires.
	 * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
	 * interrupts us.  If an async IO completes before a signal fires or our
	 * timeout expires, we get a wakeup call from aio_work_thread().
	 */

	error = msleep1(&p->AIO_SUSPEND_SLEEP_CHAN, aio_proc_mutex(p), PCATCH | PWAIT | PDROP, "aio_suspend", abstime); /* XXX better priority? */
	if ( error == 0 ) {
		/*
		 * got our wakeup call from aio_work_thread().
		 * Since we can get a wakeup on this channel from another thread in the
		 * same process we head back up to make sure this is for the correct aiocbp.
		 * If it is the correct aiocbp we will return from where we do the check
		 * (see entryp->uaiocbp == aiocbp after check_for_our_aiocbp label)
		 * else we will fall out and just sleep again.
		 */
		goto check_for_our_aiocbp;
	}
	else if ( error == EWOULDBLOCK ) {
		/* our timeout expired */
		error = EAGAIN;
	}
	else {
		/* we were interrupted */
		error = EINTR;
	}

ExitThisRoutine:
	if ( aiocbpp != NULL )
		FREE( aiocbpp, M_TEMP );

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_END,
		      (int)p, uap->nent, error, 0, 0 );

	return( error );
}
/*
 * aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
 * file descriptor (uap->aiocbp->aio_fildes) from the buffer
 * (uap->aiocbp->aio_buf).
 */
int
aio_write(proc_t p, struct aio_write_args *uap, int *retval )
{
	int		error;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_START,
		      (int)p, (int)uap->aiocbp, 0, 0, 0 );

	error = aio_queue_async_request( p, uap->aiocbp, AIO_WRITE );
	/* ... */

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_END,
		      (int)p, (int)uap->aiocbp, error, 0, 0 );

	return( error );
}
static user_addr_t *
aio_copy_in_list(proc_t procp, user_addr_t aiocblist, int nent)
{
	user_addr_t	*aiocbpp;
	int		i, result;

	/* we reserve enough space for largest possible pointer size */
	MALLOC( aiocbpp, user_addr_t *, (nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK );
	if ( aiocbpp == NULL )
		goto err;

	/* copyin our aiocb pointers from list */
	result = copyin( aiocblist, aiocbpp,
			proc_is64bit(procp) ? (nent * sizeof(user64_addr_t))
					    : (nent * sizeof(user32_addr_t)) );
	if ( result ) {
		FREE( aiocbpp, M_TEMP );
		aiocbpp = NULL;
		goto err;
	}

	/*
	 * We depend on a list of user_addr_t's so we need to
	 * munge and expand when these pointers came from a
	 * 32-bit process.
	 */
	if ( !proc_is64bit(procp) ) {
		/* copy from last to first to deal with overlap */
		user32_addr_t	*my_ptrp = ((user32_addr_t *)aiocbpp) + (nent - 1);
		user_addr_t	*my_addrp = aiocbpp + (nent - 1);

		for (i = 0; i < nent; i++, my_ptrp--, my_addrp--) {
			*my_addrp = (user_addr_t) (*my_ptrp);
		}
	}

err:
	return (aiocbpp);
}
static int
aio_copy_in_sigev(proc_t procp, user_addr_t sigp, struct user_sigevent *sigev)
{
	int	result = 0;

	if (sigp == USER_ADDR_NULL)
		goto out;

	/*
	 * We need to munge aio_sigevent since it contains pointers.
	 * Since we do not know if sigev_value is an int or a ptr we do
	 * NOT cast the ptr to a user_addr_t.   This means if we send
	 * this info back to user space we need to remember sigev_value
	 * was not expanded for the 32-bit case.
	 *
	 * Notes:	This does NOT affect us since we don't support
	 *		sigev_value yet in the aio context.
	 */
	if ( proc_is64bit(procp) ) {
		struct user64_sigevent sigevent64;

		result = copyin( sigp, &sigevent64, sizeof(sigevent64) );
		if ( result == 0 ) {
			sigev->sigev_notify = sigevent64.sigev_notify;
			sigev->sigev_signo = sigevent64.sigev_signo;
			sigev->sigev_value.size_equivalent.sival_int = sigevent64.sigev_value.size_equivalent.sival_int;
			sigev->sigev_notify_function = sigevent64.sigev_notify_function;
			sigev->sigev_notify_attributes = sigevent64.sigev_notify_attributes;
		}
	} else {
		struct user32_sigevent sigevent32;

		result = copyin( sigp, &sigevent32, sizeof(sigevent32) );
		if ( result == 0 ) {
			sigev->sigev_notify = sigevent32.sigev_notify;
			sigev->sigev_signo = sigevent32.sigev_signo;
			sigev->sigev_value.size_equivalent.sival_int = sigevent32.sigev_value.sival_int;
			sigev->sigev_notify_function = CAST_USER_ADDR_T(sigevent32.sigev_notify_function);
			sigev->sigev_notify_attributes = CAST_USER_ADDR_T(sigevent32.sigev_notify_attributes);
		}
	}

	if ( result != 0 ) {
		result = EAGAIN;
	}

out:
	return (result);
}
/*
 * Queue up the entry on the aio asynchronous work queue in priority order
 * based on the relative priority of the request.  We calculate the relative
 * priority using the nice value of the caller and the value of the request's
 * aio_reqprio.
 *
 * Parameters:	procp			Process queueing the I/O
 *		entryp			The work queue entry being queued
 *
 * Returns:	(void)			No failure modes
 *
 * Notes:	This function is used for both lio_listio and aio
 *
 * XXX:		At some point, we may have to consider thread priority
 *		rather than process priority, but we don't maintain the
 *		adjusted priority for threads the POSIX way.
 *
 * Called with proc locked.
 */
static void
aio_enqueue_work( proc_t procp, aio_workq_entry *entryp, int proc_locked)
{
	aio_workq_entry	*my_entryp;	/* used for insertion sort */
	aio_workq_t queue = aio_entry_workq(entryp);

	if (proc_locked == 0) {
		aio_proc_lock(procp);
	}

	ASSERT_AIO_PROC_LOCK_OWNED(procp);

	/* Onto proc queue */
	TAILQ_INSERT_TAIL(&procp->p_aio_activeq, entryp, aio_proc_link);
	procp->p_aio_active_count++;
	procp->p_aio_total_count++;

	/* And work queue */
	aio_workq_lock_spin(queue);
	aio_workq_add_entry_locked(queue, entryp);
	wait_queue_wakeup_one(queue->aioq_waitq, queue, THREAD_AWAKENED, -1);
	aio_workq_unlock(queue);

	if (proc_locked == 0) {
		aio_proc_unlock(procp);
	}
	/*
	 * (1)	The nice value is in the range PRIO_MIN..PRIO_MAX [-20..20]
	 * (2)	The normalized nice value is in the range 0..((2 * NZERO) - 1)
	 *	which is [0..39], with 0 not being used.  In nice values, the
	 *	lower the nice value, the higher the priority.
	 * (3)	The normalized scheduling priority is the highest nice value
	 *	minus the current nice value.  In I/O scheduling priority, the
	 *	higher the value the lower the priority, so it is the inverse
	 *	of the nice value (the higher the number, the higher the I/O
	 *	priority).
	 * (4)	From the normalized scheduling priority, we subtract the
	 *	request priority to get the request priority value number;
	 *	this means that requests are only capable of depressing their
	 *	priority relative to other requests.
	 */
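	/*
	 * Editorial worked example of the formula below (illustrative numbers
	 * only; NZERO is 20 on this platform):
	 *
	 *	if procp->p_nice == 20:   priority = ((2 * 20) - 1) - 20 = 19
	 *	with aio_reqprio == 5:    priority = 19 - 5 = 14
	 *	with aio_reqprio == 25:   19 - 25 < 0, so it is clamped to 0
	 */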
	entryp->priority = (((2 * NZERO) - 1) - procp->p_nice);

	/* only permit depressing the priority */
	if (entryp->aiocb.aio_reqprio < 0)
		entryp->aiocb.aio_reqprio = 0;
	if (entryp->aiocb.aio_reqprio > 0) {
		entryp->priority -= entryp->aiocb.aio_reqprio;
		if (entryp->priority < 0)
			entryp->priority = 0;
	}

	/* Insertion sort the entry; lowest ->priority to highest */
	TAILQ_FOREACH(my_entryp, &aio_anchor.aio_async_workq, aio_workq_link) {
		if ( entryp->priority <= my_entryp->priority) {
			TAILQ_INSERT_BEFORE(my_entryp, entryp, aio_workq_link);
			break;
		}
	}
	if (my_entryp == NULL)
		TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
}
/*
 * lio_listio - initiate a list of IO requests.  We process the list of
 * aiocbs either synchronously (mode == LIO_WAIT) or asynchronously
 * (mode == LIO_NOWAIT).
 *
 * The caller gets error and return status for each aiocb in the list
 * via aio_error and aio_return.  We must keep completed requests until
 * released by the aio_return call.
 */
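/*
 * Editorial user-space sketch (not from this file): a batch of two reads
 * submitted and waited on synchronously:
 *
 *	struct aiocb *batch[2] = { &cb_a, &cb_b };	// each with aio_lio_opcode = LIO_READ
 *	if (lio_listio(LIO_WAIT, batch, 2, NULL) == 0) {
 *		ssize_t n_a = aio_return(&cb_a);	// collect each result
 *		ssize_t n_b = aio_return(&cb_b);
 *	}
 */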
int
lio_listio(proc_t p, struct lio_listio_args *uap, int *retval )
{
	int			i;
	int			call_result;
	int			result;
	int			old_count;
	aio_workq_entry		**entryp_listp;
	user_addr_t		*aiocbpp;
	struct user_sigevent	aiosigev;
	aio_lio_context		*lio_context;
	boolean_t		free_context = FALSE;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START,
		      (int)p, uap->nent, uap->mode, 0, 0 );

	entryp_listp = NULL;
	lio_context = NULL;
	aiocbpp = NULL;
	call_result = -1;
	*retval = -1;
	if ( !(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT) ) {
		call_result = EINVAL;
		goto ExitRoutine;
	}

	if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
		call_result = EINVAL;
		goto ExitRoutine;
	}

	/*
	 * allocate a list of aio_workq_entry pointers that we will use
	 * to queue up all our requests at once while holding our lock.
	 */
	MALLOC( entryp_listp, void *, (uap->nent * sizeof(aio_workq_entry *)), M_TEMP, M_WAITOK );
	if ( entryp_listp == NULL ) {
		call_result = EAGAIN;
		goto ExitRoutine;
	}

	MALLOC( lio_context, aio_lio_context*, sizeof(aio_lio_context), M_TEMP, M_WAITOK );
	if ( lio_context == NULL ) {
		call_result = EAGAIN;
		goto ExitRoutine;
	}

	OSIncrementAtomic(&lio_contexts_alloced);

	bzero(lio_context, sizeof(aio_lio_context));

	aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent);
	if ( aiocbpp == NULL ) {
		call_result = EAGAIN;
		goto ExitRoutine;
	}

	/*
	 * Use sigevent passed in to lio_listio for each of our calls, but
	 * only do completion notification after the last request completes.
	 */
	bzero(&aiosigev, sizeof(aiosigev));
	/* Only copy in an sigev if the user supplied one */
	if (uap->sigp != USER_ADDR_NULL) {
		call_result = aio_copy_in_sigev(p, uap->sigp, &aiosigev);
		if ( call_result )
			goto ExitRoutine;
	}

	/* process list of aio requests */
	lio_context->io_issued = uap->nent;
	lio_context->io_waiter = uap->mode == LIO_WAIT ? 1 : 0; /* Should it be freed by last AIO */
	for ( i = 0; i < uap->nent; i++ ) {
		user_addr_t	my_aiocbp;
		aio_workq_entry	*entryp;

		*(entryp_listp + i) = NULL;
		my_aiocbp = *(aiocbpp + i);

		/* NULL elements are legal so check for 'em */
		if ( my_aiocbp == USER_ADDR_NULL ) {
			aio_proc_lock_spin(p);
			lio_context->io_issued--;
			aio_proc_unlock(p);
			continue;
		}

		/*
		 * We use lio_context to mark IO requests for delayed completion
		 * processing which means we wait until all IO requests in the
		 * group have completed before we either return to the caller
		 * when mode is LIO_WAIT or signal user when mode is LIO_NOWAIT.
		 *
		 * We use the address of the lio_context for this, since it is
		 * unique in the address space.
		 */
		result = lio_create_entry( p, my_aiocbp, lio_context, (entryp_listp + i) );
		if ( result != 0 && call_result == -1 )
			call_result = result;

		/* NULL elements are legal so check for 'em */
		entryp = *(entryp_listp + i);
		if ( entryp == NULL ) {
			aio_proc_lock_spin(p);
			lio_context->io_issued--;
			aio_proc_unlock(p);
			continue;
		}

		if ( uap->mode == LIO_NOWAIT ) {
			/* Set signal hander, if any */
			entryp->aiocb.aio_sigevent = aiosigev;
		} else {
			/* flag that this thread blocks pending completion */
			entryp->flags |= AIO_LIO_NOTIFY;
		}

		/* check our aio limits to throttle bad or rude user land behavior */
		old_count = aio_increment_total_count();

		aio_proc_lock_spin(p);
		if ( old_count >= aio_max_requests ||
			 aio_get_process_count( entryp->procp ) >= aio_max_requests_per_process ||
			 is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {

			lio_context->io_issued--;
			aio_proc_unlock(p);

			aio_decrement_total_count();

			if ( call_result == -1 )
				call_result = EAGAIN;
			aio_free_request(entryp);
			entryp_listp[i] = NULL;
			continue;
		}

		lck_mtx_convert_spin(aio_proc_mutex(p));
		aio_enqueue_work(p, entryp, 1);
		aio_proc_unlock(p);

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
			      (int)p, (int)entryp->uaiocbp, 0, 0, 0 );
	}

	switch(uap->mode) {
	case LIO_WAIT:
		aio_proc_lock_spin(p);
		while (lio_context->io_completed < lio_context->io_issued) {
			result = msleep(lio_context, aio_proc_mutex(p), PCATCH | PRIBIO | PSPIN, "lio_listio", 0);

			/* If we were interrupted, fail out (even if all finished) */
			if (result != 0) {
				call_result = EINTR;
				lio_context->io_waiter = 0;
				break;
			}
		}

		/* If all IOs have finished must free it */
		if (lio_context->io_completed == lio_context->io_issued) {
			free_context = TRUE;
		}

		aio_proc_unlock(p);
		break;

	case LIO_NOWAIT:
		break;
	}

	/* call_result == -1 means we had no trouble queueing up requests */
	if ( call_result == -1 ) {
		call_result = 0;
		*retval = 0;
	}

ExitRoutine:
	if ( entryp_listp != NULL )
		FREE( entryp_listp, M_TEMP );
	if ( aiocbpp != NULL )
		FREE( aiocbpp, M_TEMP );
	if ((lio_context != NULL) && ((lio_context->io_issued == 0) || (free_context == TRUE))) {
		free_lio_context(lio_context);
	}

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END,
		      (int)p, call_result, 0, 0, 0 );

	return( call_result );

} /* lio_listio */
/*
 * aio worker thread.  this is where all the real work gets done.
 * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
 * after new work is queued up.
 */
static void
aio_work_thread( void )
{
	aio_workq_entry		*entryp;
	int			error;
	vm_map_t		currentmap;
	vm_map_t		oldmap = VM_MAP_NULL;
	task_t			oldaiotask = TASK_NULL;
	struct uthread		*uthreadp = NULL;

	for ( ;; ) {
		/*
		 * returns with the entry ref'ed.
		 * sleeps until work is available.
		 */
		entryp = aio_get_some_work();

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START,
			      (int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0 );

		/*
		 * Assume the target's address space identity for the duration
		 * of the IO.  Note: don't need to have the entryp locked,
		 * because the proc and map don't change until it's freed.
		 */
		currentmap = get_task_map( (current_proc())->task );
		if ( currentmap != entryp->aio_map ) {
			uthreadp = (struct uthread *) get_bsdthread_info(current_thread());
			oldaiotask = uthreadp->uu_aio_task;
			uthreadp->uu_aio_task = entryp->procp->task;
			oldmap = vm_map_switch( entryp->aio_map );
		}

		if ( (entryp->flags & AIO_READ) != 0 ) {
			error = do_aio_read( entryp );
		}
		else if ( (entryp->flags & AIO_WRITE) != 0 ) {
			error = do_aio_write( entryp );
		}
		else if ( (entryp->flags & (AIO_FSYNC | AIO_DSYNC)) != 0 ) {
			error = do_aio_fsync( entryp );
		}
		else {
			printf( "%s - unknown aio request - flags 0x%02X \n",
				__FUNCTION__, entryp->flags );
			error = EINVAL;
		}

		/* Restore old map */
		if ( currentmap != entryp->aio_map ) {
			(void) vm_map_switch( oldmap );
			uthreadp->uu_aio_task = oldaiotask;
		}

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END,
			      (int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval,
			      entryp->returnval, 0 );

		aio_entry_lock_spin(entryp);
		entryp->errorval = error;
		aio_entry_unlock(entryp);

		/* we're done with the IO request so pop it off the active queue and */
		/* push it on the done queue */
		aio_proc_lock(entryp->procp);
		aio_proc_move_done_locked(entryp->procp, entryp);
		aio_proc_unlock(entryp->procp);

		OSDecrementAtomic(&aio_anchor.aio_inflight_count);

		/* remove our reference to the user land map. */
		if ( VM_MAP_NULL != entryp->aio_map ) {
			vm_map_t	my_map;

			my_map = entryp->aio_map;
			entryp->aio_map = VM_MAP_NULL;
			vm_map_deallocate( my_map );
		}

		/* Provide notifications */
		do_aio_completion( entryp );

		/* Will free if needed */
		aio_entry_unref(entryp);
	}

} /* aio_work_thread */
/*
 * aio_get_some_work - get the next async IO request that is ready to be executed.
 * aio_fsync complicates matters a bit since we cannot do the fsync until all async
 * IO requests at the time the aio_fsync call came in have completed.
 * NOTE - AIO_LOCK must be held by caller
 */
static aio_workq_entry *
aio_get_some_work( void )
{
	aio_workq_entry	*entryp = NULL;
	aio_workq_t	queue = NULL;

	/* Just one queue for the moment.  In the future there will be many. */
	queue = &aio_anchor.aio_async_workqs[0];
	aio_workq_lock_spin(queue);
	if (queue->aioq_count == 0) {
		goto nowork;
	}

	/*
	 * Hold the queue lock.
	 *
	 * pop some work off the work queue and add to our active queue
	 * Always start with the queue lock held.
	 */
	for ( ;; ) {
		/*
		 * Pull of of work queue.  Once it's off, it can't be cancelled,
		 * so we can take our ref once we drop the queue lock.
		 */
		entryp = TAILQ_FIRST(&queue->aioq_entries);

		/*
		 * If there's no work or only fsyncs that need delay, go to sleep
		 * and then start anew from aio_work_thread
		 */
		if (entryp == NULL) {
			goto nowork;
		}

		aio_workq_remove_entry_locked(queue, entryp);

		aio_workq_unlock(queue);

		/*
		 * Check if it's an fsync that must be delayed.  No need to lock the entry;
		 * that flag would have been set at initialization.
		 */
		if ( (entryp->flags & AIO_FSYNC) != 0 ) {
			/*
			 * Check for unfinished operations on the same file
			 * in this proc's queue.
			 */
			aio_proc_lock_spin(entryp->procp);
			if ( aio_delay_fsync_request( entryp ) ) {
				/* It needs to be delayed.  Put it back on the end of the work queue */
				KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) | DBG_FUNC_NONE,
					      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

				aio_proc_unlock(entryp->procp);

				aio_workq_lock_spin(queue);
				aio_workq_add_entry_locked(queue, entryp);
				continue;
			}
			aio_proc_unlock(entryp->procp);
		}

		break;
	}

	aio_entry_ref(entryp);

	OSIncrementAtomic(&aio_anchor.aio_inflight_count);
	return( entryp );

nowork:
	/* We will wake up when someone enqueues something */
	wait_queue_assert_wait(queue->aioq_waitq, queue, THREAD_UNINT, 0);
	aio_workq_unlock(queue);
	thread_block( (thread_continue_t)aio_work_thread );

	return NULL;
}


/*
 * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed.
 * A big, simple hammer: only send it off if it's the most recently filed IO which has
 * not been completed.
 */
static boolean_t
aio_delay_fsync_request( aio_workq_entry *entryp )
{
	if (entryp == TAILQ_FIRST(&entryp->procp->p_aio_activeq)) {
		return FALSE;
	}

	return TRUE;
} /* aio_delay_fsync_request */
static aio_workq_entry *
aio_create_queue_entry(proc_t procp, user_addr_t aiocbp, void *group_tag, int kindOfIO)
{
	aio_workq_entry	*entryp;
	int		result = 0;

	entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
	if ( entryp == NULL ) {
		result = EAGAIN;
		goto error_exit;
	}

	bzero( entryp, sizeof(*entryp) );

	/* fill in the rest of the aio_workq_entry */
	entryp->procp = procp;
	entryp->uaiocbp = aiocbp;
	entryp->flags |= kindOfIO;
	entryp->group_tag = group_tag;
	entryp->aio_map = VM_MAP_NULL;
	entryp->aio_refcount = 0;

	if ( proc_is64bit(procp) ) {
		struct user64_aiocb aiocb64;

		result = copyin( aiocbp, &aiocb64, sizeof(aiocb64) );
		if ( result == 0 )
			do_munge_aiocb_user64_to_user(&aiocb64, &entryp->aiocb);
	} else {
		struct user32_aiocb aiocb32;

		result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
		if ( result == 0 )
			do_munge_aiocb_user32_to_user( &aiocb32, &entryp->aiocb );
	}

	if ( result != 0 ) {
		result = EAGAIN;
		goto error_exit;
	}

	/* get a reference to the user land map in order to keep it around */
	entryp->aio_map = get_task_map( procp->task );
	vm_map_reference( entryp->aio_map );

	/* do some more validation on the aiocb and embedded file descriptor */
	result = aio_validate( entryp );
	if ( result != 0 )
		goto error_exit_with_ref;

	/* get a reference on the current_thread, which is passed in vfs_context. */
	entryp->thread = current_thread();
	thread_reference( entryp->thread );
	return ( entryp );

error_exit_with_ref:
	if ( VM_MAP_NULL != entryp->aio_map ) {
		vm_map_deallocate( entryp->aio_map );
	}
error_exit:
	if ( result && entryp != NULL ) {
		zfree( aio_workq_zonep, entryp );
		entryp = NULL;
	}

	return ( entryp );
}
/*
 * aio_queue_async_request - queue up an async IO request on our work queue then
 * wake up one of our worker threads to do the actual work.  We get a reference
 * to our caller's user land map in order to keep it around while we are
 * processing the request.
 */
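/*
 * Editorial summary of the steps below (names are the ones used in this file):
 * bump the global count, build the entry with aio_create_queue_entry(), reject
 * duplicates and per-process overruns under the proc lock, then hand the entry
 * to aio_enqueue_work(), which also wakes a worker through the work queue's
 * wait queue.  On any failure the entry is freed and the global count dropped.
 */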
static int
aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO )
{
	aio_workq_entry	*entryp;
	int		result;
	int		old_count;

	old_count = aio_increment_total_count();
	if (old_count >= aio_max_requests) {
		result = EAGAIN;
		goto error_noalloc;
	}

	entryp = aio_create_queue_entry( procp, aiocbp, 0, kindOfIO);
	if ( entryp == NULL ) {
		result = EAGAIN;
		goto error_noalloc;
	}

	aio_proc_lock_spin(procp);

	if ( is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
		result = EAGAIN;
		goto error_exit;
	}

	/* check our aio limits to throttle bad or rude user land behavior */
	if (aio_get_process_count( procp ) >= aio_max_requests_per_process) {
		printf("aio_queue_async_request(): too many in flight for proc: %d.\n", procp->p_aio_total_count);
		result = EAGAIN;
		goto error_exit;
	}

	/* Add the IO to proc and work queues, wake up threads as appropriate */
	lck_mtx_convert_spin(aio_proc_mutex(procp));
	aio_enqueue_work(procp, entryp, 1);

	aio_proc_unlock(procp);

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
		      (int)procp, (int)aiocbp, 0, 0, 0 );

	return( 0 );

error_exit:
	/*
	 * This entry has not been queued up so no worries about
	 * unlocked state and aio_map
	 */
	aio_proc_unlock(procp);
	aio_free_request(entryp);

error_noalloc:
	aio_decrement_total_count();

	return( result );

} /* aio_queue_async_request */
/*
 * Allocate an aio_workq_entry and fill it in.  If all goes well return 0
 * and pass the aio_workq_entry pointer back to our caller.
 *
 * Parameters:	procp			The process making the request
 *		aiocbp			The aio context buffer pointer
 *		group_tag		The group tag used to indicate a
 *					group of operations has completed
 *		entrypp			Pointer to the pointer to receive the
 *					address of the created aio_workq_entry
 *
 * Returns:	0			Successfully created
 *		EAGAIN			Try again (usually resource shortage)
 *
 * Notes:	We get a reference to our caller's user land map in order
 *		to keep it around while we are processing the request.
 *
 *		lio_listio calls behave differently at completion: they do
 *		completion notification when all async IO requests have
 *		completed.  We use group_tag to tag IO requests that behave
 *		in the delay notification manner.
 *
 *		All synchronous operations are considered to not have a
 *		signal routine associated with them (sigp == USER_ADDR_NULL).
 */
static int
lio_create_entry(proc_t procp, user_addr_t aiocbp, void *group_tag,
		 aio_workq_entry **entrypp )
{
	aio_workq_entry	*entryp;
	int		result;

	entryp = aio_create_queue_entry( procp, aiocbp, group_tag, AIO_LIO);
	if ( entryp == NULL ) {
		result = EAGAIN;
		goto error_exit;
	}

	/*
	 * Look for lio_listio LIO_NOP requests and ignore them; this is
	 * not really an error, but we need to free our aio_workq_entry.
	 */
	if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
		result = 0;
		goto error_exit;
	}

	*entrypp = entryp;
	return( 0 );

error_exit:

	if ( entryp != NULL ) {
		/*
		 * This entry has not been queued up so no worries about
		 * unlocked state and aio_map
		 */
		aio_free_request(entryp);
	}

	return( result );

} /* lio_create_entry */
/*
 * aio_free_request - remove our reference on the user land map and
 * free the work queue entry resources.  The entry is off all lists
 * and has zero refcount, so no one can have a pointer to it.
 */
static int
aio_free_request(aio_workq_entry *entryp)
{
	/* remove our reference to the user land map. */
	if ( VM_MAP_NULL != entryp->aio_map) {
		vm_map_deallocate(entryp->aio_map);
	}

	/* remove our reference to thread which enqueued the request */
	if ( NULL != entryp->thread ) {
		thread_deallocate( entryp->thread );
	}

	entryp->aio_refcount = -1; /* A bit of poisoning in case of bad refcounting. */

	zfree( aio_workq_zonep, entryp );

	return( 0 );

} /* aio_free_request */
/*
 * aio_validate - validate the aiocb passed in by one of the aio syscalls.
 */
static int
aio_validate( aio_workq_entry *entryp )
{
	struct fileproc	*fp;
	int		flag;
	int		result;

	result = 0;

	if ( (entryp->flags & AIO_LIO) != 0 ) {
		if ( entryp->aiocb.aio_lio_opcode == LIO_READ )
			entryp->flags |= AIO_READ;
		else if ( entryp->aiocb.aio_lio_opcode == LIO_WRITE )
			entryp->flags |= AIO_WRITE;
		else if ( entryp->aiocb.aio_lio_opcode == LIO_NOP )
			return( 0 );
		else
			return( EINVAL );
	}

	flag = FREAD;
	if ( (entryp->flags & (AIO_WRITE | AIO_FSYNC | AIO_DSYNC)) != 0 ) {
		flag = FWRITE;
	}

	if ( (entryp->flags & (AIO_READ | AIO_WRITE)) != 0 ) {
		if ( entryp->aiocb.aio_nbytes > INT_MAX	||
		     entryp->aiocb.aio_buf == USER_ADDR_NULL ||
		     entryp->aiocb.aio_offset < 0 )
			return( EINVAL );
	}

	/*
	 * validate aiocb.aio_sigevent.  at this point we only support
	 * sigev_notify equal to SIGEV_SIGNAL or SIGEV_NONE.  this means
	 * sigev_value, sigev_notify_function, and sigev_notify_attributes
	 * are ignored, since SIGEV_THREAD is unsupported.  This is consistent
	 * with no [RTS] (Realtime Signal) option group support.
	 */
	switch ( entryp->aiocb.aio_sigevent.sigev_notify ) {
	case SIGEV_SIGNAL:
	    {
		int	signum;

		/* make sure we have a valid signal number */
		signum = entryp->aiocb.aio_sigevent.sigev_signo;
		if ( signum <= 0 || signum >= NSIG ||
		     signum == SIGKILL || signum == SIGSTOP )
			return( EINVAL );
	    }
	    break;

	case SIGEV_NONE:
		break;

	case SIGEV_THREAD:
		/* Unsupported [RTS] */

	default:
		return( EINVAL );
	}

	/* validate the file descriptor and that the file was opened
	 * for the appropriate read / write access.
	 */
	proc_fdlock(entryp->procp);

	result = fp_lookup( entryp->procp, entryp->aiocb.aio_fildes, &fp, 1);
	if ( result == 0 ) {
		if ( (fp->f_fglob->fg_flag & flag) == 0 ) {
			/* we don't have read or write access */
			result = EBADF;
		}
		else if ( FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_VNODE ) {
			/* this is not a file */
			result = ESPIPE;
		} else
			fp->f_flags |= FP_AIOISSUED;

		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 1);
	}
	else {
		result = EBADF;
	}

	proc_fdunlock(entryp->procp);

	return( result );

} /* aio_validate */
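
/*
 * Illustrative user-space sketch (not compiled here): an aiocb that
 * satisfies the checks above - nbytes no larger than INT_MAX, a non-NULL
 * buffer, a non-negative offset, and sigev_notify limited to SIGEV_SIGNAL
 * or SIGEV_NONE (SIGEV_THREAD is rejected).  The descriptor and signal
 * choice are hypothetical.
 */
#if 0	/* example only - user-space code, never built as part of this file */
#include <aio.h>
#include <errno.h>
#include <limits.h>
#include <signal.h>
#include <string.h>
#include <sys/types.h>

static int
queue_read(int fd, void *buf, size_t len, off_t off, struct aiocb *cb)
{
	/* Mirror the kernel-side validation so failures surface early. */
	if (len > INT_MAX || buf == NULL || off < 0)
		return EINVAL;

	memset(cb, 0, sizeof(*cb));
	cb->aio_fildes = fd;			/* must be open for reading */
	cb->aio_buf = buf;
	cb->aio_nbytes = len;
	cb->aio_offset = off;
	cb->aio_sigevent.sigev_notify = SIGEV_SIGNAL;
	cb->aio_sigevent.sigev_signo = SIGUSR1;	/* valid; not SIGKILL/SIGSTOP */

	return (aio_read(cb) == -1) ? errno : 0;
}
#endif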
static int
aio_increment_total_count()
{
	return OSIncrementAtomic(&aio_anchor.aio_total_count);
}

static int
aio_decrement_total_count()
{
	int old = OSDecrementAtomic(&aio_anchor.aio_total_count);
	if (old <= 0) {
		panic("Negative total AIO count!\n");
	}

	return old;
}

static int
aio_get_process_count(proc_t procp )
{
	return procp->p_aio_total_count;

} /* aio_get_process_count */

static int
aio_get_all_queues_count( void )
{
	return aio_anchor.aio_total_count;

} /* aio_get_all_queues_count */
/*
 * do_aio_completion.  Handle async IO completion.
 */
static void
do_aio_completion( aio_workq_entry *entryp )
{
	boolean_t	lastLioCompleted = FALSE;
	aio_lio_context	*lio_context = NULL;
	int		waiter = 0;

	lio_context = (aio_lio_context *)entryp->group_tag;

	if (lio_context != NULL) {

		aio_proc_lock_spin(entryp->procp);

		/* Account for this I/O completing. */
		lio_context->io_completed++;

		/* Are we done with this lio context? */
		if (lio_context->io_issued == lio_context->io_completed) {
			lastLioCompleted = TRUE;
		}

		waiter = lio_context->io_waiter;

		/* explicit wakeup of lio_listio() waiting in LIO_WAIT */
		if ((entryp->flags & AIO_LIO_NOTIFY) && (lastLioCompleted) && (waiter != 0)) {
			/* wake up the waiter */
			wakeup(lio_context);
		}

		aio_proc_unlock(entryp->procp);
	}

	if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
	     (entryp->flags & AIO_DISABLE) == 0 ) {

		boolean_t	performSignal = FALSE;

		if (lio_context == NULL) {
			performSignal = TRUE;
		}
		else {
			/*
			 * If this was the last request in the group and a signal
			 * is desired, send one.
			 */
			performSignal = lastLioCompleted;
		}

		if (performSignal) {

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) | DBG_FUNC_NONE,
				      (int)entryp->procp, (int)entryp->uaiocbp,
				      entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 );

			psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );
		}
	}

	if ((entryp->flags & AIO_EXIT_WAIT) && (entryp->flags & AIO_CLOSE_WAIT)) {
		panic("Close and exit flags set at the same time\n");
	}

	/*
	 * need to handle case where a process is trying to exit, exec, or
	 * close and is currently waiting for active aio requests to complete.
	 * If AIO_CLEANUP_WAIT is set then we need to look to see if there are any
	 * other requests in the active queue for this process.  If there are
	 * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel.
	 * If there are some still active then do nothing - we only want to
	 * wakeup when all active aio requests for the process are complete.
	 *
	 * Don't need to lock the entry or proc to check the cleanup flag.  It can only be
	 * set for cancellation, while the entryp is still on a proc list; now it's
	 * off, so that flag is already set if it's going to be.
	 */
	if ( (entryp->flags & AIO_EXIT_WAIT) != 0 ) {
		int		active_requests;

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
			      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

		aio_proc_lock_spin(entryp->procp);
		active_requests = aio_active_requests_for_process( entryp->procp );
		if ( active_requests < 1 ) {
			/*
			 * no active aio requests for this process, continue exiting.  In this
			 * case, there should be no one else waiting on the proc in AIO...
			 */
			wakeup_one((caddr_t)&entryp->procp->AIO_CLEANUP_SLEEP_CHAN);
			aio_proc_unlock(entryp->procp);

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
				      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
		} else {
			aio_proc_unlock(entryp->procp);
		}
	}

	if ( (entryp->flags & AIO_CLOSE_WAIT) != 0 ) {
		int		active_requests;

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
			      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

		aio_proc_lock_spin(entryp->procp);
		active_requests = aio_proc_active_requests_for_file( entryp->procp, entryp->aiocb.aio_fildes);
		if ( active_requests < 1 ) {
			/* Can't wakeup_one(); multiple closes might be in progress. */
			wakeup(&entryp->procp->AIO_CLEANUP_SLEEP_CHAN);
			aio_proc_unlock(entryp->procp);

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
				      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
		} else {
			aio_proc_unlock(entryp->procp);
		}
	}

	/*
	 * A thread in aio_suspend() wants to know about completed IOs.  If it checked
	 * the done list before we moved our AIO there, then it already asserted its wait,
	 * and we can wake it up without holding the lock.  If it checked the list after
	 * we did our move, then it already has seen the AIO that we moved.  Hence, we
	 * can do our wakeup without holding the lock.
	 */
	wakeup( (caddr_t) &entryp->procp->AIO_SUSPEND_SLEEP_CHAN );
	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE,
		      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

	/*
	 * free the LIO context if the last lio completed and no thread is
	 * waiting on it.
	 */
	if (lastLioCompleted && (waiter == 0))
		free_lio_context(lio_context);

} /* do_aio_completion */
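
/*
 * Illustrative user-space sketch (not compiled here): the caller-side view
 * of the wakeups above.  A thread parked in aio_suspend() is released once
 * the request reaches the done queue, then collects status with
 * aio_error() and reaps the request with aio_return().
 */
#if 0	/* example only - user-space code, never built as part of this file */
#include <aio.h>
#include <errno.h>
#include <stddef.h>
#include <sys/types.h>

static ssize_t
wait_for_aio(struct aiocb *cb)
{
	const struct aiocb *list[1] = { cb };
	int err;

	while ((err = aio_error(cb)) == EINPROGRESS) {
		if (aio_suspend(list, 1, NULL) == -1 && errno != EINTR)
			return -1;
	}
	if (err != 0) {
		errno = err;
		return -1;
	}
	return aio_return(cb);	/* reaps the request; the kernel entry is freed */
}
#endif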
/*
 * do_aio_read
 */
static int
do_aio_read( aio_workq_entry *entryp )
{
	struct fileproc		*fp;
	int			error;
	struct vfs_context	context;

	if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp, 0)) )
		return(error);
	if ( (fp->f_fglob->fg_flag & FREAD) == 0 ) {
		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
		return(EBADF);
	}

	context.vc_thread = entryp->thread;	/* XXX */
	context.vc_ucred = fp->f_fglob->fg_cred;

	error = dofileread(&context, fp,
			   entryp->aiocb.aio_buf,
			   entryp->aiocb.aio_nbytes,
			   entryp->aiocb.aio_offset, FOF_OFFSET,
			   &entryp->returnval);
	fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);

	return( error );

} /* do_aio_read */
/*
 * do_aio_write
 */
static int
do_aio_write( aio_workq_entry *entryp )
{
	struct fileproc		*fp;
	int			error;
	int			flags;
	struct vfs_context	context;

	if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp, 0)) )
		return(error);
	if ( (fp->f_fglob->fg_flag & FWRITE) == 0 ) {
		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
		return(EBADF);
	}

	flags = FOF_PCRED;
	if ( (fp->f_fglob->fg_flag & O_APPEND) == 0 ) {
		flags |= FOF_OFFSET;
	}

	context.vc_thread = entryp->thread;	/* XXX */
	context.vc_ucred = fp->f_fglob->fg_cred;

	/* NB: tell dofilewrite the offset, and to use the proc cred */
	error = dofilewrite(&context,
			    fp,
			    entryp->aiocb.aio_buf,
			    entryp->aiocb.aio_nbytes,
			    entryp->aiocb.aio_offset,
			    flags,
			    &entryp->returnval);

	if (entryp->returnval)
		fp_drop_written(entryp->procp, entryp->aiocb.aio_fildes, fp);
	else
		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);

	return( error );

} /* do_aio_write */
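
/*
 * Illustrative user-space sketch (not compiled here): the offset behavior
 * implemented above.  When the descriptor was opened with O_APPEND the
 * kernel drops FOF_OFFSET, so aio_offset is ignored and the data is
 * appended; otherwise the write is positioned at aio_offset.  The path
 * and sizes are hypothetical.
 */
#if 0	/* example only - user-space code, never built as part of this file */
#include <aio.h>
#include <fcntl.h>
#include <string.h>
#include <sys/types.h>

static int
queue_append(const char *path, void *buf, size_t len, struct aiocb *cb)
{
	int fd = open(path, O_WRONLY | O_APPEND);

	if (fd == -1)
		return -1;

	memset(cb, 0, sizeof(*cb));
	cb->aio_fildes = fd;
	cb->aio_buf = buf;
	cb->aio_nbytes = len;
	cb->aio_offset = 0;	/* ignored here because of O_APPEND */

	return aio_write(cb);
}
#endif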
/*
 * aio_active_requests_for_process - return number of active async IO
 * requests for the given process.
 */
static int
aio_active_requests_for_process(proc_t procp )
{
	return( procp->p_aio_active_count );

} /* aio_active_requests_for_process */

/*
 * Called with the proc locked.
 */
static int
aio_proc_active_requests_for_file(proc_t procp, int fd)
{
	int			count = 0;
	aio_workq_entry		*entryp;

	TAILQ_FOREACH(entryp, &procp->p_aio_activeq, aio_proc_link) {
		if (entryp->aiocb.aio_fildes == fd) {
			count++;
		}
	}

	return count;
} /* aio_proc_active_requests_for_file */
/*
 * do_aio_fsync
 */
static int
do_aio_fsync( aio_workq_entry *entryp )
{
	struct vfs_context	context;
	struct vnode		*vp;
	struct fileproc		*fp;
	int			sync_flag;
	int			error;

	/*
	 * We are never called unless either AIO_FSYNC or AIO_DSYNC are set.
	 *
	 * If AIO_DSYNC is set, we can tell the lower layers that it is OK
	 * to mark for update the metadata not strictly necessary for data
	 * retrieval, rather than forcing it to disk.
	 *
	 * If AIO_FSYNC is set, we have to also wait until metadata not
	 * strictly necessary for data retrieval is committed to stable
	 * storage (e.g. atime, mtime, ctime, etc.).
	 *
	 * Metadata necessary for data retrieval must be committed to stable
	 * storage in either case (file length, etc.).
	 */
	if (entryp->flags & AIO_FSYNC)
		sync_flag = MNT_WAIT;
	else
		sync_flag = MNT_DWAIT;

	error = fp_getfvp( entryp->procp, entryp->aiocb.aio_fildes, &fp, &vp);
	if ( error == 0 ) {
		if ( (error = vnode_getwithref(vp)) ) {
			fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
			entryp->returnval = -1;
			return(error);
		}
		context.vc_thread = current_thread();
		context.vc_ucred = fp->f_fglob->fg_cred;

		error = VNOP_FSYNC( vp, sync_flag, &context);

		(void)vnode_put(vp);

		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
	}
	if ( error != 0 )
		entryp->returnval = -1;

	return( error );

} /* do_aio_fsync */
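
/*
 * Illustrative user-space sketch (not compiled here): the two sync levels
 * handled above.  aio_fsync(O_SYNC, ...) corresponds to the MNT_WAIT path
 * (all metadata, e.g. timestamps, pushed to stable storage), while
 * aio_fsync(O_DSYNC, ...) corresponds to MNT_DWAIT (only metadata needed
 * to retrieve the data).  O_DSYNC availability is platform-dependent.
 */
#if 0	/* example only - user-space code, never built as part of this file */
#include <aio.h>
#include <fcntl.h>
#include <string.h>

static int
queue_fsync(int fd, int want_full_sync, struct aiocb *cb)
{
	memset(cb, 0, sizeof(*cb));
	cb->aio_fildes = fd;

#ifdef O_DSYNC
	return aio_fsync(want_full_sync ? O_SYNC : O_DSYNC, cb);
#else
	return aio_fsync(O_SYNC, cb);
#endif
}
#endif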
/*
 * is_already_queued - runs through our queues to see if the given
 * aiocbp / process is there.  Returns TRUE if there is a match
 * on any of our aio queues.
 *
 * Called with proc aio lock held (can be held spin)
 */
static boolean_t
is_already_queued(proc_t procp,
		  user_addr_t aiocbp )
{
	aio_workq_entry		*entryp;
	boolean_t		result;

	result = FALSE;

	/* look for matches on our queue of async IO requests that have completed */
	TAILQ_FOREACH( entryp, &procp->p_aio_doneq, aio_proc_link ) {
		if ( aiocbp == entryp->uaiocbp ) {
			result = TRUE;
			goto ExitThisRoutine;
		}
	}

	/* look for matches on our queue of active async IO requests */
	TAILQ_FOREACH( entryp, &procp->p_aio_activeq, aio_proc_link ) {
		if ( aiocbp == entryp->uaiocbp ) {
			result = TRUE;
			goto ExitThisRoutine;
		}
	}

ExitThisRoutine:
	return( result );

} /* is_already_queued */
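
/*
 * Illustrative user-space sketch (not compiled here): the practical
 * consequence of the duplicate check above.  An aiocb that is still on
 * the active or done queue cannot be submitted again; it has to be reaped
 * with aio_return() first.  The resubmission helper shown is hypothetical.
 */
#if 0	/* example only - user-space code, never built as part of this file */
#include <aio.h>
#include <errno.h>

static int
resubmit_read(struct aiocb *cb)
{
	if (aio_error(cb) == EINPROGRESS)
		return EAGAIN;		/* still active; do not resubmit */

	(void)aio_return(cb);		/* reap the completed request first */

	return (aio_read(cb) == -1) ? errno : 0;
}
#endif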
static void
free_lio_context(aio_lio_context *context)
{
#if DEBUG
	OSDecrementAtomic(&lio_contexts_alloced);
#endif /* DEBUG */

	FREE( context, M_TEMP );

} /* free_lio_context */
/*
 * aio initialization
 */
__private_extern__ void
aio_init( void )
{
	int	i;

	aio_lock_grp_attr = lck_grp_attr_alloc_init();
	aio_proc_lock_grp = lck_grp_alloc_init("aio_proc", aio_lock_grp_attr);
	aio_entry_lock_grp = lck_grp_alloc_init("aio_entry", aio_lock_grp_attr);
	aio_queue_lock_grp = lck_grp_alloc_init("aio_queue", aio_lock_grp_attr);
	aio_lock_attr = lck_attr_alloc_init();

	lck_mtx_init(&aio_entry_mtx, aio_entry_lock_grp, aio_lock_attr);
	lck_mtx_init(&aio_proc_mtx, aio_proc_lock_grp, aio_lock_attr);

	aio_anchor.aio_inflight_count = 0;
	aio_anchor.aio_done_count = 0;
	aio_anchor.aio_total_count = 0;
	aio_anchor.aio_num_workqs = AIO_NUM_WORK_QUEUES;

	for (i = 0; i < AIO_NUM_WORK_QUEUES; i++) {
		aio_workq_init(&aio_anchor.aio_async_workqs[i]);
	}

	i = sizeof( aio_workq_entry );
	aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );

	_aio_create_worker_threads( aio_worker_threads );

} /* aio_init */
/*
 * aio worker threads created here.
 */
__private_extern__ void
_aio_create_worker_threads( int num )
{
	int	i;

	/* create some worker threads to handle the async IO requests */
	for ( i = 0; i < num; i++ ) {
		thread_t	myThread;

		if ( KERN_SUCCESS != kernel_thread_start((thread_continue_t)aio_work_thread, NULL, &myThread) ) {
			printf( "%s - failed to create a work thread \n", __FUNCTION__ );
		}
		else
			thread_deallocate(myThread);
	}

	return;

} /* _aio_create_worker_threads */
/*
 * Return the current activation utask
 */
task_t
get_aiotask(void)
{
	return ((struct uthread *)get_bsdthread_info(current_thread()))->uu_aio_task;
}
/*
 * In the case of an aiocb from a
 * 32-bit process we need to expand some longs and pointers to the correct
 * sizes in order to let downstream code always work on the same type of
 * aiocb (in our case that is a user_aiocb)
 */
static void
do_munge_aiocb_user32_to_user( struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
{
	the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
	the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
	the_user_aiocbp->aio_buf = CAST_USER_ADDR_T(my_aiocbp->aio_buf);
	the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
	the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
	the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;

	/* special case here.  since we do not know if sigev_value is an */
	/* int or a ptr we do NOT cast the ptr to a user_addr_t.  This */
	/* means if we send this info back to user space we need to remember */
	/* sigev_value was not expanded for the 32-bit case. */
	/* NOTE - this does NOT affect us since we don't support sigev_value */
	/* yet in the aio context. */
	the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
	the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
	the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
		my_aiocbp->aio_sigevent.sigev_value.sival_int;
	the_user_aiocbp->aio_sigevent.sigev_notify_function =
		CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function);
	the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
		CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_attributes);
}
/* Similar for 64-bit user process, so that we don't need to satisfy
 * the alignment constraints of the original user64_aiocb.
 */
static void
do_munge_aiocb_user64_to_user( struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
{
	the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
	the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
	the_user_aiocbp->aio_buf = my_aiocbp->aio_buf;
	the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
	the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
	the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;

	the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
	the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
	the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
		my_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int;
	the_user_aiocbp->aio_sigevent.sigev_notify_function =
		my_aiocbp->aio_sigevent.sigev_notify_function;
	the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
		my_aiocbp->aio_sigevent.sigev_notify_attributes;
}