/*
 * Copyright (c) 2003-2016 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * 1) ramesh is looking into how to replace taking a reference on
 *    the user's map (vm_map_reference()) since it is believed that
 *    would not hold the process for us.
 * 2) david is looking into a way for us to set the priority of the
 *    worker threads to match that of the user's thread when the
 *    async IO was queued.
 */

/*
 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
 */
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file_internal.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/vnode_internal.h>
#include <sys/malloc.h>
#include <sys/mount_internal.h>
#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/sysctl.h>
#include <sys/unistd.h>

#include <sys/aio_kern.h>
#include <sys/sysproto.h>

#include <machine/limits.h>

#include <mach/mach_types.h>
#include <kern/kern_types.h>
#include <kern/waitq.h>
#include <kern/zalloc.h>
#include <kern/task.h>
#include <kern/sched_prim.h>

#include <vm/vm_map.h>

#include <libkern/OSAtomic.h>

#include <sys/kdebug.h>
#define AIO_work_queued			1
#define AIO_worker_wake			2
#define AIO_completion_sig		3
#define AIO_completion_cleanup_wait	4
#define AIO_completion_cleanup_wake	5
#define AIO_completion_suspend_wake	6
#define AIO_fsync_delay			7
#define AIO_cancel_async_workq		11
#define AIO_cancel_sync_workq		12
#define AIO_cancel_activeq		13
#define AIO_cancel_doneq		14
#define AIO_error_val			61
#define AIO_error_activeq		62
#define AIO_error_workq			63
#define AIO_return_val			71
#define AIO_return_activeq		72
#define AIO_return_workq		73
#define AIO_exit_sleep			91
#define AIO_close			100
#define AIO_close_sleep			101
#define AIO_suspend			110
#define AIO_suspend_sleep		111
#define AIO_worker_thread		120

#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
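/*
 * These constants are only used as kdebug trace codes; each call site below
 * combines one of them with BSDDBG_CODE(DBG_BSD_AIO, ...) and a function
 * qualifier, for example:
 *
 *	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
 *		     (int)procp, (int)aiocbp, 0, 0, 0 );
 *
 * so the requests show up as BSD/AIO events in a kdebug trace.
 */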
/*
 * aio requests queue up on the aio_async_workq or lio_sync_workq (for
 * lio_listio LIO_WAIT).  Requests then move to the per-process aio_activeq
 * (proc.aio_activeq) when one of our worker threads starts the IO.
 * And finally, requests move to the per-process aio_doneq (proc.aio_doneq)
 * when the IO request completes.  The request remains on aio_doneq until
 * the user process calls aio_return or the process exits; either way, that
 * is our trigger to release aio resources.
 */
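/*
 * In short, a request moves through three stages:
 *
 *	aio_async_workq  --(worker thread picks it up)-->  proc.p_aio_activeq
 *	                 --(IO completes)-->               proc.p_aio_doneq
 *	                 --(aio_return / process exit)-->  freed
 *
 * and aio_anchor below keeps the global counts that mirror these transitions
 * (aio_inflight_count, aio_done_count, aio_total_count).
 */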
typedef struct aio_workq {
	TAILQ_HEAD(, aio_workq_entry)	aioq_entries;
	int			aioq_count;
	lck_mtx_t		aioq_mtx;
	struct waitq		aioq_waitq;
} *aio_workq_t;
#define AIO_NUM_WORK_QUEUES 1

struct aio_anchor_cb
{
	volatile int32_t	aio_inflight_count;	/* entries that have been taken from a workq */
	volatile int32_t	aio_done_count;		/* entries on all done queues (proc.aio_doneq) */
	volatile int32_t	aio_total_count;	/* total extant entries */

	/* Hash table of queues here */
	struct aio_workq	aio_async_workqs[AIO_NUM_WORK_QUEUES];
};
typedef struct aio_anchor_cb aio_anchor_cb;
struct aio_lio_context
{
	int		io_waiter;
	int		io_issued;
	int		io_completed;
};
typedef struct aio_lio_context aio_lio_context;
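/*
 * As used below: io_issued is set to the number of aiocbs submitted by a
 * lio_listio() call and decremented for slots that fail to queue;
 * io_completed is advanced by do_aio_completion() as each request in the
 * group finishes; io_waiter is nonzero while a LIO_WAIT caller is sleeping
 * on the context.  When io_completed catches up with io_issued the last
 * completion wakes that waiter, and the context is marked to be freed by
 * whichever side finishes last.
 */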
/*
 * Notes on aio sleep / wake channels.
 * We currently pick a couple of fields within the proc structure that give us
 * sleep channels that do not collide with any other kernel routines.
 * At this time, for binary compatibility reasons, we cannot create new proc fields.
 */
#define AIO_SUSPEND_SLEEP_CHAN	p_aio_active_count
#define AIO_CLEANUP_SLEEP_CHAN	p_aio_total_count
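/*
 * For example, _aio_exit() below sleeps with
 *	msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_exit", 0);
 * and aio_suspend_nocancel() sleeps on &p->AIO_SUSPEND_SLEEP_CHAN; the
 * completion path is expected to wakeup() those addresses when the relevant
 * per-process counts drop.
 */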
#define ASSERT_AIO_FROM_PROC(aiop, theproc)	\
	if ((aiop)->procp != (theproc)) {	\
		panic("AIO on a proc list that does not belong to that proc.\n"); \
	}
/*
 * LOCAL PROTOTYPES
 */
static void		aio_proc_lock(proc_t procp);
static void		aio_proc_lock_spin(proc_t procp);
static void		aio_proc_unlock(proc_t procp);
static lck_mtx_t*	aio_proc_mutex(proc_t procp);
static void		aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp);
static void		aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp);
static int		aio_get_process_count(proc_t procp);
static int		aio_active_requests_for_process(proc_t procp);
static int		aio_proc_active_requests_for_file(proc_t procp, int fd);
static boolean_t	is_already_queued(proc_t procp, user_addr_t aiocbp);
static boolean_t	should_cancel(aio_workq_entry *entryp, user_addr_t aiocbp, int fd);

static void		aio_entry_lock(aio_workq_entry *entryp);
static void		aio_entry_lock_spin(aio_workq_entry *entryp);
static aio_workq_t	aio_entry_workq(aio_workq_entry *entryp);
static lck_mtx_t*	aio_entry_mutex(__unused aio_workq_entry *entryp);
static void		aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
static void		aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
static void		aio_entry_ref_locked(aio_workq_entry *entryp);
static void		aio_entry_unref_locked(aio_workq_entry *entryp);
static void		aio_entry_ref(aio_workq_entry *entryp);
static void		aio_entry_unref(aio_workq_entry *entryp);
static void		aio_entry_update_for_cancel(aio_workq_entry *entryp, boolean_t cancelled,
					int wait_for_completion, boolean_t disable_notification);
static int		aio_entry_try_workq_remove(aio_workq_entry *entryp);
static boolean_t	aio_delay_fsync_request( aio_workq_entry *entryp );
static int		aio_free_request(aio_workq_entry *entryp);

static void		aio_workq_init(aio_workq_t wq);
static void		aio_workq_lock_spin(aio_workq_t wq);
static void		aio_workq_unlock(aio_workq_t wq);
static lck_mtx_t*	aio_workq_mutex(aio_workq_t wq);

static void		aio_work_thread( void );
static aio_workq_entry	*aio_get_some_work( void );

static int		aio_get_all_queues_count( void );
static int		aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO );
static int		aio_validate( aio_workq_entry *entryp );
static int		aio_increment_total_count(void);
static int		aio_decrement_total_count(void);

static int		do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp, int wait_for_completion, boolean_t disable_notification);
static void		do_aio_completion( aio_workq_entry *entryp );
static int		do_aio_fsync( aio_workq_entry *entryp );
static int		do_aio_read( aio_workq_entry *entryp );
static int		do_aio_write( aio_workq_entry *entryp );
static void		do_munge_aiocb_user32_to_user( struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
static void		do_munge_aiocb_user64_to_user( struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
static int		lio_create_entry(proc_t procp,
					 user_addr_t aiocbp,
					 void *group_tag,
					 aio_workq_entry **entrypp );

static aio_workq_entry	*aio_create_queue_entry(proc_t procp,
					 user_addr_t aiocbp,
					 void *group_tag,
					 int kindOfIO);
static user_addr_t	*aio_copy_in_list(proc_t procp, user_addr_t aiocblist, int nent);
static void		free_lio_context(aio_lio_context* context);
static void		aio_enqueue_work( proc_t procp, aio_workq_entry *entryp, int proc_locked);

#define ASSERT_AIO_PROC_LOCK_OWNED(p)	lck_mtx_assert(aio_proc_mutex((p)), LCK_MTX_ASSERT_OWNED)
#define ASSERT_AIO_WORKQ_LOCK_OWNED(q)	lck_mtx_assert(aio_workq_mutex((q)), LCK_MTX_ASSERT_OWNED)
#define ASSERT_AIO_ENTRY_LOCK_OWNED(e)	lck_mtx_assert(aio_entry_mutex((e)), LCK_MTX_ASSERT_OWNED)
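/*
 * Locking note (as used in this file): the per-process AIO lock is simply the
 * proc's own p_mlock (see aio_proc_mutex()), each work queue has its own
 * aioq_mtx, and all entries currently share the single global aio_entry_mtx.
 * Code that holds the proc lock may take a work queue or entry lock on top of
 * it (do_aio_cancel_locked() does both), and the asserts above are sprinkled
 * through the helpers to document which lock each one expects to be held.
 */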
/*
 * EXTERNAL PROTOTYPES
 */

/* in ...bsd/kern/sys_generic.c */
extern int dofileread(vfs_context_t ctx, struct fileproc *fp,
		      user_addr_t bufp, user_size_t nbyte,
		      off_t offset, int flags, user_ssize_t *retval );
extern int dofilewrite(vfs_context_t ctx, struct fileproc *fp,
		       user_addr_t bufp, user_size_t nbyte, off_t offset,
		       int flags, user_ssize_t *retval );
static uint32_t lio_contexts_alloced = 0;

/*
 * aio external global variables.
 */
extern int aio_max_requests;			/* AIO_MAX - configurable */
extern int aio_max_requests_per_process;	/* AIO_PROCESS_MAX - configurable */
extern int aio_worker_threads;			/* AIO_THREAD_COUNT - configurable */

/*
 * aio static variables.
 */
static aio_anchor_cb	aio_anchor;
static lck_grp_t	*aio_proc_lock_grp;
static lck_grp_t	*aio_entry_lock_grp;
static lck_grp_t	*aio_queue_lock_grp;
static lck_attr_t	*aio_lock_attr;
static lck_grp_attr_t	*aio_lock_grp_attr;
static struct zone	*aio_workq_zonep;
static lck_mtx_t	aio_entry_mtx;
static lck_mtx_t	aio_proc_mtx;
static void
aio_entry_lock(__unused aio_workq_entry *entryp)
{
	lck_mtx_lock(&aio_entry_mtx);
}

static void
aio_entry_lock_spin(__unused aio_workq_entry *entryp)
{
	lck_mtx_lock_spin(&aio_entry_mtx);
}

void
aio_entry_unlock(__unused aio_workq_entry *entryp)
{
	lck_mtx_unlock(&aio_entry_mtx);
}

static aio_workq_t
aio_entry_workq(__unused aio_workq_entry *entryp)
{
	return &aio_anchor.aio_async_workqs[0];
}

static lck_mtx_t*
aio_entry_mutex(__unused aio_workq_entry *entryp)
{
	return &aio_entry_mtx;
}
static void
aio_workq_init(aio_workq_t wq)
{
	TAILQ_INIT(&wq->aioq_entries);
	lck_mtx_init(&wq->aioq_mtx, aio_queue_lock_grp, aio_lock_attr);
	waitq_init(&wq->aioq_waitq, SYNC_POLICY_FIFO);
}
/*
 * Can be passed a queue which is locked spin.
 */
static void
aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
{
	ASSERT_AIO_WORKQ_LOCK_OWNED(queue);

	if (entryp->aio_workq_link.tqe_prev == NULL) {
		panic("Trying to remove an entry from a work queue, but it is not on a queue\n");
	}

	TAILQ_REMOVE(&queue->aioq_entries, entryp, aio_workq_link);
	entryp->aio_workq_link.tqe_prev = NULL; /* Not on a workq */

	if (queue->aioq_count < 0) {
		panic("Negative count on a queue.\n");
	}
}
static void
aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
{
	ASSERT_AIO_WORKQ_LOCK_OWNED(queue);

	TAILQ_INSERT_TAIL(&queue->aioq_entries, entryp, aio_workq_link);
	if (queue->aioq_count < 0) {
		panic("Negative count on a queue.\n");
	}
}
static void
aio_proc_lock(proc_t procp)
{
	lck_mtx_lock(aio_proc_mutex(procp));
}

static void
aio_proc_lock_spin(proc_t procp)
{
	lck_mtx_lock_spin(aio_proc_mutex(procp));
}
static void
aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp)
{
	ASSERT_AIO_PROC_LOCK_OWNED(procp);

	TAILQ_REMOVE(&procp->p_aio_activeq, entryp, aio_proc_link );
	TAILQ_INSERT_TAIL( &procp->p_aio_doneq, entryp, aio_proc_link);
	procp->p_aio_active_count--;
	OSIncrementAtomic(&aio_anchor.aio_done_count);
}
static void
aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp)
{
	TAILQ_REMOVE(&procp->p_aio_doneq, entryp, aio_proc_link);
	OSDecrementAtomic(&aio_anchor.aio_done_count);
	aio_decrement_total_count();
	procp->p_aio_total_count--;
}
static void
aio_proc_unlock(proc_t procp)
{
	lck_mtx_unlock(aio_proc_mutex(procp));
}

static lck_mtx_t*
aio_proc_mutex(proc_t procp)
{
	return &procp->p_mlock;
}
static void
aio_entry_ref_locked(aio_workq_entry *entryp)
{
	ASSERT_AIO_ENTRY_LOCK_OWNED(entryp);

	if (entryp->aio_refcount < 0) {
		panic("AIO workq entry with a negative refcount.\n");
	}
	entryp->aio_refcount++;
}
/* Return 1 if you've freed it */
static void
aio_entry_unref_locked(aio_workq_entry *entryp)
{
	ASSERT_AIO_ENTRY_LOCK_OWNED(entryp);

	entryp->aio_refcount--;
	if (entryp->aio_refcount < 0) {
		panic("AIO workq entry with a negative refcount.\n");
	}
}
static void
aio_entry_ref(aio_workq_entry *entryp)
{
	aio_entry_lock_spin(entryp);
	aio_entry_ref_locked(entryp);
	aio_entry_unlock(entryp);
}

static void
aio_entry_unref(aio_workq_entry *entryp)
{
	aio_entry_lock_spin(entryp);
	aio_entry_unref_locked(entryp);

	if ((entryp->aio_refcount == 0) && ((entryp->flags & AIO_DO_FREE) != 0)) {
		aio_entry_unlock(entryp);
		aio_free_request(entryp);
	} else {
		aio_entry_unlock(entryp);
	}
}
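/*
 * Reference counting convention, as it appears in this file: an entry starts
 * with aio_refcount == 0; a worker or canceller that needs the entry to
 * survive while it is off the queues takes a reference (aio_entry_ref()), and
 * the final unref only frees the entry if AIO_DO_FREE has been set.
 * aio_return() and _aio_exit() set AIO_DO_FREE when they find a nonzero
 * refcount, so whoever drops the last reference performs the free.
 */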
static void
aio_entry_update_for_cancel(aio_workq_entry *entryp, boolean_t cancelled, int wait_for_completion, boolean_t disable_notification)
{
	aio_entry_lock_spin(entryp);

	if (cancelled) {
		aio_entry_ref_locked(entryp);
		entryp->errorval = ECANCELED;
		entryp->returnval = -1;
	}

	if ( wait_for_completion ) {
		entryp->flags |= wait_for_completion; /* flag for special completion processing */
	}

	if ( disable_notification ) {
		entryp->flags |= AIO_DISABLE; /* Don't want a signal */
	}

	aio_entry_unlock(entryp);
}
static int
aio_entry_try_workq_remove(aio_workq_entry *entryp)
{
	/* Can only be cancelled if it's still on a work queue */
	if (entryp->aio_workq_link.tqe_prev != NULL) {
		aio_workq_t queue;

		/* Will have to check again under the lock */
		queue = aio_entry_workq(entryp);
		aio_workq_lock_spin(queue);
		if (entryp->aio_workq_link.tqe_prev != NULL) {
			aio_workq_remove_entry_locked(queue, entryp);
			aio_workq_unlock(queue);
			return 1;
		} else {
			aio_workq_unlock(queue);
		}
	}

	return 0;
}
static void
aio_workq_lock_spin(aio_workq_t wq)
{
	lck_mtx_lock_spin(aio_workq_mutex(wq));
}

static void
aio_workq_unlock(aio_workq_t wq)
{
	lck_mtx_unlock(aio_workq_mutex(wq));
}

static lck_mtx_t*
aio_workq_mutex(aio_workq_t wq)
{
	return &wq->aioq_mtx;
}
/*
 * aio_cancel - attempt to cancel one or more async IO requests currently
 * outstanding against file descriptor uap->fd.  If uap->aiocbp is not
 * NULL then only one specific IO is cancelled (if possible).  If uap->aiocbp
 * is NULL then all outstanding async IO requests for the given file
 * descriptor are cancelled (if possible).
 */
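/*
 * Illustrative user-space view (not part of this file): a caller typically
 * issues something like
 *
 *	struct aiocb cb = { .aio_fildes = fd, ... };
 *	aio_read(&cb);
 *	...
 *	int r = aio_cancel(fd, &cb);   // or aio_cancel(fd, NULL) for all IOs on fd
 *
 * and r comes back as AIO_CANCELED, AIO_NOTCANCELED, or AIO_ALLDONE, mirroring
 * the results computed by do_aio_cancel_locked() below.
 */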
int
aio_cancel(proc_t p, struct aio_cancel_args *uap, int *retval )
{
	struct user_aiocb	my_aiocb;
	int			result;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START,
		     (int)p, (int)uap->aiocbp, 0, 0, 0 );

	/* quick check to see if there are any async IO requests queued up */
	if (aio_get_all_queues_count() < 1) {
		*retval = AIO_ALLDONE;
	}

	if ( uap->aiocbp != USER_ADDR_NULL ) {
		if ( proc_is64bit(p) ) {
			struct user64_aiocb aiocb64;

			result = copyin( uap->aiocbp, &aiocb64, sizeof(aiocb64) );
			do_munge_aiocb_user64_to_user(&aiocb64, &my_aiocb);
		} else {
			struct user32_aiocb aiocb32;

			result = copyin( uap->aiocbp, &aiocb32, sizeof(aiocb32) );
			do_munge_aiocb_user32_to_user( &aiocb32, &my_aiocb );
		}

		/*
		 * NOTE - POSIX standard says a mismatch between the file
		 * descriptor passed in and the file descriptor embedded in
		 * the aiocb causes unspecified results.  We return EBADF in
		 * that situation.
		 */
		if ( uap->fd != my_aiocb.aio_fildes ) {
		}
	}

	result = do_aio_cancel_locked( p, uap->fd, uap->aiocbp, 0, FALSE );
	ASSERT_AIO_PROC_LOCK_OWNED(p);

	if ( result != -1 ) {
	}

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END,
		     (int)p, (int)uap->aiocbp, result, 0, 0 );
}
/*
 * _aio_close - internal function used to clean up async IO requests for
 * a file descriptor that is closing.
 */
__private_extern__ void
_aio_close(proc_t p, int fd )
{
	int		error;

	/* quick check to see if there are any async IO requests queued up */
	if (aio_get_all_queues_count() < 1) {
		return;
	}

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_START,
		     (int)p, fd, 0, 0, 0 );

	/* cancel all async IO requests on our todo queues for this file descriptor */
	error = do_aio_cancel_locked( p, fd, 0, AIO_CLOSE_WAIT, FALSE );
	ASSERT_AIO_PROC_LOCK_OWNED(p);
	if ( error == AIO_NOTCANCELED ) {
		/*
		 * AIO_NOTCANCELED is returned when we find an aio request for this process
		 * and file descriptor on the active async IO queue.  Active requests cannot
		 * be cancelled so we must wait for them to complete.  We will get a special
		 * wake up call on our channel used to sleep for ALL active requests to
		 * complete.  This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
		 * when we must wait for all active aio requests.
		 */

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) | DBG_FUNC_NONE,
			     (int)p, fd, 0, 0, 0 );

		while (aio_proc_active_requests_for_file(p, fd) > 0) {
			msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_close", 0 );
		}
	}

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_END,
		     (int)p, fd, 0, 0, 0 );
}
/*
 * aio_error - return the error status associated with the async IO
 * request referred to by uap->aiocbp.  The error status is the errno
 * value that would be set by the corresponding IO request (read, write,
 * fdatasync, or sync).
 */
int
aio_error(proc_t p, struct aio_error_args *uap, int *retval )
{
	aio_workq_entry	*entryp;
	int		error;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_START,
		     (int)p, (int)uap->aiocbp, 0, 0, 0 );

	/* see if there are any aios to check */
	if (aio_get_all_queues_count() < 1) {
	}

	/* look for a match on our queue of async IO requests that have completed */
	TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
		if ( entryp->uaiocbp == uap->aiocbp ) {
			ASSERT_AIO_FROM_PROC(entryp, p);

			aio_entry_lock_spin(entryp);
			*retval = entryp->errorval;
			aio_entry_unlock(entryp);
			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) | DBG_FUNC_NONE,
				     (int)p, (int)uap->aiocbp, *retval, 0, 0 );
		}
	}

	/* look for a match on our queue of active async IO requests */
	TAILQ_FOREACH( entryp, &p->p_aio_activeq, aio_proc_link) {
		if ( entryp->uaiocbp == uap->aiocbp ) {
			ASSERT_AIO_FROM_PROC(entryp, p);
			*retval = EINPROGRESS;
			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) | DBG_FUNC_NONE,
				     (int)p, (int)uap->aiocbp, *retval, 0, 0 );
		}
	}

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_END,
		     (int)p, (int)uap->aiocbp, error, 0, 0 );
}
/*
 * aio_fsync - asynchronously force all IO operations associated
 * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
 * queued at the time of the call to the synchronized completion state.
 * NOTE - we do not support op O_DSYNC at this point since we do not support the
 * fdatasync() call.
 */
int
aio_fsync(proc_t p, struct aio_fsync_args *uap, int *retval )
{
	int		error;
	int		fsync_kind;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_START,
		     (int)p, (int)uap->aiocbp, uap->op, 0, 0 );

	/* 0 := O_SYNC for binary backward compatibility with Panther */
	if (uap->op == O_SYNC || uap->op == 0)
		fsync_kind = AIO_FSYNC;
	else if ( uap->op == O_DSYNC )
		fsync_kind = AIO_DSYNC;

	error = aio_queue_async_request( p, uap->aiocbp, fsync_kind );

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_END,
		     (int)p, (int)uap->aiocbp, error, 0, 0 );
}
/*
 * aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
 * file descriptor (uap->aiocbp->aio_fildes) into the buffer
 * (uap->aiocbp->aio_buf).
 */
int
aio_read(proc_t p, struct aio_read_args *uap, int *retval )
{
	int		error;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_START,
		     (int)p, (int)uap->aiocbp, 0, 0, 0 );

	error = aio_queue_async_request( p, uap->aiocbp, AIO_READ );

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_END,
		     (int)p, (int)uap->aiocbp, error, 0, 0 );
}
/*
 * aio_return - return the return status associated with the async IO
 * request referred to by uap->aiocbp.  The return status is the value
 * that would be returned by the corresponding IO request (read, write,
 * fdatasync, or sync).  This is where we release kernel resources
 * held for the async IO call associated with the given aiocb pointer.
 */
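/*
 * Note on usage: aio_error() can be polled while a request is in flight (it
 * reports EINPROGRESS from the active queue above), but aio_return() is a
 * one-shot call - once it finds the request on the done queue it pulls the
 * entry off the list and releases it, so a second aio_return() for the same
 * aiocb will not find it.
 */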
int
aio_return(proc_t p, struct aio_return_args *uap, user_ssize_t *retval )
{
	aio_workq_entry	*entryp;
	int		error;
	boolean_t	proc_lock_held = FALSE;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_START,
		     (int)p, (int)uap->aiocbp, 0, 0, 0 );

	/* See if there are any entries to check */
	if (aio_get_all_queues_count() < 1) {
	}

	proc_lock_held = TRUE;

	/* look for a match on our queue of async IO requests that have completed */
	TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
		ASSERT_AIO_FROM_PROC(entryp, p);
		if ( entryp->uaiocbp == uap->aiocbp ) {
			/* Done and valid for aio_return(), pull it off the list */
			aio_proc_remove_done_locked(p, entryp);

			/* Drop the proc lock, but keep the entry locked */
			aio_entry_lock(entryp);
			proc_lock_held = FALSE;

			*retval = entryp->returnval;

			/* No references and off all lists, safe to free */
			if (entryp->aio_refcount == 0) {
				aio_entry_unlock(entryp);
				aio_free_request(entryp);
			} else {
				/* Whoever has the refcount will have to free it */
				entryp->flags |= AIO_DO_FREE;
				aio_entry_unlock(entryp);
			}

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) | DBG_FUNC_NONE,
				     (int)p, (int)uap->aiocbp, *retval, 0, 0 );
		}
	}

	/* look for a match on our queue of active async IO requests */
	TAILQ_FOREACH( entryp, &p->p_aio_activeq, aio_proc_link) {
		ASSERT_AIO_FROM_PROC(entryp, p);
		if ( entryp->uaiocbp == uap->aiocbp ) {
			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) | DBG_FUNC_NONE,
				     (int)p, (int)uap->aiocbp, *retval, 0, 0 );
		}
	}

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_END,
		     (int)p, (int)uap->aiocbp, error, 0, 0 );
}
/*
 * _aio_exec - internal function used to clean up async IO requests for
 * a process that is going away due to exec().  We cancel any async IOs
 * we can and wait for those already active.  We also disable signaling
 * for cancelled or active aio requests that complete.
 * This routine MAY block!
 */
__private_extern__ void
_aio_exec(proc_t p )
{
	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_START,
		     (int)p, 0, 0, 0, 0 );

	_aio_exit( p );

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_END,
		     (int)p, 0, 0, 0, 0 );
}
/*
 * _aio_exit - internal function used to clean up async IO requests for
 * a process that is terminating (via exit() or exec()).  We cancel any async IOs
 * we can and wait for those already active.  We also disable signaling
 * for cancelled or active aio requests that complete.  This routine MAY block!
 */
__private_extern__ void
_aio_exit(proc_t p )
{
	int			error;
	aio_workq_entry		*entryp;

	/* quick check to see if there are any async IO requests queued up */
	if (aio_get_all_queues_count() < 1) {
		return;
	}

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_START,
		     (int)p, 0, 0, 0, 0 );

	/*
	 * cancel async IO requests on the todo work queue and wait for those
	 * already active to complete.
	 */
	error = do_aio_cancel_locked( p, 0, 0, AIO_EXIT_WAIT, TRUE );
	ASSERT_AIO_PROC_LOCK_OWNED(p);
	if ( error == AIO_NOTCANCELED ) {
		/*
		 * AIO_NOTCANCELED is returned when we find an aio request for this process
		 * on the active async IO queue.  Active requests cannot be cancelled so we
		 * must wait for them to complete.  We will get a special wake up call on
		 * our channel used to sleep for ALL active requests to complete.  This sleep
		 * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
		 * active aio requests.
		 */

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) | DBG_FUNC_NONE,
			     (int)p, 0, 0, 0, 0 );

		while (p->p_aio_active_count != 0) {
			msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_exit", 0 );
		}
	}

	if (p->p_aio_active_count != 0) {
		panic("Exiting process has %d active AIOs after cancellation has completed.\n", p->p_aio_active_count);
	}

	/* release all aio resources used by this process */
	entryp = TAILQ_FIRST( &p->p_aio_doneq );
	while ( entryp != NULL ) {
		ASSERT_AIO_FROM_PROC(entryp, p);
		aio_workq_entry		*next_entryp;

		next_entryp = TAILQ_NEXT( entryp, aio_proc_link);
		aio_proc_remove_done_locked(p, entryp);

		/* we cannot free requests that are still completing */
		aio_entry_lock_spin(entryp);
		if (entryp->aio_refcount == 0) {
			aio_entry_unlock(entryp);
			aio_free_request(entryp);

			/* need to start over since aio_doneq may have been */
			/* changed while we were away. */
			entryp = TAILQ_FIRST( &p->p_aio_doneq );
			continue;
		} else {
			/* whoever has the reference will have to do the free */
			entryp->flags |= AIO_DO_FREE;
		}

		aio_entry_unlock(entryp);
		entryp = next_entryp;
	}

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END,
		     (int)p, 0, 0, 0, 0 );
}
static boolean_t
should_cancel(aio_workq_entry *entryp, user_addr_t aiocbp, int fd)
{
	if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
	     (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
	     (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
		return TRUE;
	}

	return FALSE;
}
/*
 * do_aio_cancel_locked - cancel async IO requests (if possible).  We get called by
 * aio_cancel, close, and at exit.
 * There are three modes of operation: 1) cancel all async IOs for a process -
 * fd is 0 and aiocbp is NULL 2) cancel all async IOs for file descriptor - fd
 * is > 0 and aiocbp is NULL 3) cancel one async IO associated with the given
 * aiocbp.
 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
 * target async IO requests, and AIO_ALLDONE if all target async IO requests
 * were already complete.
 * WARNING - do not dereference aiocbp in this routine, it may point to user
 * land data that has not been copied in (when called from aio_cancel())
 *
 * Called with proc locked, and returns the same way.
 */
static int
do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp,
	int wait_for_completion, boolean_t disable_notification)
{
	ASSERT_AIO_PROC_LOCK_OWNED(p);

	aio_workq_entry		*entryp;
	int			result;

	result = -1;

	/* look for a match on our queue of async todo work. */
	entryp = TAILQ_FIRST(&p->p_aio_activeq);
	while ( entryp != NULL ) {
		ASSERT_AIO_FROM_PROC(entryp, p);
		aio_workq_entry		*next_entryp;

		next_entryp = TAILQ_NEXT( entryp, aio_proc_link);
		if (!should_cancel(entryp, aiocbp, fd)) {
			entryp = next_entryp;
			continue;
		}

		/* Can only be cancelled if it's still on a work queue */
		if (aio_entry_try_workq_remove(entryp) != 0) {
			/* Have removed from workq. Update entry state and take a ref */
			aio_entry_update_for_cancel(entryp, TRUE, 0, disable_notification);

			/* Put on the proc done queue and update counts, then unlock the proc */
			aio_proc_move_done_locked(p, entryp);

			/* Now it's officially cancelled.  Do the completion */
			result = AIO_CANCELED;
			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) | DBG_FUNC_NONE,
				     (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
			do_aio_completion(entryp);

			/* This will free if the aio_return() has already happened ... */
			aio_entry_unref(entryp);

			if ( aiocbp != USER_ADDR_NULL ) {
				return( result );
			}

			/*
			 * Restart from the head of the proc active queue since it
			 * may have been changed while we were away doing completion
			 * processing.
			 *
			 * Note that if we found an uncancellable AIO before, we will
			 * either find it again or discover that it's been completed,
			 * so resetting the result will not cause us to return success
			 * despite outstanding AIOs.
			 */
			entryp = TAILQ_FIRST(&p->p_aio_activeq);
			result = -1; /* As if beginning anew */
		} else {
			/*
			 * It's been taken off the active queue already, i.e. is in flight.
			 * All we can do is ask for notification.
			 */
			result = AIO_NOTCANCELED;

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE,
				     (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

			/* Mark for waiting and such; will not take a ref if "cancelled" arg is FALSE */
			aio_entry_update_for_cancel(entryp, FALSE, wait_for_completion, disable_notification);

			if ( aiocbp != USER_ADDR_NULL ) {
				return( result );
			}
			entryp = next_entryp;
		}
	}

	/*
	 * if we didn't find any matches on the todo or active queues then look for a
	 * match on our queue of async IO requests that have completed and if found
	 * return AIO_ALLDONE result.
	 *
	 * Proc AIO lock is still held.
	 */
	if ( result == -1 ) {
		TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
			ASSERT_AIO_FROM_PROC(entryp, p);
			if (should_cancel(entryp, aiocbp, fd)) {
				result = AIO_ALLDONE;
				KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE,
					     (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

				if ( aiocbp != USER_ADDR_NULL ) {
					return( result );
				}
			}
		}
	}

	return( result );

} /* do_aio_cancel_locked */
/*
 * aio_suspend - suspend the calling thread until at least one of the async
 * IO operations referenced by uap->aiocblist has completed, until a signal
 * interrupts the function, or uap->timeoutp time interval (optional) has
 * elapsed.
 * Returns 0 if one or more async IOs have completed else -1 and errno is
 * set appropriately - EAGAIN if timeout elapses or EINTR if an interrupt
 * is caught.
 */
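/*
 * Illustrative user-space view (not part of this file):
 *
 *	const struct aiocb *list[2] = { &cb1, &cb2 };
 *	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 *	if (aio_suspend(list, 2, &ts) == -1 && errno == EAGAIN)
 *		;	// timeout - nothing in the list has finished yet
 *
 * The kernel side below copies in the aiocb pointer list, scans the done
 * queue for any match, and otherwise sleeps on AIO_SUSPEND_SLEEP_CHAN until
 * a completion, a signal, or the deadline.
 */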
int
aio_suspend(proc_t p, struct aio_suspend_args *uap, int *retval )
{
	__pthread_testcancel(1);
	return(aio_suspend_nocancel(p, (struct aio_suspend_nocancel_args *)uap, retval));
}

int
aio_suspend_nocancel(proc_t p, struct aio_suspend_nocancel_args *uap, int *retval )
{
	int			error;
	int			i, count;
	uint64_t		abstime;
	struct user_timespec	ts;
	aio_workq_entry		*entryp;
	user_addr_t		*aiocbpp;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START,
		     (int)p, uap->nent, 0, 0, 0 );

	count = aio_get_all_queues_count( );
	if ( count < 1 ) {
		goto ExitThisRoutine;
	}

	if ( uap->nent < 1 || uap->nent > aio_max_requests_per_process ) {
		goto ExitThisRoutine;
	}

	if ( uap->timeoutp != USER_ADDR_NULL ) {
		if ( proc_is64bit(p) ) {
			struct user64_timespec temp;
			error = copyin( uap->timeoutp, &temp, sizeof(temp) );
			ts.tv_sec = temp.tv_sec;
			ts.tv_nsec = temp.tv_nsec;
		} else {
			struct user32_timespec temp;
			error = copyin( uap->timeoutp, &temp, sizeof(temp) );
			ts.tv_sec = temp.tv_sec;
			ts.tv_nsec = temp.tv_nsec;
		}
		if ( error != 0 ) {
			goto ExitThisRoutine;
		}

		if ( ts.tv_sec < 0 || ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000 ) {
			goto ExitThisRoutine;
		}

		nanoseconds_to_absolutetime( (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
					     &abstime );
		clock_absolutetime_interval_to_deadline( abstime, &abstime );
	}

	aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent);
	if ( aiocbpp == NULL ) {
		goto ExitThisRoutine;
	}

	/* check list of aio requests to see if any have completed */
check_for_our_aiocbp:
	aio_proc_lock_spin(p);
	for ( i = 0; i < uap->nent; i++ ) {
		user_addr_t	aiocbp;

		/* NULL elements are legal so check for 'em */
		aiocbp = *(aiocbpp + i);
		if ( aiocbp == USER_ADDR_NULL )
			continue;

		/* return immediately if any aio request in the list is done */
		TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
			ASSERT_AIO_FROM_PROC(entryp, p);
			if ( entryp->uaiocbp == aiocbp ) {
				goto ExitThisRoutine;
			}
		}
	} /* for ( ; i < uap->nent; ) */

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE,
		     (int)p, uap->nent, 0, 0, 0 );

	/*
	 * wait for an async IO to complete or a signal fires or timeout expires.
	 * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
	 * interrupts us.  If an async IO completes before a signal fires or our
	 * timeout expires, we get a wakeup call from aio_work_thread().
	 */

	error = msleep1(&p->AIO_SUSPEND_SLEEP_CHAN, aio_proc_mutex(p), PCATCH | PWAIT | PDROP, "aio_suspend", abstime); /* XXX better priority? */
	if ( error == 0 ) {
		/*
		 * got our wakeup call from aio_work_thread().
		 * Since we can get a wakeup on this channel from another thread in the
		 * same process we head back up to make sure this is for the correct aiocbp.
		 * If it is the correct aiocbp we will return from where we do the check
		 * (see entryp->uaiocbp == aiocbp after check_for_our_aiocbp label)
		 * else we will fall out and just sleep again.
		 */
		goto check_for_our_aiocbp;
	}
	else if ( error == EWOULDBLOCK ) {
		/* our timeout expired */
		error = EAGAIN;
	}
	else {
		/* we were interrupted */
		error = EINTR;
	}

ExitThisRoutine:
	if ( aiocbpp != NULL )
		FREE( aiocbpp, M_TEMP );

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_END,
		     (int)p, uap->nent, error, 0, 0 );

	return( error );
}
/*
 * aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
 * file descriptor (uap->aiocbp->aio_fildes) from the buffer
 * (uap->aiocbp->aio_buf).
 */
int
aio_write(proc_t p, struct aio_write_args *uap, int *retval )
{
	int		error;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_START,
		     (int)p, (int)uap->aiocbp, 0, 0, 0 );

	error = aio_queue_async_request( p, uap->aiocbp, AIO_WRITE );

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_END,
		     (int)p, (int)uap->aiocbp, error, 0, 0 );
}
static user_addr_t *
aio_copy_in_list(proc_t procp, user_addr_t aiocblist, int nent)
{
	user_addr_t	*aiocbpp;
	int		i, result;

	/* we reserve enough space for largest possible pointer size */
	MALLOC( aiocbpp, user_addr_t *, (nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK );
	if ( aiocbpp == NULL )
		return( NULL );

	/* copyin our aiocb pointers from list */
	result = copyin( aiocblist, aiocbpp,
			 proc_is64bit(procp) ? (nent * sizeof(user64_addr_t))
					     : (nent * sizeof(user32_addr_t)) );
	if ( result != 0 ) {
		FREE( aiocbpp, M_TEMP );
		return( NULL );
	}

	/*
	 * We depend on a list of user_addr_t's so we need to
	 * munge and expand when these pointers came from a
	 * 32-bit process.
	 */
	if ( !proc_is64bit(procp) ) {
		/* copy from last to first to deal with overlap */
		user32_addr_t *my_ptrp = ((user32_addr_t *)aiocbpp) + (nent - 1);
		user_addr_t *my_addrp = aiocbpp + (nent - 1);

		for (i = 0; i < nent; i++, my_ptrp--, my_addrp--) {
			*my_addrp = (user_addr_t) (*my_ptrp);
		}
	}

	return( aiocbpp );
}
static int
aio_copy_in_sigev(proc_t procp, user_addr_t sigp, struct user_sigevent *sigev)
{
	int	result = 0;

	if (sigp == USER_ADDR_NULL)
		return( result );

	/*
	 * We need to munge aio_sigevent since it contains pointers.
	 * Since we do not know if sigev_value is an int or a ptr we do
	 * NOT cast the ptr to a user_addr_t.  This means if we send
	 * this info back to user space we need to remember sigev_value
	 * was not expanded for the 32-bit case.
	 *
	 * Notes: This does NOT affect us since we don't support
	 *	  sigev_value yet in the aio context.
	 */
	if ( proc_is64bit(procp) ) {
		struct user64_sigevent sigevent64;

		result = copyin( sigp, &sigevent64, sizeof(sigevent64) );
		if ( result == 0 ) {
			sigev->sigev_notify = sigevent64.sigev_notify;
			sigev->sigev_signo = sigevent64.sigev_signo;
			sigev->sigev_value.size_equivalent.sival_int = sigevent64.sigev_value.size_equivalent.sival_int;
			sigev->sigev_notify_function = sigevent64.sigev_notify_function;
			sigev->sigev_notify_attributes = sigevent64.sigev_notify_attributes;
		}
	} else {
		struct user32_sigevent sigevent32;

		result = copyin( sigp, &sigevent32, sizeof(sigevent32) );
		if ( result == 0 ) {
			sigev->sigev_notify = sigevent32.sigev_notify;
			sigev->sigev_signo = sigevent32.sigev_signo;
			sigev->sigev_value.size_equivalent.sival_int = sigevent32.sigev_value.sival_int;
			sigev->sigev_notify_function = CAST_USER_ADDR_T(sigevent32.sigev_notify_function);
			sigev->sigev_notify_attributes = CAST_USER_ADDR_T(sigevent32.sigev_notify_attributes);
		}
	}

	if ( result != 0 ) {
	}

	return( result );
}
/*
 * Queue up the entry on the aio asynchronous work queue in priority order
 * based on the relative priority of the request.  We calculate the relative
 * priority using the nice value of the caller and the value
 *
 * Parameters:	procp			Process queueing the I/O
 *		entryp			The work queue entry being queued
 *
 * Returns:	(void)			No failure modes
 *
 * Notes:	This function is used for both lio_listio and aio
 *
 * XXX:		At some point, we may have to consider thread priority
 *		rather than process priority, but we don't maintain the
 *		adjusted priority for threads the POSIX way.
 *
 * Called with proc locked.
 */
static void
aio_enqueue_work( proc_t procp, aio_workq_entry *entryp, int proc_locked)
{
	aio_workq_entry	*my_entryp;	/* used for insertion sort */
	aio_workq_t queue = aio_entry_workq(entryp);

	if (proc_locked == 0) {
		aio_proc_lock(procp);
	}

	ASSERT_AIO_PROC_LOCK_OWNED(procp);

	/* Onto proc queue */
	TAILQ_INSERT_TAIL(&procp->p_aio_activeq, entryp, aio_proc_link);
	procp->p_aio_active_count++;
	procp->p_aio_total_count++;

	/* And work queue */
	aio_workq_lock_spin(queue);
	aio_workq_add_entry_locked(queue, entryp);
	waitq_wakeup64_one(&queue->aioq_waitq, CAST_EVENT64_T(queue),
			   THREAD_AWAKENED, WAITQ_ALL_PRIORITIES );
	aio_workq_unlock(queue);

	if (proc_locked == 0) {
		aio_proc_unlock(procp);
	}
	/*
	 * (1)	The nice value is in the range PRIO_MIN..PRIO_MAX [-20..20]
	 * (2)	The normalized nice value is in the range 0..((2 * NZERO) - 1)
	 *	which is [0..39], with 0 not being used.  In nice values, the
	 *	lower the nice value, the higher the priority.
	 * (3)	The normalized scheduling priority is the highest nice value
	 *	minus the current nice value.  In I/O scheduling priority, the
	 *	higher the value the lower the priority, so it is the inverse
	 *	of the nice value (the higher the number, the higher the I/O
	 *	priority).
	 * (4)	From the normalized scheduling priority, we subtract the
	 *	request priority to get the request priority value number;
	 *	this means that requests are only capable of depressing their
	 *	priority relative to other requests.
	 */
	entryp->priority = (((2 * NZERO) - 1) - procp->p_nice);

	/* only permit depressing the priority */
	if (entryp->aiocb.aio_reqprio < 0)
		entryp->aiocb.aio_reqprio = 0;
	if (entryp->aiocb.aio_reqprio > 0) {
		entryp->priority -= entryp->aiocb.aio_reqprio;
		if (entryp->priority < 0)
			entryp->priority = 0;
	}
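	/*
	 * Worked example of the formula above: NZERO is 20, so a caller whose
	 * normalized nice value is the default 20 starts at priority
	 * ((2 * 20) - 1) - 20 = 19; an aio_reqprio of 5 then depresses that to
	 * 14, and because negative aio_reqprio values are clamped to 0 a
	 * request can never raise itself above its process's base value.
	 */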
	/* Insertion sort the entry; lowest ->priority to highest */
	TAILQ_FOREACH(my_entryp, &aio_anchor.aio_async_workq, aio_workq_link) {
		if ( entryp->priority <= my_entryp->priority) {
			TAILQ_INSERT_BEFORE(my_entryp, entryp, aio_workq_link);
			break;
		}
	}

	if (my_entryp == NULL)
		TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
}
/*
 * lio_listio - initiate a list of IO requests.  We process the list of
 * aiocbs either synchronously (mode == LIO_WAIT) or asynchronously
 * (mode == LIO_NOWAIT).
 *
 * The caller gets error and return status for each aiocb in the list
 * via aio_error and aio_return.  We must keep completed requests until
 * released by the aio_return call.
 */
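/*
 * Illustrative user-space view (not part of this file):
 *
 *	struct aiocb *list[2] = { &read_cb, &write_cb };
 *	lio_listio(LIO_WAIT, list, 2, NULL);         // blocks until both finish
 *	lio_listio(LIO_NOWAIT, list, 2, &sigevent);  // queues and returns; the optional
 *	                                             // sigevent fires after the last one
 *
 * Either way, per-request status still comes from aio_error()/aio_return().
 */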
int
lio_listio(proc_t p, struct lio_listio_args *uap, int *retval )
{
	int			i;
	int			call_result;
	int			result;
	int			old_count;
	aio_workq_entry		**entryp_listp;
	user_addr_t		*aiocbpp;
	struct user_sigevent	aiosigev;
	aio_lio_context		*lio_context;
	boolean_t		free_context = FALSE;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START,
		     (int)p, uap->nent, uap->mode, 0, 0 );

	entryp_listp = NULL;
	lio_context = NULL;
	aiocbpp = NULL;
	call_result = -1;

	if ( !(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT) ) {
		call_result = EINVAL;
	}

	if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
		call_result = EINVAL;
	}

	/*
	 * allocate a list of aio_workq_entry pointers that we will use
	 * to queue up all our requests at once while holding our lock.
	 */
	MALLOC( entryp_listp, void *, (uap->nent * sizeof(aio_workq_entry *)), M_TEMP, M_WAITOK );
	if ( entryp_listp == NULL ) {
		call_result = EAGAIN;
	}

	MALLOC( lio_context, aio_lio_context*, sizeof(aio_lio_context), M_TEMP, M_WAITOK );
	if ( lio_context == NULL ) {
		call_result = EAGAIN;
	}

	OSIncrementAtomic(&lio_contexts_alloced);

	bzero(lio_context, sizeof(aio_lio_context));

	aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent);
	if ( aiocbpp == NULL ) {
		call_result = EAGAIN;
	}

	/*
	 * Use sigevent passed in to lio_listio for each of our calls, but
	 * only do completion notification after the last request completes.
	 */
	bzero(&aiosigev, sizeof(aiosigev));
	/* Only copy in an sigev if the user supplied one */
	if (uap->sigp != USER_ADDR_NULL) {
		call_result = aio_copy_in_sigev(p, uap->sigp, &aiosigev);
	}

	/* process list of aio requests */
	lio_context->io_issued = uap->nent;
	lio_context->io_waiter = uap->mode == LIO_WAIT ? 1 : 0; /* Should it be freed by last AIO */
	for ( i = 0; i < uap->nent; i++ ) {
		user_addr_t my_aiocbp;
		aio_workq_entry		*entryp;

		*(entryp_listp + i) = NULL;
		my_aiocbp = *(aiocbpp + i);

		/* NULL elements are legal so check for 'em */
		if ( my_aiocbp == USER_ADDR_NULL ) {
			aio_proc_lock_spin(p);
			lio_context->io_issued--;
			aio_proc_unlock(p);
			continue;
		}

		/*
		 * We use lio_context to mark IO requests for delayed completion
		 * processing which means we wait until all IO requests in the
		 * group have completed before we either return to the caller
		 * when mode is LIO_WAIT or signal user when mode is LIO_NOWAIT.
		 *
		 * We use the address of the lio_context for this, since it is
		 * unique in the address space.
		 */
		result = lio_create_entry( p, my_aiocbp, lio_context, (entryp_listp + i) );
		if ( result != 0 && call_result == -1 )
			call_result = result;

		/* NULL elements are legal so check for 'em */
		entryp = *(entryp_listp + i);
		if ( entryp == NULL ) {
			aio_proc_lock_spin(p);
			lio_context->io_issued--;
			aio_proc_unlock(p);
			continue;
		}

		if ( uap->mode == LIO_NOWAIT ) {
			/* Set signal handler, if any */
			entryp->aiocb.aio_sigevent = aiosigev;
		} else {
			/* flag that this thread blocks pending completion */
			entryp->flags |= AIO_LIO_NOTIFY;
		}

		/* check our aio limits to throttle bad or rude user land behavior */
		old_count = aio_increment_total_count();

		aio_proc_lock_spin(p);
		if ( old_count >= aio_max_requests ||
		     aio_get_process_count( entryp->procp ) >= aio_max_requests_per_process ||
		     is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {

			lio_context->io_issued--;
			aio_proc_unlock(p);

			aio_decrement_total_count();

			if ( call_result == -1 )
				call_result = EAGAIN;
			aio_free_request(entryp);
			entryp_listp[i] = NULL;
			continue;
		}

		lck_mtx_convert_spin(aio_proc_mutex(p));
		aio_enqueue_work(p, entryp, 1);
		aio_proc_unlock(p);

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
			     (int)p, (int)entryp->uaiocbp, 0, 0, 0 );
	}

	if ( uap->mode == LIO_WAIT ) {
		aio_proc_lock_spin(p);
		while (lio_context->io_completed < lio_context->io_issued) {
			result = msleep(lio_context, aio_proc_mutex(p), PCATCH | PRIBIO | PSPIN, "lio_listio", 0);

			/* If we were interrupted, fail out (even if all finished) */
			if (result != 0) {
				call_result = EINTR;
				lio_context->io_waiter = 0;
				break;
			}
		}

		/* If all IOs have finished must free it */
		if (lio_context->io_completed == lio_context->io_issued) {
			free_context = TRUE;
		}

		aio_proc_unlock(p);
	}

	/* call_result == -1 means we had no trouble queueing up requests */
	if ( call_result == -1 ) {
		call_result = 0;
		*retval = 0;
	}

	if ( entryp_listp != NULL )
		FREE( entryp_listp, M_TEMP );
	if ( aiocbpp != NULL )
		FREE( aiocbpp, M_TEMP );
	if ((lio_context != NULL) && ((lio_context->io_issued == 0) || (free_context == TRUE))) {
		free_lio_context(lio_context);
	}

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END,
		     (int)p, call_result, 0, 0, 0 );

	return( call_result );

} /* lio_listio */
/*
 * aio worker thread.  this is where all the real work gets done.
 * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
 * after new work is queued up.
 */
__attribute__((noreturn))
static void
aio_work_thread(void)
{
	aio_workq_entry		*entryp;
	int			error;
	vm_map_t		currentmap;
	vm_map_t		oldmap = VM_MAP_NULL;
	task_t			oldaiotask = TASK_NULL;
	struct uthread		*uthreadp = NULL;

	for ( ;; ) {
		/*
		 * returns with the entry ref'ed.
		 * sleeps until work is available.
		 */
		entryp = aio_get_some_work();

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START,
			     (int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0 );

		/*
		 * Assume the target's address space identity for the duration
		 * of the IO.  Note: don't need to have the entryp locked,
		 * because the proc and map don't change until it's freed.
		 */
		currentmap = get_task_map( (current_proc())->task );
		if ( currentmap != entryp->aio_map ) {
			uthreadp = (struct uthread *) get_bsdthread_info(current_thread());
			oldaiotask = uthreadp->uu_aio_task;
			uthreadp->uu_aio_task = entryp->procp->task;
			oldmap = vm_map_switch( entryp->aio_map );
		}

		if ( (entryp->flags & AIO_READ) != 0 ) {
			error = do_aio_read( entryp );
		}
		else if ( (entryp->flags & AIO_WRITE) != 0 ) {
			error = do_aio_write( entryp );
		}
		else if ( (entryp->flags & (AIO_FSYNC | AIO_DSYNC)) != 0 ) {
			error = do_aio_fsync( entryp );
		}
		else {
			printf( "%s - unknown aio request - flags 0x%02X \n",
				__FUNCTION__, entryp->flags );
			error = EINVAL;
		}

		/* Restore old map */
		if ( currentmap != entryp->aio_map ) {
			(void) vm_map_switch( oldmap );
			uthreadp->uu_aio_task = oldaiotask;
		}

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END,
			     (int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval,
			     entryp->returnval, 0 );

		aio_entry_lock_spin(entryp);
		entryp->errorval = error;
		aio_entry_unlock(entryp);

		/* we're done with the IO request so pop it off the active queue and */
		/* push it on the done queue */
		aio_proc_lock(entryp->procp);
		aio_proc_move_done_locked(entryp->procp, entryp);
		aio_proc_unlock(entryp->procp);

		OSDecrementAtomic(&aio_anchor.aio_inflight_count);

		/* remove our reference to the user land map. */
		if ( VM_MAP_NULL != entryp->aio_map ) {
			vm_map_t	my_map;

			my_map = entryp->aio_map;
			entryp->aio_map = VM_MAP_NULL;
			vm_map_deallocate( my_map );
		}

		/* Provide notifications */
		do_aio_completion( entryp );

		/* Will free if needed */
		aio_entry_unref(entryp);
	}

} /* aio_work_thread */
/*
 * aio_get_some_work - get the next async IO request that is ready to be executed.
 * aio_fsync complicates matters a bit since we cannot do the fsync until all async
 * IO requests at the time the aio_fsync call came in have completed.
 * NOTE - AIO_LOCK must be held by caller
 */
static aio_workq_entry *
aio_get_some_work( void )
{
	aio_workq_entry		*entryp = NULL;
	aio_workq_t		queue = NULL;

	/* Just one queue for the moment.  In the future there will be many. */
	queue = &aio_anchor.aio_async_workqs[0];
	aio_workq_lock_spin(queue);
	if (queue->aioq_count == 0) {
		goto nowork;
	}

	/*
	 * Hold the queue lock.
	 *
	 * pop some work off the work queue and add to our active queue
	 * Always start with the queue lock held.
	 */

	/*
	 * Pull off of work queue.  Once it's off, it can't be cancelled,
	 * so we can take our ref once we drop the queue lock.
	 */
	entryp = TAILQ_FIRST(&queue->aioq_entries);

	/*
	 * If there's no work or only fsyncs that need delay, go to sleep
	 * and then start anew from aio_work_thread
	 */
	if (entryp == NULL) {
		goto nowork;
	}

	aio_workq_remove_entry_locked(queue, entryp);

	aio_workq_unlock(queue);

	/*
	 * Check if it's an fsync that must be delayed.  No need to lock the entry;
	 * that flag would have been set at initialization.
	 */
	if ( (entryp->flags & AIO_FSYNC) != 0 ) {
		/*
		 * Check for unfinished operations on the same file
		 * in this proc's queue.
		 */
		aio_proc_lock_spin(entryp->procp);
		if ( aio_delay_fsync_request( entryp ) ) {
			/* It needs to be delayed.  Put it back on the end of the work queue */
			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) | DBG_FUNC_NONE,
				     (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

			aio_proc_unlock(entryp->procp);

			aio_workq_lock_spin(queue);
			aio_workq_add_entry_locked(queue, entryp);
		}
		aio_proc_unlock(entryp->procp);
	}

	aio_entry_ref(entryp);

	OSIncrementAtomic(&aio_anchor.aio_inflight_count);
	return( entryp );

nowork:
	/* We will wake up when someone enqueues something */
	waitq_assert_wait64(&queue->aioq_waitq, CAST_EVENT64_T(queue), THREAD_UNINT, 0);
	aio_workq_unlock(queue);
	thread_block( (thread_continue_t)aio_work_thread );
}
/*
 * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed.
 * A big, simple hammer: only send it off if it's the most recently filed IO which has
 * not been completed.
 */
static boolean_t
aio_delay_fsync_request( aio_workq_entry *entryp )
{
	if (entryp == TAILQ_FIRST(&entryp->procp->p_aio_activeq)) {
		return FALSE;
	}

	return TRUE;
} /* aio_delay_fsync_request */
static aio_workq_entry *
aio_create_queue_entry(proc_t procp, user_addr_t aiocbp, void *group_tag, int kindOfIO)
{
	aio_workq_entry	*entryp;
	int		result = 0;

	entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
	if ( entryp == NULL ) {
		return( NULL );
	}

	bzero( entryp, sizeof(*entryp) );

	/* fill in the rest of the aio_workq_entry */
	entryp->procp = procp;
	entryp->uaiocbp = aiocbp;
	entryp->flags |= kindOfIO;
	entryp->group_tag = group_tag;
	entryp->aio_map = VM_MAP_NULL;
	entryp->aio_refcount = 0;

	if ( proc_is64bit(procp) ) {
		struct user64_aiocb aiocb64;

		result = copyin( aiocbp, &aiocb64, sizeof(aiocb64) );
		do_munge_aiocb_user64_to_user(&aiocb64, &entryp->aiocb);
	} else {
		struct user32_aiocb aiocb32;

		result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
		do_munge_aiocb_user32_to_user( &aiocb32, &entryp->aiocb );
	}

	if ( result != 0 ) {
	}

	/* get a reference to the user land map in order to keep it around */
	entryp->aio_map = get_task_map( procp->task );
	vm_map_reference( entryp->aio_map );

	/* do some more validation on the aiocb and embedded file descriptor */
	result = aio_validate( entryp );
	if ( result != 0 )
		goto error_exit_with_ref;

	/* get a reference on the current_thread, which is passed in vfs_context. */
	entryp->thread = current_thread();
	thread_reference( entryp->thread );

	return( entryp );

error_exit_with_ref:
	if ( VM_MAP_NULL != entryp->aio_map ) {
		vm_map_deallocate( entryp->aio_map );
	}
	if ( result && entryp != NULL ) {
		zfree( aio_workq_zonep, entryp );
		entryp = NULL;
	}

	return( entryp );
}
/*
 * aio_queue_async_request - queue up an async IO request on our work queue then
 * wake up one of our worker threads to do the actual work.  We get a reference
 * to our caller's user land map in order to keep it around while we are
 * processing the request.
 */
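/*
 * The fast path below is: bump the global total, build and validate an
 * aio_workq_entry (which also takes the map and thread references), check the
 * per-process and duplicate-aiocb limits under the proc lock, and then hand
 * the entry to aio_enqueue_work(), which wakes a worker thread.
 */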
static int
aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO )
{
	aio_workq_entry	*entryp;
	int		old_count;

	old_count = aio_increment_total_count();
	if (old_count >= aio_max_requests) {
	}

	entryp = aio_create_queue_entry( procp, aiocbp, 0, kindOfIO);
	if ( entryp == NULL ) {
	}

	aio_proc_lock_spin(procp);

	if ( is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
	}

	/* check our aio limits to throttle bad or rude user land behavior */
	if (aio_get_process_count( procp ) >= aio_max_requests_per_process) {
		printf("aio_queue_async_request(): too many in flight for proc: %d.\n", procp->p_aio_total_count);
	}

	/* Add the IO to proc and work queues, wake up threads as appropriate */
	lck_mtx_convert_spin(aio_proc_mutex(procp));
	aio_enqueue_work(procp, entryp, 1);

	aio_proc_unlock(procp);

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
		     (int)procp, (int)aiocbp, 0, 0, 0 );

	return( 0 );

	/*
	 * This entry has not been queued up so no worries about
	 * unlocked state and aio_map
	 */
	aio_proc_unlock(procp);
	aio_free_request(entryp);

	aio_decrement_total_count();

} /* aio_queue_async_request */
/*
 * Allocate an aio_workq_entry and fill it in.  If all goes well return 0
 * and pass the aio_workq_entry pointer back to our caller.
 *
 * Parameters:	procp			The process making the request
 *		aiocbp			The aio context buffer pointer
 *		group_tag		The group tag used to indicate a
 *					group of operations has completed
 *		entrypp			Pointer to the pointer to receive the
 *					address of the created aio_workq_entry
 *
 * Returns:	0			Successfully created
 *		EAGAIN			Try again (usually resource shortage)
 *
 * Notes:	We get a reference to our caller's user land map in order
 *		to keep it around while we are processing the request.
 *
 *		lio_listio calls behave differently at completion: they do
 *		completion notification when all async IO requests have
 *		completed.  We use group_tag to tag IO requests that behave
 *		in the delay notification manner.
 *
 *		All synchronous operations are considered to not have a
 *		signal routine associated with them (sigp == USER_ADDR_NULL).
 */
static int
lio_create_entry(proc_t procp, user_addr_t aiocbp, void *group_tag,
		 aio_workq_entry **entrypp )
{
	aio_workq_entry	*entryp;

	entryp = aio_create_queue_entry( procp, aiocbp, group_tag, AIO_LIO);
	if ( entryp == NULL ) {
	}

	/*
	 * Look for lio_listio LIO_NOP requests and ignore them; this is
	 * not really an error, but we need to free our aio_workq_entry.
	 */
	if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
	}

	*entrypp = entryp;
	return( 0 );

	if ( entryp != NULL ) {
		/*
		 * This entry has not been queued up so no worries about
		 * unlocked state and aio_map
		 */
		aio_free_request(entryp);
	}

} /* lio_create_entry */
/*
 * aio_free_request - remove our reference on the user land map and
 * free the work queue entry resources.  The entry is off all lists
 * and has zero refcount, so no one can have a pointer to it.
 */
static int
aio_free_request(aio_workq_entry *entryp)
{
	/* remove our reference to the user land map. */
	if ( VM_MAP_NULL != entryp->aio_map ) {
		vm_map_deallocate(entryp->aio_map);
	}

	/* remove our reference to thread which enqueued the request */
	if ( NULL != entryp->thread ) {
		thread_deallocate( entryp->thread );
	}

	entryp->aio_refcount = -1; /* A bit of poisoning in case of bad refcounting. */

	zfree( aio_workq_zonep, entryp );

	return( 0 );

} /* aio_free_request */
/*
 * aio_validate - validate the aiocb passed in by one of the aio syscalls.
 */
static int
aio_validate( aio_workq_entry *entryp )
{
	struct fileproc		*fp;
	int			flag;
	int			result;

	result = 0;

	if ( (entryp->flags & AIO_LIO) != 0 ) {
		if ( entryp->aiocb.aio_lio_opcode == LIO_READ )
			entryp->flags |= AIO_READ;
		else if ( entryp->aiocb.aio_lio_opcode == LIO_WRITE )
			entryp->flags |= AIO_WRITE;
		else if ( entryp->aiocb.aio_lio_opcode == LIO_NOP )
			return( 0 );
		else
			return( EINVAL );
	}

	flag = FREAD;
	if ( (entryp->flags & (AIO_WRITE | AIO_FSYNC | AIO_DSYNC)) != 0 ) {
		flag = FWRITE;
	}

	if ( (entryp->flags & (AIO_READ | AIO_WRITE)) != 0 ) {
		if ( entryp->aiocb.aio_nbytes > INT_MAX	     ||
		     entryp->aiocb.aio_buf == USER_ADDR_NULL ||
		     entryp->aiocb.aio_offset < 0 )
			return( EINVAL );
	}

	/*
	 * validate aiocb.aio_sigevent.  at this point we only support
	 * sigev_notify equal to SIGEV_SIGNAL or SIGEV_NONE.  this means
	 * sigev_value, sigev_notify_function, and sigev_notify_attributes
	 * are ignored, since SIGEV_THREAD is unsupported.  This is consistent
	 * with no [RTS] (RealTime Signal) option group support.
	 */
	switch ( entryp->aiocb.aio_sigevent.sigev_notify ) {
	case SIGEV_SIGNAL:
	    {
		int		signum;

		/* make sure we have a valid signal number */
		signum = entryp->aiocb.aio_sigevent.sigev_signo;
		if ( signum <= 0 || signum >= NSIG ||
		     signum == SIGKILL || signum == SIGSTOP )
			return( EINVAL );
	    }
	    break;

	case SIGEV_NONE:
		break;

	case SIGEV_THREAD:
		/* Unsupported [RTS] */

	default:
		return( EINVAL );
	}

	/* validate the file descriptor and that the file was opened
	 * for the appropriate read / write access.
	 */
	proc_fdlock(entryp->procp);

	result = fp_lookup( entryp->procp, entryp->aiocb.aio_fildes, &fp, 1);
	if ( result == 0 ) {
		if ( (fp->f_fglob->fg_flag & flag) == 0 ) {
			/* we don't have read or write access */
			result = EBADF;
		}
		else if ( FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_VNODE ) {
			/* this is not a file */
			result = ESPIPE;
		} else
			fp->f_flags |= FP_AIOISSUED;

		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 1);
	}
	else {
		result = EBADF;
	}

	proc_fdunlock(entryp->procp);

	return( result );

} /* aio_validate */
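/*
 * Illustrative sketch (not part of this file): the only notification styles
 * aio_validate() accepts are SIGEV_NONE and SIGEV_SIGNAL, so a user-space
 * caller that wants a completion signal sets up its aiocb roughly like this
 * (SIGUSR1 is an arbitrary example choice):
 *
 *	#include <aio.h>
 *	#include <signal.h>
 *
 *	struct aiocb cb = { 0 };
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = buffer;
 *	cb.aio_nbytes = sizeof(buffer);
 *	cb.aio_offset = 0;
 *	cb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
 *	cb.aio_sigevent.sigev_signo = SIGUSR1;	// SIGKILL/SIGSTOP are rejected
 *	aio_read(&cb);
 *
 * An aiocb asking for SIGEV_THREAD falls through to the EINVAL default case
 * in the switch above.
 */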
static int
aio_increment_total_count()
{
	return OSIncrementAtomic(&aio_anchor.aio_total_count);
}

static int
aio_decrement_total_count()
{
	int old = OSDecrementAtomic(&aio_anchor.aio_total_count);
	if (old <= 0) {
		panic("Negative total AIO count!\n");
	}

	return old;
}

static int
aio_get_process_count(proc_t procp)
{
	return procp->p_aio_total_count;

} /* aio_get_process_count */

static int
aio_get_all_queues_count( void )
{
	return aio_anchor.aio_total_count;

} /* aio_get_all_queues_count */
/*
 * do_aio_completion.  Handle async IO completion.
 */
static void
do_aio_completion( aio_workq_entry *entryp )
{
	boolean_t		lastLioCompleted = FALSE;
	aio_lio_context		*lio_context = NULL;
	int			waiter = 0;

	lio_context = (aio_lio_context *)entryp->group_tag;

	if (lio_context != NULL) {

		aio_proc_lock_spin(entryp->procp);

		/* Account for this I/O completing. */
		lio_context->io_completed++;

		/* Are we done with this lio context? */
		if (lio_context->io_issued == lio_context->io_completed) {
			lastLioCompleted = TRUE;
		}

		waiter = lio_context->io_waiter;

		/* explicit wakeup of lio_listio() waiting in LIO_WAIT */
		if ((entryp->flags & AIO_LIO_NOTIFY) && (lastLioCompleted) && (waiter != 0)) {
			/* wake up the waiter */
			wakeup(lio_context);
		}

		aio_proc_unlock(entryp->procp);
	}

	if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
	     (entryp->flags & AIO_DISABLE) == 0 ) {

		boolean_t	performSignal = FALSE;
		if (lio_context == NULL) {
			performSignal = TRUE;
		}
		else {
			/*
			 * If this was the last request in the group and a signal
			 * is desired, send one.
			 */
			performSignal = lastLioCompleted;
		}

		if (performSignal) {

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) | DBG_FUNC_NONE,
				      (int)entryp->procp, (int)entryp->uaiocbp,
				      entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 );

			psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );
		}
	}

	if ((entryp->flags & AIO_EXIT_WAIT) && (entryp->flags & AIO_CLOSE_WAIT)) {
		panic("Close and exit flags set at the same time\n");
	}

	/*
	 * need to handle case where a process is trying to exit, exec, or
	 * close and is currently waiting for active aio requests to complete.
	 * If AIO_CLEANUP_WAIT is set then we need to look to see if there are any
	 * other requests in the active queue for this process.  If there are
	 * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel.
	 * If there are some still active then do nothing - we only want to
	 * wakeup when all active aio requests for the process are complete.
	 *
	 * Don't need to lock the entry or proc to check the cleanup flag.  It can only be
	 * set for cancellation, while the entryp is still on a proc list; now it's
	 * off, so that flag is already set if it's going to be.
	 */
	if ( (entryp->flags & AIO_EXIT_WAIT) != 0 ) {
		int		active_requests;

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
			      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

		aio_proc_lock_spin(entryp->procp);
		active_requests = aio_active_requests_for_process( entryp->procp );
		if ( active_requests < 1 ) {
			/*
			 * no active aio requests for this process, continue exiting.  In this
			 * case, there should be no one else waiting on the proc in AIO...
			 */
			wakeup_one((caddr_t)&entryp->procp->AIO_CLEANUP_SLEEP_CHAN);
			aio_proc_unlock(entryp->procp);

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
				      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
		} else {
			aio_proc_unlock(entryp->procp);
		}
	}

	if ( (entryp->flags & AIO_CLOSE_WAIT) != 0 ) {
		int		active_requests;

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
			      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

		aio_proc_lock_spin(entryp->procp);
		active_requests = aio_proc_active_requests_for_file( entryp->procp, entryp->aiocb.aio_fildes);
		if ( active_requests < 1 ) {
			/* Can't wakeup_one(); multiple closes might be in progress. */
			wakeup(&entryp->procp->AIO_CLEANUP_SLEEP_CHAN);
			aio_proc_unlock(entryp->procp);

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
				      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
		} else {
			aio_proc_unlock(entryp->procp);
		}
	}

	/*
	 * A thread in aio_suspend() wants to know about completed IOs.  If it checked
	 * the done list before we moved our AIO there, then it already asserted its wait,
	 * and we can wake it up without holding the lock.  If it checked the list after
	 * we did our move, then it already has seen the AIO that we moved.  Hence, we
	 * can do our wakeup without holding the lock.
	 */
	wakeup( (caddr_t) &entryp->procp->AIO_SUSPEND_SLEEP_CHAN );
	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE,
		      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

	/*
	 * free the LIO context if the last lio completed and no thread is
	 * waiting for it.
	 */
	if (lastLioCompleted && (waiter == 0))
		free_lio_context(lio_context);

} /* do_aio_completion */
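/*
 * Illustrative sketch (an assumption about the exit/close paths elsewhere in
 * this file, with an invented wmesg string): the wakeup on
 * AIO_CLEANUP_SLEEP_CHAN above pairs with a waiter that sleeps until its
 * active request count drains, roughly:
 *
 *	aio_proc_lock(p);
 *	while (aio_active_requests_for_process(p) > 0) {
 *		// woken by the wakeup_one()/wakeup() calls in do_aio_completion()
 *		msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p),
 *		       PRIBIO, "aio_cleanup_wait", NULL);
 *	}
 *	aio_proc_unlock(p);
 *
 * aio_proc_mutex() here stands in for whatever lock aio_proc_lock() wraps.
 */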
/*
 * do_aio_read
 */
static int
do_aio_read( aio_workq_entry *entryp )
{
	struct fileproc		*fp;
	int			error;
	struct vfs_context	context;

	if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp, 0)) )
		return( error );
	if ( (fp->f_fglob->fg_flag & FREAD) == 0 ) {
		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
		return( EBADF );
	}

	context.vc_thread = entryp->thread;	/* XXX */
	context.vc_ucred = fp->f_fglob->fg_cred;

	error = dofileread(&context, fp,
			   entryp->aiocb.aio_buf,
			   entryp->aiocb.aio_nbytes,
			   entryp->aiocb.aio_offset, FOF_OFFSET,
			   &entryp->returnval);
	fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);

	return( error );

} /* do_aio_read */
/*
 * do_aio_write
 */
static int
do_aio_write( aio_workq_entry *entryp )
{
	struct fileproc		*fp;
	int			error;
	int			flags;
	struct vfs_context	context;

	if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp, 0)) )
		return( error );
	if ( (fp->f_fglob->fg_flag & FWRITE) == 0 ) {
		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
		return( EBADF );
	}

	flags = FOF_PCRED;
	if ( (fp->f_fglob->fg_flag & O_APPEND) == 0 ) {
		flags |= FOF_OFFSET;
	}

	context.vc_thread = entryp->thread;	/* XXX */
	context.vc_ucred = fp->f_fglob->fg_cred;

	/* NB: tell dofilewrite the offset, and to use the proc cred */
	error = dofilewrite(&context,
			    fp,
			    entryp->aiocb.aio_buf,
			    entryp->aiocb.aio_nbytes,
			    entryp->aiocb.aio_offset,
			    flags,
			    &entryp->returnval);

	if (entryp->returnval)
		fp_drop_written(entryp->procp, entryp->aiocb.aio_fildes, fp);
	else
		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);

	return( error );

} /* do_aio_write */
/*
 * aio_active_requests_for_process - return number of active async IO
 * requests for the given process.
 */
static int
aio_active_requests_for_process(proc_t procp)
{
	return( procp->p_aio_active_count );

} /* aio_active_requests_for_process */

/*
 * Called with the proc locked.
 */
static int
aio_proc_active_requests_for_file(proc_t procp, int fd)
{
	int count = 0;
	aio_workq_entry *entryp;

	TAILQ_FOREACH(entryp, &procp->p_aio_activeq, aio_proc_link) {
		if (entryp->aiocb.aio_fildes == fd) {
			count++;
		}
	}

	return count;
} /* aio_proc_active_requests_for_file */
/*
 * do_aio_fsync
 */
static int
do_aio_fsync( aio_workq_entry *entryp )
{
	struct vfs_context	context;
	struct vnode		*vp;
	struct fileproc		*fp;
	int			sync_flag;
	int			error;

	/*
	 * We are never called unless either AIO_FSYNC or AIO_DSYNC are set.
	 *
	 * If AIO_DSYNC is set, we can tell the lower layers that it is OK
	 * to mark for update the metadata not strictly necessary for data
	 * retrieval, rather than forcing it to disk.
	 *
	 * If AIO_FSYNC is set, we have to also wait until metadata not really
	 * necessary for data retrieval (e.g. atime, mtime, ctime, etc.) is
	 * committed to stable storage.
	 *
	 * Metadata necessary for data retrieval must be committed to stable
	 * storage in either case (file length, etc.).
	 */
	if (entryp->flags & AIO_FSYNC)
		sync_flag = MNT_WAIT;
	else
		sync_flag = MNT_DWAIT;

	error = fp_getfvp( entryp->procp, entryp->aiocb.aio_fildes, &fp, &vp);
	if ( error == 0 ) {
		if ( (error = vnode_getwithref(vp)) ) {
			fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
			entryp->returnval = -1;
			return( error );
		}
		context.vc_thread = current_thread();
		context.vc_ucred = fp->f_fglob->fg_cred;

		error = VNOP_FSYNC( vp, sync_flag, &context);

		(void)vnode_put(vp);

		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
	}
	if ( error != 0 )
		entryp->returnval = -1;

	return( error );

} /* do_aio_fsync */
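/*
 * Illustrative sketch (not part of this file): from user space the two sync
 * flavors are selected through aio_fsync()'s op argument; this sketch assumes
 * the syscall entry point earlier in this file maps O_SYNC to AIO_FSYNC and
 * O_DSYNC to AIO_DSYNC, as the flag names suggest.
 *
 *	#include <aio.h>
 *	#include <fcntl.h>
 *
 *	struct aiocb cb = { 0 };
 *	cb.aio_fildes = fd;
 *	aio_fsync(O_SYNC, &cb);		// full sync: the MNT_WAIT path above
 *	aio_fsync(O_DSYNC, &cb);	// data sync: the MNT_DWAIT path above
 */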
/*
 * is_already_queued - runs through our queues to see if the given
 * aiocbp / process is there.  Returns TRUE if there is a match
 * on any of our aio queues.
 *
 * Called with proc aio lock held (can be held spin)
 */
static boolean_t
is_already_queued(proc_t procp,
		  user_addr_t aiocbp)
{
	aio_workq_entry		*entryp;
	boolean_t		result;

	result = FALSE;

	/* look for matches on our queue of async IO requests that have completed */
	TAILQ_FOREACH( entryp, &procp->p_aio_doneq, aio_proc_link ) {
		if ( aiocbp == entryp->uaiocbp ) {
			result = TRUE;
			goto ExitThisRoutine;
		}
	}

	/* look for matches on our queue of active async IO requests */
	TAILQ_FOREACH( entryp, &procp->p_aio_activeq, aio_proc_link ) {
		if ( aiocbp == entryp->uaiocbp ) {
			result = TRUE;
			goto ExitThisRoutine;
		}
	}

ExitThisRoutine:
	return( result );

} /* is_already_queued */
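/*
 * Illustrative sketch (an assumption about the submission paths earlier in
 * this file, including the error value shown): callers consult
 * is_already_queued() under the proc's aio lock so the same user aiocb is
 * never in flight twice, along the lines of:
 *
 *	aio_proc_lock(procp);
 *	if (is_already_queued(procp, aiocbp) == TRUE) {
 *		aio_proc_unlock(procp);
 *		return EAGAIN;		// error value is an assumption here
 *	}
 *	// ... allocate an aio_workq_entry and queue it ...
 */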
static void
free_lio_context(aio_lio_context *context)
{
#if DEBUG
	OSDecrementAtomic(&lio_contexts_alloced);
#endif /* DEBUG */

	FREE( context, M_TEMP );

} /* free_lio_context */
/*
 * aio initialization
 */
__private_extern__
void
aio_init( void )
{
	int			i;

	aio_lock_grp_attr = lck_grp_attr_alloc_init();
	aio_proc_lock_grp = lck_grp_alloc_init("aio_proc", aio_lock_grp_attr);
	aio_entry_lock_grp = lck_grp_alloc_init("aio_entry", aio_lock_grp_attr);
	aio_queue_lock_grp = lck_grp_alloc_init("aio_queue", aio_lock_grp_attr);
	aio_lock_attr = lck_attr_alloc_init();

	lck_mtx_init(&aio_entry_mtx, aio_entry_lock_grp, aio_lock_attr);
	lck_mtx_init(&aio_proc_mtx, aio_proc_lock_grp, aio_lock_attr);

	aio_anchor.aio_inflight_count = 0;
	aio_anchor.aio_done_count = 0;
	aio_anchor.aio_total_count = 0;
	aio_anchor.aio_num_workqs = AIO_NUM_WORK_QUEUES;

	for (i = 0; i < AIO_NUM_WORK_QUEUES; i++) {
		aio_workq_init(&aio_anchor.aio_async_workqs[i]);
	}

	i = sizeof( aio_workq_entry );
	aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );

	_aio_create_worker_threads( aio_worker_threads );

} /* aio_init */
/*
 * aio worker threads created here.
 */
__private_extern__
void
_aio_create_worker_threads( int num )
{
	int			i;

	/* create some worker threads to handle the async IO requests */
	for ( i = 0; i < num; i++ ) {
		thread_t	myThread;

		if ( KERN_SUCCESS != kernel_thread_start((thread_continue_t)aio_work_thread, NULL, &myThread) ) {
			printf( "%s - failed to create a work thread \n", __FUNCTION__ );
		}
		else
			thread_deallocate(myThread);
	}

} /* _aio_create_worker_threads */
/*
 * Return the current activation utask
 */
task_t
get_aiotask(void)
{
	return ((struct uthread *)get_bsdthread_info(current_thread()))->uu_aio_task;
}
/*
 * In the case of an aiocb from a
 * 32-bit process we need to expand some longs and pointers to the correct
 * sizes in order to let downstream code always work on the same type of
 * aiocb (in our case that is a user_aiocb).
 */
static void
do_munge_aiocb_user32_to_user( struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
{
	the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
	the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
	the_user_aiocbp->aio_buf = CAST_USER_ADDR_T(my_aiocbp->aio_buf);
	the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
	the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
	the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;

	/* special case here.  since we do not know if sigev_value is an */
	/* int or a ptr we do NOT cast the ptr to a user_addr_t.  This */
	/* means if we send this info back to user space we need to remember */
	/* sigev_value was not expanded for the 32-bit case. */
	/* NOTE - this does NOT affect us since we don't support sigev_value */
	/* yet in the aio context. */
	the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
	the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
	the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
		my_aiocbp->aio_sigevent.sigev_value.sival_int;
	the_user_aiocbp->aio_sigevent.sigev_notify_function =
		CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function);
	the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
		CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_attributes);

} /* do_munge_aiocb_user32_to_user */
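/*
 * Illustrative sketch (the field widths are assumptions about a typical
 * LP64 kernel handling a 32-bit process): the expansion matters mainly for
 * the pointer-valued fields, which are 32-bit values in user32_aiocb but
 * user_addr_t (64-bit) in user_aiocb:
 *
 *	struct user32_aiocb	cb32;	// aio_buf is a 32-bit value
 *	struct user_aiocb	cb;	// aio_buf is a 64-bit user_addr_t
 *
 *	do_munge_aiocb_user32_to_user(&cb32, &cb);
 *	assert(cb.aio_buf == CAST_USER_ADDR_T(cb32.aio_buf));
 *
 * After munging, downstream code only ever looks at user_aiocb fields.
 */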
/* Similar for 64-bit user process, so that we don't need to satisfy
 * the alignment constraints of the original user64_aiocb.
 */
static void
do_munge_aiocb_user64_to_user( struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
{
	the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
	the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
	the_user_aiocbp->aio_buf = my_aiocbp->aio_buf;
	the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
	the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
	the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;

	the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
	the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
	the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
		my_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int;
	the_user_aiocbp->aio_sigevent.sigev_notify_function =
		my_aiocbp->aio_sigevent.sigev_notify_function;
	the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
		my_aiocbp->aio_sigevent.sigev_notify_attributes;