/*
 * Copyright (c) 2003-2016 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * todo:
 *	1) ramesh is looking into how to replace taking a reference on
 *	   the user's map (vm_map_reference()) since it is believed that
 *	   would not hold the process for us.
 *	2) david is looking into a way for us to set the priority of the
 *	   worker threads to match that of the user's thread when the
 *	   async IO was queued.
 */

/*
 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
 */
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file_internal.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/vnode_internal.h>
#include <sys/malloc.h>
#include <sys/mount_internal.h>
#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/sysctl.h>
#include <sys/unistd.h>

#include <sys/aio_kern.h>
#include <sys/sysproto.h>

#include <machine/limits.h>

#include <mach/mach_types.h>
#include <kern/kern_types.h>
#include <kern/waitq.h>
#include <kern/zalloc.h>
#include <kern/task.h>
#include <kern/sched_prim.h>

#include <vm/vm_map.h>

#include <libkern/OSAtomic.h>

#include <sys/kdebug.h>

#define AIO_work_queued			1
#define AIO_worker_wake			2
#define AIO_completion_sig		3
#define AIO_completion_cleanup_wait	4
#define AIO_completion_cleanup_wake	5
#define AIO_completion_suspend_wake	6
#define AIO_fsync_delay			7

#define AIO_cancel_async_workq		11
#define AIO_cancel_sync_workq		12
#define AIO_cancel_activeq		13
#define AIO_cancel_doneq		14

#define AIO_error_val			61
#define AIO_error_activeq		62
#define AIO_error_workq			63

#define AIO_return_val			71
#define AIO_return_activeq		72
#define AIO_return_workq		73

#define AIO_exit_sleep			91

#define AIO_close			100
#define AIO_close_sleep			101

#define AIO_suspend			110
#define AIO_suspend_sleep		111

#define AIO_worker_thread		120

#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
/*
 * aio requests queue up on the aio_async_workq or lio_sync_workq (for
 * lio_listio LIO_WAIT).  Requests then move to the per-process aio_activeq
 * (proc.aio_activeq) when one of our worker threads starts the IO.
 * Finally, requests move to the per-process aio_doneq (proc.aio_doneq)
 * when the IO request completes.  The request remains on aio_doneq until
 * the user process calls aio_return or the process exits; either way, that
 * is our trigger to release aio resources.
 */
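/*
 * Illustrative summary of the lifecycle described above (editorial aid, not
 * additional code):
 *
 *	aio_async_workq / lio_sync_workq     queued, still cancellable
 *	        |    a worker thread picks the request up
 *	        v
 *	proc.aio_activeq                     IO in flight, cannot be cancelled
 *	        |    the IO completes
 *	        v
 *	proc.aio_doneq                       held until aio_return() or exit/exec
 */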
typedef struct aio_workq {
    TAILQ_HEAD(, aio_workq_entry)	aioq_entries;
    int					aioq_count;
    lck_mtx_t				aioq_mtx;
    struct waitq			aioq_waitq;
} *aio_workq_t;

#define AIO_NUM_WORK_QUEUES 1
struct aio_anchor_cb
{
    volatile int32_t	aio_inflight_count;	/* entries that have been taken from a workq */
    volatile int32_t	aio_done_count;		/* entries on all done queues (proc.aio_doneq) */
    volatile int32_t	aio_total_count;	/* total extant entries */

    /* Hash table of queues here */
    struct aio_workq	aio_async_workqs[AIO_NUM_WORK_QUEUES];
};
typedef struct aio_anchor_cb aio_anchor_cb;

struct aio_lio_context
{
    int		io_waiter;
    int		io_issued;
    int		io_completed;
};
typedef struct aio_lio_context aio_lio_context;
/*
 * Notes on aio sleep / wake channels.
 * We currently pick a couple of fields within the proc structure that give us
 * sleep channels that do not collide with any other kernel routines.
 * At this time, for binary compatibility reasons, we cannot create new proc fields.
 */
#define AIO_SUSPEND_SLEEP_CHAN  p_aio_active_count
#define AIO_CLEANUP_SLEEP_CHAN  p_aio_total_count
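/*
 * Hedged sketch (an assumption, mirroring names used later in this file) of
 * how the cleanup channel is used: a thread that must wait for every active
 * AIO sleeps on AIO_CLEANUP_SLEEP_CHAN, and the completion path wakes it once
 * the active count drains.  The wait-message string is hypothetical.
 *
 *	aio_proc_lock(p);
 *	while (p->p_aio_active_count != 0) {
 *		msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p),
 *		       PRIBIO, "aio_wait_all", 0);
 *	}
 *	aio_proc_unlock(p);
 */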
#define ASSERT_AIO_FROM_PROC(aiop, theproc) \
    if ((aiop)->procp != (theproc)) { \
	panic("AIO on a proc list that does not belong to that proc.\n"); \
    }
static void		aio_proc_lock(proc_t procp);
static void		aio_proc_lock_spin(proc_t procp);
static void		aio_proc_unlock(proc_t procp);
static lck_mtx_t*	aio_proc_mutex(proc_t procp);
static void		aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp);
static void		aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp);
static int		aio_get_process_count(proc_t procp);
static int		aio_active_requests_for_process(proc_t procp);
static int		aio_proc_active_requests_for_file(proc_t procp, int fd);
static boolean_t	is_already_queued(proc_t procp, user_addr_t aiocbp);
static boolean_t	should_cancel(aio_workq_entry *entryp, user_addr_t aiocbp, int fd);

static void		aio_entry_lock(aio_workq_entry *entryp);
static void		aio_entry_lock_spin(aio_workq_entry *entryp);
static aio_workq_t	aio_entry_workq(aio_workq_entry *entryp);
static lck_mtx_t*	aio_entry_mutex(__unused aio_workq_entry *entryp);
static void		aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
static void		aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
static void		aio_entry_ref_locked(aio_workq_entry *entryp);
static void		aio_entry_unref_locked(aio_workq_entry *entryp);
static void		aio_entry_ref(aio_workq_entry *entryp);
static void		aio_entry_unref(aio_workq_entry *entryp);
static void		aio_entry_update_for_cancel(aio_workq_entry *entryp, boolean_t cancelled,
				int wait_for_completion, boolean_t disable_notification);
static int		aio_entry_try_workq_remove(aio_workq_entry *entryp);
static boolean_t	aio_delay_fsync_request( aio_workq_entry *entryp );
static int		aio_free_request(aio_workq_entry *entryp);

static void		aio_workq_init(aio_workq_t wq);
static void		aio_workq_lock_spin(aio_workq_t wq);
static void		aio_workq_unlock(aio_workq_t wq);
static lck_mtx_t*	aio_workq_mutex(aio_workq_t wq);

static void		aio_work_thread( void );
static aio_workq_entry	*aio_get_some_work( void );

static int		aio_get_all_queues_count( void );
static int		aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO );
static int		aio_validate( aio_workq_entry *entryp );
static int		aio_increment_total_count(void);
static int		aio_decrement_total_count(void);

static int		do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp, int wait_for_completion, boolean_t disable_notification);
static void		do_aio_completion( aio_workq_entry *entryp );
static int		do_aio_fsync( aio_workq_entry *entryp );
static int		do_aio_read( aio_workq_entry *entryp );
static int		do_aio_write( aio_workq_entry *entryp );
static void		do_munge_aiocb_user32_to_user( struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
static void		do_munge_aiocb_user64_to_user( struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
static int		lio_create_entry(proc_t procp,
					 user_addr_t aiocbp,
					 void *group_tag,
					 aio_workq_entry **entrypp );
static aio_workq_entry	*aio_create_queue_entry(proc_t procp,
					user_addr_t aiocbp,
					void *group_tag,
					int kindOfIO);
static user_addr_t	*aio_copy_in_list(proc_t procp, user_addr_t aiocblist, int nent);
static void		free_lio_context(aio_lio_context *context);
static void		aio_enqueue_work( proc_t procp, aio_workq_entry *entryp, int proc_locked);

#define ASSERT_AIO_PROC_LOCK_OWNED(p)	lck_mtx_assert(aio_proc_mutex((p)), LCK_MTX_ASSERT_OWNED)
#define ASSERT_AIO_WORKQ_LOCK_OWNED(q)	lck_mtx_assert(aio_workq_mutex((q)), LCK_MTX_ASSERT_OWNED)
#define ASSERT_AIO_ENTRY_LOCK_OWNED(e)	lck_mtx_assert(aio_entry_mutex((e)), LCK_MTX_ASSERT_OWNED)

/*
 * EXTERNAL PROTOTYPES
 */

/* in ...bsd/kern/sys_generic.c */
extern int dofileread(vfs_context_t ctx, struct fileproc *fp,
		      user_addr_t bufp, user_size_t nbyte,
		      off_t offset, int flags, user_ssize_t *retval);
extern int dofilewrite(vfs_context_t ctx, struct fileproc *fp,
		       user_addr_t bufp, user_size_t nbyte, off_t offset,
		       int flags, user_ssize_t *retval);

static uint32_t lio_contexts_alloced = 0;

/*
 * aio external global variables.
 */
extern int aio_max_requests;			/* AIO_MAX - configurable */
extern int aio_max_requests_per_process;	/* AIO_PROCESS_MAX - configurable */
extern int aio_worker_threads;			/* AIO_THREAD_COUNT - configurable */

/*
 * aio static variables.
 */
static aio_anchor_cb	aio_anchor;
static lck_grp_t	*aio_proc_lock_grp;
static lck_grp_t	*aio_entry_lock_grp;
static lck_grp_t	*aio_queue_lock_grp;
static lck_attr_t	*aio_lock_attr;
static lck_grp_attr_t	*aio_lock_grp_attr;
static struct zone	*aio_workq_zonep;
static lck_mtx_t	aio_entry_mtx;
static lck_mtx_t	aio_proc_mtx;
static void
aio_entry_lock(__unused aio_workq_entry *entryp)
{
    lck_mtx_lock(&aio_entry_mtx);
}

static void
aio_entry_lock_spin(__unused aio_workq_entry *entryp)
{
    lck_mtx_lock_spin(&aio_entry_mtx);
}

static void
aio_entry_unlock(__unused aio_workq_entry *entryp)
{
    lck_mtx_unlock(&aio_entry_mtx);
}

static aio_workq_t
aio_entry_workq(__unused aio_workq_entry *entryp)
{
    return &aio_anchor.aio_async_workqs[0];
}

static lck_mtx_t*
aio_entry_mutex(__unused aio_workq_entry *entryp)
{
    return &aio_entry_mtx;
}

static void
aio_workq_init(aio_workq_t wq)
{
    TAILQ_INIT(&wq->aioq_entries);
    lck_mtx_init(&wq->aioq_mtx, aio_queue_lock_grp, aio_lock_attr);
    waitq_init(&wq->aioq_waitq, SYNC_POLICY_FIFO);
}

/*
 * Can be passed a queue which is locked spin.
 */
static void
aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
{
    ASSERT_AIO_WORKQ_LOCK_OWNED(queue);

    if (entryp->aio_workq_link.tqe_prev == NULL) {
	panic("Trying to remove an entry from a work queue, but it is not on a queue\n");
    }

    TAILQ_REMOVE(&queue->aioq_entries, entryp, aio_workq_link);
    entryp->aio_workq_link.tqe_prev = NULL; /* Not on a workq */

    if (queue->aioq_count < 0) {
	panic("Negative count on a queue.\n");
    }
}

static void
aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
{
    ASSERT_AIO_WORKQ_LOCK_OWNED(queue);

    TAILQ_INSERT_TAIL(&queue->aioq_entries, entryp, aio_workq_link);
    if (queue->aioq_count < 0) {
	panic("Negative count on a queue.\n");
    }
}

static void
aio_proc_lock(proc_t procp)
{
    lck_mtx_lock(aio_proc_mutex(procp));
}

static void
aio_proc_lock_spin(proc_t procp)
{
    lck_mtx_lock_spin(aio_proc_mutex(procp));
}
static void
aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp)
{
    ASSERT_AIO_PROC_LOCK_OWNED(procp);

    TAILQ_REMOVE(&procp->p_aio_activeq, entryp, aio_proc_link);
    TAILQ_INSERT_TAIL(&procp->p_aio_doneq, entryp, aio_proc_link);
    procp->p_aio_active_count--;
    OSIncrementAtomic(&aio_anchor.aio_done_count);
}

static void
aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp)
{
    TAILQ_REMOVE(&procp->p_aio_doneq, entryp, aio_proc_link);
    OSDecrementAtomic(&aio_anchor.aio_done_count);
    aio_decrement_total_count();
    procp->p_aio_total_count--;
}

static void
aio_proc_unlock(proc_t procp)
{
    lck_mtx_unlock(aio_proc_mutex(procp));
}

static lck_mtx_t*
aio_proc_mutex(proc_t procp)
{
    return &procp->p_mlock;
}

static void
aio_entry_ref_locked(aio_workq_entry *entryp)
{
    ASSERT_AIO_ENTRY_LOCK_OWNED(entryp);

    if (entryp->aio_refcount < 0) {
	panic("AIO workq entry with a negative refcount.\n");
    }
    entryp->aio_refcount++;
}

/* Return 1 if you've freed it */
static void
aio_entry_unref_locked(aio_workq_entry *entryp)
{
    ASSERT_AIO_ENTRY_LOCK_OWNED(entryp);

    entryp->aio_refcount--;
    if (entryp->aio_refcount < 0) {
	panic("AIO workq entry with a negative refcount.\n");
    }
}

static void
aio_entry_ref(aio_workq_entry *entryp)
{
    aio_entry_lock_spin(entryp);
    aio_entry_ref_locked(entryp);
    aio_entry_unlock(entryp);
}
static void
aio_entry_unref(aio_workq_entry *entryp)
{
    aio_entry_lock_spin(entryp);
    aio_entry_unref_locked(entryp);

    if ((entryp->aio_refcount == 0) && ((entryp->flags & AIO_DO_FREE) != 0)) {
	aio_entry_unlock(entryp);
	aio_free_request(entryp);
    } else {
	aio_entry_unlock(entryp);
    }
}

static void
aio_entry_update_for_cancel(aio_workq_entry *entryp, boolean_t cancelled, int wait_for_completion, boolean_t disable_notification)
{
    aio_entry_lock_spin(entryp);

    if (cancelled) {
	aio_entry_ref_locked(entryp);
	entryp->errorval = ECANCELED;
	entryp->returnval = -1;
    }

    if ( wait_for_completion ) {
	entryp->flags |= wait_for_completion; /* flag for special completion processing */
    }

    if ( disable_notification ) {
	entryp->flags |= AIO_DISABLE; /* Don't want a signal */
    }

    aio_entry_unlock(entryp);
}

static int
aio_entry_try_workq_remove(aio_workq_entry *entryp)
{
    /* Can only be cancelled if it's still on a work queue */
    if (entryp->aio_workq_link.tqe_prev != NULL) {
	aio_workq_t queue;

	/* Will have to check again under the lock */
	queue = aio_entry_workq(entryp);
	aio_workq_lock_spin(queue);
	if (entryp->aio_workq_link.tqe_prev != NULL) {
	    aio_workq_remove_entry_locked(queue, entryp);
	    aio_workq_unlock(queue);
	    return 1;
	} else {
	    aio_workq_unlock(queue);
	}
    }

    return 0;
}

static void
aio_workq_lock_spin(aio_workq_t wq)
{
    lck_mtx_lock_spin(aio_workq_mutex(wq));
}

static void
aio_workq_unlock(aio_workq_t wq)
{
    lck_mtx_unlock(aio_workq_mutex(wq));
}

static lck_mtx_t*
aio_workq_mutex(aio_workq_t wq)
{
    return &wq->aioq_mtx;
}
/*
 * aio_cancel - attempt to cancel one or more async IO requests currently
 * outstanding against file descriptor uap->fd.  If uap->aiocbp is not
 * NULL then only one specific IO is cancelled (if possible).  If uap->aiocbp
 * is NULL then all outstanding async IO requests for the given file
 * descriptor are cancelled (if possible).
 */
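/*
 * Illustrative user-space caller (not part of this file), showing the three
 * result values this syscall can hand back:
 *
 *	switch (aio_cancel(fd, NULL)) {
 *	case AIO_CANCELED:	// every outstanding request on fd was cancelled
 *	case AIO_NOTCANCELED:	// at least one is still in flight; poll with aio_error()
 *	case AIO_ALLDONE:	// nothing was outstanding
 *	}
 */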
aio_cancel(proc_t p, struct aio_cancel_args *uap, int *retval )
    struct user_aiocb		my_aiocb;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START,
		  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    /* quick check to see if there are any async IO requests queued up */
    if (aio_get_all_queues_count() < 1) {
	*retval = AIO_ALLDONE;

    if ( uap->aiocbp != USER_ADDR_NULL ) {
	if ( proc_is64bit(p) ) {
	    struct user64_aiocb aiocb64;

	    result = copyin( uap->aiocbp, &aiocb64, sizeof(aiocb64) );
	    do_munge_aiocb_user64_to_user(&aiocb64, &my_aiocb);

	    struct user32_aiocb aiocb32;

	    result = copyin( uap->aiocbp, &aiocb32, sizeof(aiocb32) );
	    do_munge_aiocb_user32_to_user( &aiocb32, &my_aiocb );

	/* NOTE - POSIX standard says a mismatch between the file */
	/* descriptor passed in and the file descriptor embedded in */
	/* the aiocb causes unspecified results.  We return EBADF in */
	/* that situation. */
	if ( uap->fd != my_aiocb.aio_fildes ) {

    result = do_aio_cancel_locked( p, uap->fd, uap->aiocbp, 0, FALSE );
    ASSERT_AIO_PROC_LOCK_OWNED(p);

    if ( result != -1 ) {

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END,
		  (int)p, (int)uap->aiocbp, result, 0, 0 );

/*
 * _aio_close - internal function used to clean up async IO requests for
 * a file descriptor that is closing.
 */
__private_extern__ void
_aio_close(proc_t p, int fd )
    /* quick check to see if there are any async IO requests queued up */
    if (aio_get_all_queues_count() < 1) {

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_START,
		  (int)p, fd, 0, 0, 0 );

    /* cancel all async IO requests on our todo queues for this file descriptor */
    error = do_aio_cancel_locked( p, fd, 0, AIO_CLOSE_WAIT, FALSE );
    ASSERT_AIO_PROC_LOCK_OWNED(p);
    if ( error == AIO_NOTCANCELED ) {
	/*
	 * AIO_NOTCANCELED is returned when we find an aio request for this process
	 * and file descriptor on the active async IO queue.  Active requests cannot
	 * be cancelled so we must wait for them to complete.  We will get a special
	 * wake up call on our channel used to sleep for ALL active requests to
	 * complete.  This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
	 * when we must wait for all active aio requests.
	 */
	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) | DBG_FUNC_NONE,
		      (int)p, fd, 0, 0, 0 );

	while (aio_proc_active_requests_for_file(p, fd) > 0) {
	    msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_close", 0 );
	}
    }

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_END,
		  (int)p, fd, 0, 0, 0 );
/*
 * aio_error - return the error status associated with the async IO
 * request referred to by uap->aiocbp.  The error status is the errno
 * value that would be set by the corresponding IO request (read, write,
 * fdatasync, or sync).
 */
aio_error(proc_t p, struct aio_error_args *uap, int *retval )
    aio_workq_entry		*entryp;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_START,
		  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    /* see if there are any aios to check */
    if (aio_get_all_queues_count() < 1) {

    /* look for a match on our queue of async IO requests that have completed */
    TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
	if ( entryp->uaiocbp == uap->aiocbp ) {
	    ASSERT_AIO_FROM_PROC(entryp, p);

	    aio_entry_lock_spin(entryp);
	    *retval = entryp->errorval;
	    aio_entry_unlock(entryp);

	    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) | DBG_FUNC_NONE,
			  (int)p, (int)uap->aiocbp, *retval, 0, 0 );

    /* look for a match on our queue of active async IO requests */
    TAILQ_FOREACH( entryp, &p->p_aio_activeq, aio_proc_link) {
	if ( entryp->uaiocbp == uap->aiocbp ) {
	    ASSERT_AIO_FROM_PROC(entryp, p);
	    *retval = EINPROGRESS;

	    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) | DBG_FUNC_NONE,
			  (int)p, (int)uap->aiocbp, *retval, 0, 0 );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_END,
		  (int)p, (int)uap->aiocbp, error, 0, 0 );
/*
 * aio_fsync - asynchronously force all IO operations associated
 * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
 * queued at the time of the call to the synchronized completion state.
 * NOTE - we do not support op O_DSYNC at this point since we do not support the
 * fdatasync() call.
 */
aio_fsync(proc_t p, struct aio_fsync_args *uap, int *retval )
    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_START,
		  (int)p, (int)uap->aiocbp, uap->op, 0, 0 );

    /* 0 := O_SYNC for binary backward compatibility with Panther */
    if (uap->op == O_SYNC || uap->op == 0)
	fsync_kind = AIO_FSYNC;
    else if ( uap->op == O_DSYNC )
	fsync_kind = AIO_DSYNC;

    error = aio_queue_async_request( p, uap->aiocbp, fsync_kind );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_END,
		  (int)p, (int)uap->aiocbp, error, 0, 0 );
/*
 * aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
 * file descriptor (uap->aiocbp->aio_fildes) into the buffer
 * (uap->aiocbp->aio_buf).
 */
aio_read(proc_t p, struct aio_read_args *uap, int *retval )
    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_START,
		  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    error = aio_queue_async_request( p, uap->aiocbp, AIO_READ );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_END,
		  (int)p, (int)uap->aiocbp, error, 0, 0 );
/*
 * aio_return - return the return status associated with the async IO
 * request referred to by uap->aiocbp.  The return status is the value
 * that would be returned by the corresponding IO request (read, write,
 * fdatasync, or sync).  This is where we release kernel resources
 * held for the async IO call associated with the given aiocb pointer.
 */
aio_return(proc_t p, struct aio_return_args *uap, user_ssize_t *retval )
    aio_workq_entry		*entryp;
    boolean_t			proc_lock_held = FALSE;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_START,
		  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    /* See if there are any entries to check */
    if (aio_get_all_queues_count() < 1) {

    proc_lock_held = TRUE;

    /* look for a match on our queue of async IO requests that have completed */
    TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
	ASSERT_AIO_FROM_PROC(entryp, p);
	if ( entryp->uaiocbp == uap->aiocbp ) {
	    /* Done and valid for aio_return(), pull it off the list */
	    aio_proc_remove_done_locked(p, entryp);

	    /* Drop the proc lock, but keep the entry locked */
	    aio_entry_lock(entryp);
	    proc_lock_held = FALSE;

	    *retval = entryp->returnval;

	    /* No references and off all lists, safe to free */
	    if (entryp->aio_refcount == 0) {
		aio_entry_unlock(entryp);
		aio_free_request(entryp);

	    /* Whoever has the refcount will have to free it */
	    entryp->flags |= AIO_DO_FREE;
	    aio_entry_unlock(entryp);

	    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) | DBG_FUNC_NONE,
			  (int)p, (int)uap->aiocbp, *retval, 0, 0 );

    /* look for a match on our queue of active async IO requests */
    TAILQ_FOREACH( entryp, &p->p_aio_activeq, aio_proc_link) {
	ASSERT_AIO_FROM_PROC(entryp, p);
	if ( entryp->uaiocbp == uap->aiocbp ) {
	    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) | DBG_FUNC_NONE,
			  (int)p, (int)uap->aiocbp, *retval, 0, 0 );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_END,
		  (int)p, (int)uap->aiocbp, error, 0, 0 );

/*
 * _aio_exec - internal function used to clean up async IO requests for
 * a process that is going away due to exec().  We cancel any async IOs
 * we can and wait for those already active.  We also disable signaling
 * for cancelled or active aio requests that complete.
 * This routine MAY block!
 */
__private_extern__ void
_aio_exec(proc_t p )
    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_START,
		  (int)p, 0, 0, 0, 0 );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_END,
		  (int)p, 0, 0, 0, 0 );
/*
 * _aio_exit - internal function used to clean up async IO requests for
 * a process that is terminating (via exit() or exec()).  We cancel any async IOs
 * we can and wait for those already active.  We also disable signaling
 * for cancelled or active aio requests that complete.  This routine MAY block!
 */
__private_extern__ void
_aio_exit(proc_t p )
    aio_workq_entry		*entryp;

    /* quick check to see if there are any async IO requests queued up */
    if (aio_get_all_queues_count() < 1) {

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_START,
		  (int)p, 0, 0, 0, 0 );

    /*
     * cancel async IO requests on the todo work queue and wait for those
     * already active to complete.
     */
    error = do_aio_cancel_locked( p, 0, 0, AIO_EXIT_WAIT, TRUE );
    ASSERT_AIO_PROC_LOCK_OWNED(p);
    if ( error == AIO_NOTCANCELED ) {
	/*
	 * AIO_NOTCANCELED is returned when we find an aio request for this process
	 * on the active async IO queue.  Active requests cannot be cancelled so we
	 * must wait for them to complete.  We will get a special wake up call on
	 * our channel used to sleep for ALL active requests to complete.  This sleep
	 * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
	 * active aio requests.
	 */
	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) | DBG_FUNC_NONE,
		      (int)p, 0, 0, 0, 0 );

	while (p->p_aio_active_count != 0) {
	    msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_exit", 0 );
	}
    }

    if (p->p_aio_active_count != 0) {
	panic("Exiting process has %d active AIOs after cancellation has completed.\n", p->p_aio_active_count);
    }

    /* release all aio resources used by this process */
    entryp = TAILQ_FIRST( &p->p_aio_doneq );
    while ( entryp != NULL ) {
	ASSERT_AIO_FROM_PROC(entryp, p);
	aio_workq_entry		*next_entryp;

	next_entryp = TAILQ_NEXT( entryp, aio_proc_link);
	aio_proc_remove_done_locked(p, entryp);

	/* we cannot free requests that are still completing */
	aio_entry_lock_spin(entryp);
	if (entryp->aio_refcount == 0) {
	    aio_entry_unlock(entryp);
	    aio_free_request(entryp);

	    /* need to start over since aio_doneq may have been */
	    /* changed while we were away.  */
	    entryp = TAILQ_FIRST( &p->p_aio_doneq );

	/* whoever has the reference will have to do the free */
	entryp->flags |= AIO_DO_FREE;

	aio_entry_unlock(entryp);
	entryp = next_entryp;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END,
		  (int)p, 0, 0, 0, 0 );
should_cancel(aio_workq_entry *entryp, user_addr_t aiocbp, int fd)
    if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
	 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
	 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
/*
 * do_aio_cancel_locked - cancel async IO requests (if possible).  We get called by
 * aio_cancel, close, and at exit.
 * There are three modes of operation: 1) cancel all async IOs for a process -
 * fd is 0 and aiocbp is NULL 2) cancel all async IOs for file descriptor - fd
 * is > 0 and aiocbp is NULL 3) cancel one async IO associated with the given
 * aiocbp.
 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
 * target async IO requests, and AIO_ALLDONE if all target async IO requests
 * were already complete.
 * WARNING - do not dereference aiocbp in this routine, it may point to user
 * land data that has not been copied in (when called from aio_cancel()).
 *
 * Called with proc locked, and returns the same way.
 */
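/*
 * Mode summary (restating the comment above):
 *
 *	fd == 0  && aiocbp == NULL	cancel every AIO for the process
 *	fd  > 0  && aiocbp == NULL	cancel every AIO on that file descriptor
 *	aiocbp != NULL			cancel only the request matching aiocbp
 */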
do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp,
		     int wait_for_completion, boolean_t disable_notification)
    ASSERT_AIO_PROC_LOCK_OWNED(p);

    aio_workq_entry		*entryp;

    /* look for a match on our queue of async todo work. */
    entryp = TAILQ_FIRST(&p->p_aio_activeq);
    while ( entryp != NULL ) {
	ASSERT_AIO_FROM_PROC(entryp, p);
	aio_workq_entry		*next_entryp;

	next_entryp = TAILQ_NEXT( entryp, aio_proc_link);
	if (!should_cancel(entryp, aiocbp, fd)) {
	    entryp = next_entryp;

	/* Can only be cancelled if it's still on a work queue */
	if (aio_entry_try_workq_remove(entryp) != 0) {
	    /* Have removed from workq. Update entry state and take a ref */
	    aio_entry_update_for_cancel(entryp, TRUE, 0, disable_notification);

	    /* Put on the proc done queue and update counts, then unlock the proc */
	    aio_proc_move_done_locked(p, entryp);

	    /* Now it's officially cancelled.  Do the completion */
	    result = AIO_CANCELED;
	    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) | DBG_FUNC_NONE,
			  (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
	    do_aio_completion(entryp);

	    /* This will free if the aio_return() has already happened ... */
	    aio_entry_unref(entryp);

	    if ( aiocbp != USER_ADDR_NULL ) {

	    /*
	     * Restart from the head of the proc active queue since it
	     * may have been changed while we were away doing completion
	     * processing.
	     *
	     * Note that if we found an uncancellable AIO before, we will
	     * either find it again or discover that it's been completed,
	     * so resetting the result will not cause us to return success
	     * despite outstanding AIOs.
	     */
	    entryp = TAILQ_FIRST(&p->p_aio_activeq);
	    result = -1;	/* As if beginning anew */

	    /*
	     * It's been taken off the active queue already, i.e. is in flight.
	     * All we can do is ask for notification.
	     */
	    result = AIO_NOTCANCELED;

	    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE,
			  (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

	    /* Mark for waiting and such; will not take a ref if "cancelled" arg is FALSE */
	    aio_entry_update_for_cancel(entryp, FALSE, wait_for_completion, disable_notification);

	    if ( aiocbp != USER_ADDR_NULL ) {

	    entryp = next_entryp;

    /*
     * if we didn't find any matches on the todo or active queues then look for a
     * match on our queue of async IO requests that have completed and if found
     * return AIO_ALLDONE result.
     *
     * Proc AIO lock is still held.
     */
    if ( result == -1 ) {
	TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
	    ASSERT_AIO_FROM_PROC(entryp, p);
	    if (should_cancel(entryp, aiocbp, fd)) {
		result = AIO_ALLDONE;
		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE,
			      (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

		if ( aiocbp != USER_ADDR_NULL ) {

/* do_aio_cancel_locked */
/*
 * aio_suspend - suspend the calling thread until at least one of the async
 * IO operations referenced by uap->aiocblist has completed, until a signal
 * interrupts the function, or uap->timeoutp time interval (optional) has
 * elapsed.
 * Returns 0 if one or more async IOs have completed, else -1 and errno is
 * set appropriately - EAGAIN if timeout elapses or EINTR if an interrupt
 * occurs.
 */
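/*
 * Illustrative user-space usage (not part of this file): wait up to one
 * second for a previously queued request to finish.
 *
 *	const struct aiocb *list[1] = { &cb };
 *	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 *	if (aio_suspend(list, 1, &ts) == -1) {
 *		if (errno == EAGAIN) { }	// timeout elapsed
 *		else if (errno == EINTR) { }	// interrupted by a signal
 *	}
 */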
int
aio_suspend(proc_t p, struct aio_suspend_args *uap, int *retval )
{
    __pthread_testcancel(1);
    return(aio_suspend_nocancel(p, (struct aio_suspend_nocancel_args *)uap, retval));
}
aio_suspend_nocancel(proc_t p, struct aio_suspend_nocancel_args *uap, int *retval )
    struct user_timespec	ts;
    aio_workq_entry		*entryp;
    user_addr_t			*aiocbpp;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START,
		  (int)p, uap->nent, 0, 0, 0 );

    count = aio_get_all_queues_count( );
	goto ExitThisRoutine;

    if ( uap->nent < 1 || uap->nent > aio_max_requests_per_process ) {
	goto ExitThisRoutine;

    if ( uap->timeoutp != USER_ADDR_NULL ) {
	if ( proc_is64bit(p) ) {
	    struct user64_timespec temp;
	    error = copyin( uap->timeoutp, &temp, sizeof(temp) );
	    ts.tv_sec = temp.tv_sec;
	    ts.tv_nsec = temp.tv_nsec;

	    struct user32_timespec temp;
	    error = copyin( uap->timeoutp, &temp, sizeof(temp) );
	    ts.tv_sec = temp.tv_sec;
	    ts.tv_nsec = temp.tv_nsec;

	    goto ExitThisRoutine;

	if ( ts.tv_sec < 0 || ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000 ) {
	    goto ExitThisRoutine;

	nanoseconds_to_absolutetime( (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
				     &abstime );
	clock_absolutetime_interval_to_deadline( abstime, &abstime );

    aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent);
    if ( aiocbpp == NULL ) {
	goto ExitThisRoutine;

    /* check list of aio requests to see if any have completed */
check_for_our_aiocbp:
    aio_proc_lock_spin(p);
    for ( i = 0; i < uap->nent; i++ ) {
	/* NULL elements are legal so check for 'em */
	aiocbp = *(aiocbpp + i);
	if ( aiocbp == USER_ADDR_NULL )

	/* return immediately if any aio request in the list is done */
	TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
	    ASSERT_AIO_FROM_PROC(entryp, p);
	    if ( entryp->uaiocbp == aiocbp ) {
		goto ExitThisRoutine;
    } /* for ( ; i < uap->nent; ) */

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE,
		  (int)p, uap->nent, 0, 0, 0 );

    /*
     * wait for an async IO to complete or a signal fires or timeout expires.
     * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
     * interrupts us.  If an async IO completes before a signal fires or our
     * timeout expires, we get a wakeup call from aio_work_thread().
     */

    error = msleep1(&p->AIO_SUSPEND_SLEEP_CHAN, aio_proc_mutex(p), PCATCH | PWAIT | PDROP, "aio_suspend", abstime); /* XXX better priority? */

	/*
	 * got our wakeup call from aio_work_thread().
	 * Since we can get a wakeup on this channel from another thread in the
	 * same process we head back up to make sure this is for the correct aiocbp.
	 * If it is the correct aiocbp we will return from where we do the check
	 * (see entryp->uaiocbp == aiocbp after check_for_our_aiocbp label)
	 * else we will fall out and just sleep again.
	 */
	goto check_for_our_aiocbp;

    else if ( error == EWOULDBLOCK ) {
	/* our timeout expired */

	/* we were interrupted */

    if ( aiocbpp != NULL )
	FREE( aiocbpp, M_TEMP );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_END,
		  (int)p, uap->nent, error, 0, 0 );
/*
 * aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
 * file descriptor (uap->aiocbp->aio_fildes) from the buffer
 * (uap->aiocbp->aio_buf).
 */
aio_write(proc_t p, struct aio_write_args *uap, int *retval )
    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_START,
		  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    error = aio_queue_async_request( p, uap->aiocbp, AIO_WRITE );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_END,
		  (int)p, (int)uap->aiocbp, error, 0, 0 );
static user_addr_t *
aio_copy_in_list(proc_t procp, user_addr_t aiocblist, int nent)
    user_addr_t	*aiocbpp;

    /* we reserve enough space for largest possible pointer size */
    MALLOC( aiocbpp, user_addr_t *, (nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK );
    if ( aiocbpp == NULL )

    /* copyin our aiocb pointers from list */
    result = copyin( aiocblist, aiocbpp,
		     proc_is64bit(procp) ? (nent * sizeof(user64_addr_t))
					 : (nent * sizeof(user32_addr_t)) );
	FREE( aiocbpp, M_TEMP );

    /*
     * We depend on a list of user_addr_t's so we need to
     * munge and expand when these pointers came from a
     * 32-bit process.
     */
    if ( !proc_is64bit(procp) ) {
	/* copy from last to first to deal with overlap */
	user32_addr_t	*my_ptrp = ((user32_addr_t *)aiocbpp) + (nent - 1);
	user_addr_t	*my_addrp = aiocbpp + (nent - 1);

	for (i = 0; i < nent; i++, my_ptrp--, my_addrp--) {
	    *my_addrp = (user_addr_t) (*my_ptrp);
	}
    }
aio_copy_in_sigev(proc_t procp, user_addr_t sigp, struct user_sigevent *sigev)
    if (sigp == USER_ADDR_NULL)

    /*
     * We need to munge aio_sigevent since it contains pointers.
     * Since we do not know if sigev_value is an int or a ptr we do
     * NOT cast the ptr to a user_addr_t.  This means if we send
     * this info back to user space we need to remember sigev_value
     * was not expanded for the 32-bit case.
     *
     * Notes:  This does NOT affect us since we don't support
     *	       sigev_value yet in the aio context.
     */
    if ( proc_is64bit(procp) ) {
	struct user64_sigevent sigevent64;

	result = copyin( sigp, &sigevent64, sizeof(sigevent64) );
	if ( result == 0 ) {
	    sigev->sigev_notify = sigevent64.sigev_notify;
	    sigev->sigev_signo = sigevent64.sigev_signo;
	    sigev->sigev_value.size_equivalent.sival_int = sigevent64.sigev_value.size_equivalent.sival_int;
	    sigev->sigev_notify_function = sigevent64.sigev_notify_function;
	    sigev->sigev_notify_attributes = sigevent64.sigev_notify_attributes;
	}

    } else {
	struct user32_sigevent sigevent32;

	result = copyin( sigp, &sigevent32, sizeof(sigevent32) );
	if ( result == 0 ) {
	    sigev->sigev_notify = sigevent32.sigev_notify;
	    sigev->sigev_signo = sigevent32.sigev_signo;
	    sigev->sigev_value.size_equivalent.sival_int = sigevent32.sigev_value.sival_int;
	    sigev->sigev_notify_function = CAST_USER_ADDR_T(sigevent32.sigev_notify_function);
	    sigev->sigev_notify_attributes = CAST_USER_ADDR_T(sigevent32.sigev_notify_attributes);
	}
    }

    if ( result != 0 ) {
/*
 * Queue up the entry on the aio asynchronous work queue in priority order
 * based on the relative priority of the request.  We calculate the relative
 * priority using the nice value of the caller and the value
 *
 * Parameters:	procp		Process queueing the I/O
 *		entryp		The work queue entry being queued
 *
 * Returns:	(void)		No failure modes
 *
 * Notes:	This function is used for both lio_listio and aio
 *
 * XXX:		At some point, we may have to consider thread priority
 *		rather than process priority, but we don't maintain the
 *		adjusted priority for threads the POSIX way.
 *
 * Called with proc locked.
 */
aio_enqueue_work( proc_t procp, aio_workq_entry *entryp, int proc_locked)
    aio_workq_entry	*my_entryp;	/* used for insertion sort */
    aio_workq_t		queue = aio_entry_workq(entryp);

    if (proc_locked == 0) {
	aio_proc_lock(procp);
    }

    ASSERT_AIO_PROC_LOCK_OWNED(procp);

    /* Onto proc queue */
    TAILQ_INSERT_TAIL(&procp->p_aio_activeq, entryp, aio_proc_link);
    procp->p_aio_active_count++;
    procp->p_aio_total_count++;

    /* And work queue */
    aio_workq_lock_spin(queue);
    aio_workq_add_entry_locked(queue, entryp);
    waitq_wakeup64_one(&queue->aioq_waitq, CAST_EVENT64_T(queue),
		       THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
    aio_workq_unlock(queue);

    if (proc_locked == 0) {
	aio_proc_unlock(procp);
    }
    /*
     * (1) The nice value is in the range PRIO_MIN..PRIO_MAX [-20..20]
     * (2) The normalized nice value is in the range 0..((2 * NZERO) - 1)
     *     which is [0..39], with 0 not being used.  In nice values, the
     *     lower the nice value, the higher the priority.
     * (3) The normalized scheduling priority is the highest nice value
     *     minus the current nice value.  In I/O scheduling priority, the
     *     higher the value the lower the priority, so it is the inverse
     *     of the nice value (the higher the number, the higher the I/O
     *     priority).
     * (4) From the normalized scheduling priority, we subtract the
     *     request priority to get the request priority value number;
     *     this means that requests are only capable of depressing their
     *     priority relative to other requests.
     */
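    /*
     * Worked example (illustrative): with NZERO == 20, a process at the
     * default nice value 0 starts from ((2 * 20) - 1) - 0 == 39; an
     * aio_reqprio of 5 then depresses that to 34, and a negative
     * aio_reqprio is clamped to 0, so a request can never raise itself
     * above its process's normalized value.
     */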
    entryp->priority = (((2 * NZERO) - 1) - procp->p_nice);

    /* only permit depressing the priority */
    if (entryp->aiocb.aio_reqprio < 0)
	entryp->aiocb.aio_reqprio = 0;
    if (entryp->aiocb.aio_reqprio > 0) {
	entryp->priority -= entryp->aiocb.aio_reqprio;
	if (entryp->priority < 0)
	    entryp->priority = 0;
    }

    /* Insertion sort the entry; lowest ->priority to highest */
    TAILQ_FOREACH(my_entryp, &aio_anchor.aio_async_workq, aio_workq_link) {
	if ( entryp->priority <= my_entryp->priority) {
	    TAILQ_INSERT_BEFORE(my_entryp, entryp, aio_workq_link);
	}
    }
    if (my_entryp == NULL)
	TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
/*
 * lio_listio - initiate a list of IO requests.  We process the list of
 * aiocbs either synchronously (mode == LIO_WAIT) or asynchronously
 * (mode == LIO_NOWAIT).
 *
 * The caller gets error and return status for each aiocb in the list
 * via aio_error and aio_return.  We must keep completed requests until
 * released by the aio_return call.
 */
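/*
 * Illustrative user-space usage (not part of this file): queue two reads and
 * block until both complete, then collect per-request status.
 *
 *	struct aiocb a0 = { .aio_fildes = fd, .aio_buf = buf0,
 *			    .aio_nbytes = 4096, .aio_lio_opcode = LIO_READ };
 *	struct aiocb a1 = { .aio_fildes = fd, .aio_buf = buf1,
 *			    .aio_offset = 4096, .aio_nbytes = 4096,
 *			    .aio_lio_opcode = LIO_READ };
 *	struct aiocb *list[2] = { &a0, &a1 };
 *	if (lio_listio(LIO_WAIT, list, 2, NULL) == 0) {
 *		ssize_t r0 = aio_return(&a0);
 *		ssize_t r1 = aio_return(&a1);
 *	}
 */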
lio_listio(proc_t p, struct lio_listio_args *uap, int *retval )
    aio_workq_entry		**entryp_listp;
    user_addr_t			*aiocbpp;
    struct user_sigevent	aiosigev;
    aio_lio_context		*lio_context;
    boolean_t			free_context = FALSE;
    uint32_t			*paio_offset;
    uint32_t			*paio_nbytes;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START,
		  (int)p, uap->nent, uap->mode, 0, 0 );

    entryp_listp = NULL;

    if ( !(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT) ) {
	call_result = EINVAL;

    if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
	call_result = EINVAL;

    /*
     * allocate a list of aio_workq_entry pointers that we will use
     * to queue up all our requests at once while holding our lock.
     */
    MALLOC( entryp_listp, void *, (uap->nent * sizeof(aio_workq_entry *)), M_TEMP, M_WAITOK );
    if ( entryp_listp == NULL ) {
	call_result = EAGAIN;

    MALLOC( lio_context, aio_lio_context*, sizeof(aio_lio_context), M_TEMP, M_WAITOK );
    if ( lio_context == NULL ) {
	call_result = EAGAIN;

    OSIncrementAtomic(&lio_contexts_alloced);

    free_context = TRUE;
    bzero(lio_context, sizeof(aio_lio_context));

    aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent);
    if ( aiocbpp == NULL ) {
	call_result = EAGAIN;

    /*
     * Use sigevent passed in to lio_listio for each of our calls, but
     * only do completion notification after the last request completes.
     */
    bzero(&aiosigev, sizeof(aiosigev));
    /* Only copy in an sigev if the user supplied one */
    if (uap->sigp != USER_ADDR_NULL) {
	call_result = aio_copy_in_sigev(p, uap->sigp, &aiosigev);

    /* process list of aio requests */
    free_context = FALSE;
    lio_context->io_issued = uap->nent;
    lio_context->io_waiter = uap->mode == LIO_WAIT ? 1 : 0; /* Should it be freed by last AIO */
    for ( i = 0; i < uap->nent; i++ ) {
	user_addr_t my_aiocbp;
	aio_workq_entry		*entryp;

	*(entryp_listp + i) = NULL;
	my_aiocbp = *(aiocbpp + i);

	/* NULL elements are legal so check for 'em */
	if ( my_aiocbp == USER_ADDR_NULL ) {
	    aio_proc_lock_spin(p);
	    lio_context->io_issued--;

	/*
	 * We use lio_context to mark IO requests for delayed completion
	 * processing which means we wait until all IO requests in the
	 * group have completed before we either return to the caller
	 * when mode is LIO_WAIT or signal user when mode is LIO_NOWAIT.
	 *
	 * We use the address of the lio_context for this, since it is
	 * unique in the address space.
	 */
	result = lio_create_entry( p, my_aiocbp, lio_context, (entryp_listp + i) );
	if ( result != 0 && call_result == -1 )
	    call_result = result;

	/* NULL elements are legal so check for 'em */
	entryp = *(entryp_listp + i);
	if ( entryp == NULL ) {
	    aio_proc_lock_spin(p);
	    lio_context->io_issued--;

	if ( uap->mode == LIO_NOWAIT ) {
	    /* Set signal handler, if any */
	    entryp->aiocb.aio_sigevent = aiosigev;

	    /* flag that this thread blocks pending completion */
	    entryp->flags |= AIO_LIO_NOTIFY;

	/* check our aio limits to throttle bad or rude user land behavior */
	old_count = aio_increment_total_count();

	aio_proc_lock_spin(p);
	if ( old_count >= aio_max_requests ||
	     aio_get_process_count( entryp->procp ) >= aio_max_requests_per_process ||
	     is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {

	    lio_context->io_issued--;

	    aio_decrement_total_count();

	    if ( call_result == -1 )
		call_result = EAGAIN;
	    aio_free_request(entryp);
	    entryp_listp[i] = NULL;

	lck_mtx_convert_spin(aio_proc_mutex(p));
	aio_enqueue_work(p, entryp, 1);

	KERNEL_DEBUG_CONSTANT( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_START,
			       (int)p, (int)entryp->uaiocbp, entryp->flags, entryp->aiocb.aio_fildes, 0 );
	paio_offset = (uint32_t*) &entryp->aiocb.aio_offset;
	paio_nbytes = (uint32_t*) &entryp->aiocb.aio_nbytes;
	KERNEL_DEBUG_CONSTANT( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_END,
			       paio_offset[0], (sizeof(entryp->aiocb.aio_offset) == sizeof(uint64_t) ? paio_offset[1] : 0),
			       paio_nbytes[0], (sizeof(entryp->aiocb.aio_nbytes) == sizeof(uint64_t) ? paio_nbytes[1] : 0),

    aio_proc_lock_spin(p);
    while (lio_context->io_completed < lio_context->io_issued) {
	result = msleep(lio_context, aio_proc_mutex(p), PCATCH | PRIBIO | PSPIN, "lio_listio", 0);

	/* If we were interrupted, fail out (even if all finished) */
	    call_result = EINTR;
	    lio_context->io_waiter = 0;

    /* If all IOs have finished must free it */
    if (lio_context->io_completed == lio_context->io_issued) {
	free_context = TRUE;

    /* call_result == -1 means we had no trouble queueing up requests */
    if ( call_result == -1 ) {

    if ( entryp_listp != NULL )
	FREE( entryp_listp, M_TEMP );
    if ( aiocbpp != NULL )
	FREE( aiocbpp, M_TEMP );
    free_lio_context(lio_context);

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END,
		  (int)p, call_result, 0, 0, 0 );

    return( call_result );
/*
 * aio worker thread.  this is where all the real work gets done.
 * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
 * after new work is queued up.
 */
__attribute__((noreturn))
aio_work_thread(void)
    aio_workq_entry		*entryp;
    vm_map_t			currentmap;
    vm_map_t			oldmap = VM_MAP_NULL;
    task_t			oldaiotask = TASK_NULL;
    struct uthread		*uthreadp = NULL;

    /*
     * returns with the entry ref'ed.
     * sleeps until work is available.
     */
    entryp = aio_get_some_work();

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START,
		  (int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0 );

    /*
     * Assume the target's address space identity for the duration
     * of the IO.  Note: don't need to have the entryp locked,
     * because the proc and map don't change until it's freed.
     */
    currentmap = get_task_map( (current_proc())->task );
    if ( currentmap != entryp->aio_map ) {
	uthreadp = (struct uthread *) get_bsdthread_info(current_thread());
	oldaiotask = uthreadp->uu_aio_task;
	uthreadp->uu_aio_task = entryp->procp->task;
	oldmap = vm_map_switch( entryp->aio_map );
    }

    if ( (entryp->flags & AIO_READ) != 0 ) {
	error = do_aio_read( entryp );
    }
    else if ( (entryp->flags & AIO_WRITE) != 0 ) {
	error = do_aio_write( entryp );
    }
    else if ( (entryp->flags & (AIO_FSYNC | AIO_DSYNC)) != 0 ) {
	error = do_aio_fsync( entryp );
    }
	printf( "%s - unknown aio request - flags 0x%02X \n",
		__FUNCTION__, entryp->flags );

    /* Restore old map */
    if ( currentmap != entryp->aio_map ) {
	(void) vm_map_switch( oldmap );
	uthreadp->uu_aio_task = oldaiotask;
    }

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END,
		  (int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval,
		  entryp->returnval, 0 );

    aio_entry_lock_spin(entryp);
    entryp->errorval = error;
    aio_entry_unlock(entryp);

    /* we're done with the IO request so pop it off the active queue and */
    /* push it on the done queue */
    aio_proc_lock(entryp->procp);
    aio_proc_move_done_locked(entryp->procp, entryp);
    aio_proc_unlock(entryp->procp);

    OSDecrementAtomic(&aio_anchor.aio_inflight_count);

    /* remove our reference to the user land map. */
    if ( VM_MAP_NULL != entryp->aio_map ) {
	my_map = entryp->aio_map;
	entryp->aio_map = VM_MAP_NULL;
	vm_map_deallocate( my_map );
    }

    /* Provide notifications */
    do_aio_completion( entryp );

    /* Will free if needed */
    aio_entry_unref(entryp);

} /* aio_work_thread */
/*
 * aio_get_some_work - get the next async IO request that is ready to be executed.
 * aio_fsync complicates matters a bit since we cannot do the fsync until all async
 * IO requests at the time the aio_fsync call came in have completed.
 * NOTE - AIO_LOCK must be held by caller
 */
static aio_workq_entry *
aio_get_some_work( void )
    aio_workq_entry		*entryp = NULL;
    aio_workq_t			queue = NULL;

    /* Just one queue for the moment.  In the future there will be many. */
    queue = &aio_anchor.aio_async_workqs[0];
    aio_workq_lock_spin(queue);
    if (queue->aioq_count == 0) {

    /*
     * Hold the queue lock.
     *
     * pop some work off the work queue and add to our active queue
     * Always start with the queue lock held.
     */

    /*
     * Pull off of work queue.  Once it's off, it can't be cancelled,
     * so we can take our ref once we drop the queue lock.
     */
    entryp = TAILQ_FIRST(&queue->aioq_entries);

    /*
     * If there's no work or only fsyncs that need delay, go to sleep
     * and then start anew from aio_work_thread
     */
    if (entryp == NULL) {

    aio_workq_remove_entry_locked(queue, entryp);

    aio_workq_unlock(queue);

    /*
     * Check if it's an fsync that must be delayed.  No need to lock the entry;
     * that flag would have been set at initialization.
     */
    if ( (entryp->flags & AIO_FSYNC) != 0 ) {
	/*
	 * Check for unfinished operations on the same file
	 * in this proc's queue.
	 */
	aio_proc_lock_spin(entryp->procp);
	if ( aio_delay_fsync_request( entryp ) ) {
	    /* It needs to be delayed.  Put it back on the end of the work queue */
	    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) | DBG_FUNC_NONE,
			  (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

	    aio_proc_unlock(entryp->procp);

	    aio_workq_lock_spin(queue);
	    aio_workq_add_entry_locked(queue, entryp);
	}
	aio_proc_unlock(entryp->procp);
    }

    aio_entry_ref(entryp);

    OSIncrementAtomic(&aio_anchor.aio_inflight_count);

    /* We will wake up when someone enqueues something */
    waitq_assert_wait64(&queue->aioq_waitq, CAST_EVENT64_T(queue), THREAD_UNINT, 0);
    aio_workq_unlock(queue);
    thread_block( (thread_continue_t)aio_work_thread );
/*
 * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed.
 * A big, simple hammer: only send it off if it's the most recently filed IO which has
 * not been completed.
 */
aio_delay_fsync_request( aio_workq_entry *entryp )
    if (entryp == TAILQ_FIRST(&entryp->procp->p_aio_activeq)) {

} /* aio_delay_fsync_request */
static aio_workq_entry *
aio_create_queue_entry(proc_t procp, user_addr_t aiocbp, void *group_tag, int kindOfIO)
    aio_workq_entry	*entryp;

    entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
    if ( entryp == NULL ) {

    bzero( entryp, sizeof(*entryp) );

    /* fill in the rest of the aio_workq_entry */
    entryp->procp = procp;
    entryp->uaiocbp = aiocbp;
    entryp->flags |= kindOfIO;
    entryp->group_tag = group_tag;
    entryp->aio_map = VM_MAP_NULL;
    entryp->aio_refcount = 0;

    if ( proc_is64bit(procp) ) {
	struct user64_aiocb aiocb64;

	result = copyin( aiocbp, &aiocb64, sizeof(aiocb64) );
	do_munge_aiocb_user64_to_user(&aiocb64, &entryp->aiocb);

	struct user32_aiocb aiocb32;

	result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
	do_munge_aiocb_user32_to_user( &aiocb32, &entryp->aiocb );

    if ( result != 0 ) {

    /* get a reference to the user land map in order to keep it around */
    entryp->aio_map = get_task_map( procp->task );
    vm_map_reference( entryp->aio_map );

    /* do some more validation on the aiocb and embedded file descriptor */
    result = aio_validate( entryp );
	goto error_exit_with_ref;

    /* get a reference on the current_thread, which is passed in vfs_context. */
    entryp->thread = current_thread();
    thread_reference( entryp->thread );

error_exit_with_ref:
    if ( VM_MAP_NULL != entryp->aio_map ) {
	vm_map_deallocate( entryp->aio_map );
    }
    if ( result && entryp != NULL ) {
	zfree( aio_workq_zonep, entryp );
    }
/*
 * aio_queue_async_request - queue up an async IO request on our work queue then
 * wake up one of our worker threads to do the actual work.  We get a reference
 * to our caller's user land map in order to keep it around while we are
 * processing the request.
 */
aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO )
    aio_workq_entry	*entryp;
    uint32_t		*paio_offset;
    uint32_t		*paio_nbytes;

    old_count = aio_increment_total_count();
    if (old_count >= aio_max_requests) {

    entryp = aio_create_queue_entry( procp, aiocbp, 0, kindOfIO);
    if ( entryp == NULL ) {

    aio_proc_lock_spin(procp);

    if ( is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {

    /* check our aio limits to throttle bad or rude user land behavior */
    if (aio_get_process_count( procp ) >= aio_max_requests_per_process) {
	printf("aio_queue_async_request(): too many in flight for proc: %d.\n", procp->p_aio_total_count);

    /* Add the IO to proc and work queues, wake up threads as appropriate */
    lck_mtx_convert_spin(aio_proc_mutex(procp));
    aio_enqueue_work(procp, entryp, 1);

    aio_proc_unlock(procp);

    paio_offset = (uint32_t*) &entryp->aiocb.aio_offset;
    paio_nbytes = (uint32_t*) &entryp->aiocb.aio_nbytes;
    KERNEL_DEBUG_CONSTANT( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_START,
			   (int)procp, (int)aiocbp, entryp->flags, entryp->aiocb.aio_fildes, 0 );
    KERNEL_DEBUG_CONSTANT( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_END,
			   paio_offset[0], (sizeof(entryp->aiocb.aio_offset) == sizeof(uint64_t) ? paio_offset[1] : 0),
			   paio_nbytes[0], (sizeof(entryp->aiocb.aio_nbytes) == sizeof(uint64_t) ? paio_nbytes[1] : 0),

    /*
     * This entry has not been queued up so no worries about
     * unlocked state and aio_map
     */
    aio_proc_unlock(procp);
    aio_free_request(entryp);

    aio_decrement_total_count();

} /* aio_queue_async_request */
/*
 * Allocate an aio_workq_entry and fill it in.  If all goes well return 0
 * and pass the aio_workq_entry pointer back to our caller.
 *
 * Parameters:	procp		The process making the request
 *		aiocbp		The aio context buffer pointer
 *		group_tag	The group tag used to indicate a
 *				group of operations has completed
 *		entrypp		Pointer to the pointer to receive the
 *				address of the created aio_workq_entry
 *
 * Returns:	0		Successfully created
 *		EAGAIN		Try again (usually resource shortage)
 *
 * Notes:	We get a reference to our caller's user land map in order
 *		to keep it around while we are processing the request.
 *
 *		lio_listio calls behave differently at completion: they do
 *		completion notification when all async IO requests have
 *		completed.  We use group_tag to tag IO requests that behave
 *		in the delay notification manner.
 *
 *		All synchronous operations are considered to not have a
 *		signal routine associated with them (sigp == USER_ADDR_NULL).
 */
lio_create_entry(proc_t procp, user_addr_t aiocbp, void *group_tag,
		 aio_workq_entry **entrypp )
    aio_workq_entry	*entryp;

    entryp = aio_create_queue_entry( procp, aiocbp, group_tag, AIO_LIO);
    if ( entryp == NULL ) {

    /*
     * Look for lio_listio LIO_NOP requests and ignore them; this is
     * not really an error, but we need to free our aio_workq_entry.
     */
    if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {

    if ( entryp != NULL ) {
	/*
	 * This entry has not been queued up so no worries about
	 * unlocked state and aio_map
	 */
	aio_free_request(entryp);
    }

} /* lio_create_entry */

/*
 * aio_free_request - remove our reference on the user land map and
 * free the work queue entry resources.  The entry is off all lists
 * and has zero refcount, so no one can have a pointer to it.
 */
aio_free_request(aio_workq_entry *entryp)
    /* remove our reference to the user land map. */
    if ( VM_MAP_NULL != entryp->aio_map) {
	vm_map_deallocate(entryp->aio_map);
    }

    /* remove our reference to thread which enqueued the request */
    if ( NULL != entryp->thread ) {
	thread_deallocate( entryp->thread );
    }

    entryp->aio_refcount = -1; /* A bit of poisoning in case of bad refcounting. */

    zfree( aio_workq_zonep, entryp );

} /* aio_free_request */
/*
 * aio_validate
 *
 * validate the aiocb passed in by one of the aio syscalls.
 */
static int
aio_validate( aio_workq_entry *entryp )
{
	struct fileproc		*fp;
	int			flag;
	int			result;

	result = 0;

	if ( (entryp->flags & AIO_LIO) != 0 ) {
		if ( entryp->aiocb.aio_lio_opcode == LIO_READ )
			entryp->flags |= AIO_READ;
		else if ( entryp->aiocb.aio_lio_opcode == LIO_WRITE )
			entryp->flags |= AIO_WRITE;
		else if ( entryp->aiocb.aio_lio_opcode == LIO_NOP )
			return( 0 );
		else
			return( EINVAL );
	}

	flag = FREAD;
	if ( (entryp->flags & (AIO_WRITE | AIO_FSYNC | AIO_DSYNC)) != 0 ) {
		flag = FWRITE;
	}

	if ( (entryp->flags & (AIO_READ | AIO_WRITE)) != 0 ) {
		if ( entryp->aiocb.aio_nbytes > INT_MAX ||
		     entryp->aiocb.aio_buf == USER_ADDR_NULL ||
		     entryp->aiocb.aio_offset < 0 )
			return( EINVAL );
	}

	/*
	 * validate aiocb.aio_sigevent.  at this point we only support
	 * sigev_notify equal to SIGEV_SIGNAL or SIGEV_NONE.  this means
	 * sigev_value, sigev_notify_function, and sigev_notify_attributes
	 * are ignored, since SIGEV_THREAD is unsupported.  This is consistent
	 * with no [RTS] (Realtime Signal) option group support.
	 */
	switch ( entryp->aiocb.aio_sigevent.sigev_notify ) {
	case SIGEV_SIGNAL:
	    {
		int	signum;

		/* make sure we have a valid signal number */
		signum = entryp->aiocb.aio_sigevent.sigev_signo;
		if ( signum <= 0 || signum >= NSIG ||
		     signum == SIGKILL || signum == SIGSTOP )
			return( EINVAL );
	    }
	    break;

	case SIGEV_NONE:
		break;

	case SIGEV_THREAD:
		/* Unsupported [RTS] */

	default:
		return( EINVAL );
	}

	/* validate the file descriptor and that the file was opened
	 * for the appropriate read / write access.
	 */
	proc_fdlock(entryp->procp);

	result = fp_lookup( entryp->procp, entryp->aiocb.aio_fildes, &fp, 1);
	if ( result == 0 ) {
		if ( (fp->f_fglob->fg_flag & flag) == 0 ) {
			/* we don't have read or write access */
			result = EBADF;
		}
		else if ( FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_VNODE ) {
			/* this is not a file */
			result = ESPIPE;
		} else
			fp->f_flags |= FP_AIOISSUED;

		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 1);
	}
	else {
		result = EBADF;
	}

	proc_fdunlock(entryp->procp);

	return( result );

} /* aio_validate */
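/*
 * For reference, a request that passes the validation above looks like this
 * from user space (a minimal sketch, not kernel code; assumes fd and buf are
 * set up by the caller):
 *
 *	#include <aio.h>
 *	#include <signal.h>
 *	#include <stdio.h>
 *
 *	struct aiocb cb = { 0 };
 *
 *	cb.aio_fildes = fd;		// must reference a vnode opened for reading
 *	cb.aio_buf    = buf;		// non-NULL user buffer
 *	cb.aio_nbytes = sizeof(buf);	// must not exceed INT_MAX
 *	cb.aio_offset = 0;		// must be non-negative
 *
 *	// SIGEV_SIGNAL with a catchable signal; SIGEV_THREAD would fail with
 *	// EINVAL since the [RTS] notification path is not supported here.
 *	cb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
 *	cb.aio_sigevent.sigev_signo  = SIGUSR1;
 *
 *	if (aio_read(&cb) == -1)
 *		perror("aio_read");
 */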
static int
aio_increment_total_count()
{
	return OSIncrementAtomic(&aio_anchor.aio_total_count);
}

static int
aio_decrement_total_count()
{
	int old = OSDecrementAtomic(&aio_anchor.aio_total_count);
	if (old <= 0) {
		panic("Negative total AIO count!\n");
	}

	return old;
}

int
aio_get_process_count(proc_t procp )
{
	return procp->p_aio_total_count;

} /* aio_get_process_count */

int
aio_get_all_queues_count( void )
{
	return aio_anchor.aio_total_count;

} /* aio_get_all_queues_count */
/*
 * do_aio_completion.  Handle async IO completion.
 */
static void
do_aio_completion( aio_workq_entry *entryp )
{
	boolean_t		lastLioCompleted = FALSE;
	aio_lio_context		*lio_context = NULL;
	int			waiter = 0;

	lio_context = (aio_lio_context *)entryp->group_tag;

	if (lio_context != NULL) {

		aio_proc_lock_spin(entryp->procp);

		/* Account for this I/O completing. */
		lio_context->io_completed++;

		/* Are we done with this lio context? */
		if (lio_context->io_issued == lio_context->io_completed) {
			lastLioCompleted = TRUE;
		}

		waiter = lio_context->io_waiter;

		/* explicit wakeup of lio_listio() waiting in LIO_WAIT */
		if ((entryp->flags & AIO_LIO_NOTIFY) && (lastLioCompleted) && (waiter != 0)) {
			/* wake up the waiter */
			wakeup(lio_context);
		}

		aio_proc_unlock(entryp->procp);
	}

	if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
	     (entryp->flags & AIO_DISABLE) == 0 ) {

		boolean_t	performSignal = FALSE;
		if (lio_context == NULL) {
			performSignal = TRUE;
		}
		else {
			/*
			 * If this was the last request in the group and a signal
			 * is desired, send one.
			 */
			performSignal = lastLioCompleted;
		}

		if (performSignal) {

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) | DBG_FUNC_NONE,
				      (int)entryp->procp, (int)entryp->uaiocbp,
				      entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 );

			psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );
		}
	}

	if ((entryp->flags & AIO_EXIT_WAIT) && (entryp->flags & AIO_CLOSE_WAIT)) {
		panic("Close and exit flags set at the same time\n");
	}

	/*
	 * need to handle case where a process is trying to exit, exec, or
	 * close and is currently waiting for active aio requests to complete.
	 * If AIO_CLEANUP_WAIT is set then we need to look to see if there are any
	 * other requests in the active queue for this process.  If there are
	 * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel.
	 * If there are some still active then do nothing - we only want to
	 * wakeup when all active aio requests for the process are complete.
	 *
	 * Don't need to lock the entry or proc to check the cleanup flag.  It can only be
	 * set for cancellation, while the entryp is still on a proc list; now it's
	 * off, so that flag is already set if it's going to be.
	 */
	if ( (entryp->flags & AIO_EXIT_WAIT) != 0 ) {
		int		active_requests;

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
			      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

		aio_proc_lock_spin(entryp->procp);
		active_requests = aio_active_requests_for_process( entryp->procp );
		if ( active_requests < 1 ) {
			/*
			 * no active aio requests for this process, continue exiting.  In this
			 * case, there should be no one else waiting on the proc in AIO...
			 */
			wakeup_one((caddr_t)&entryp->procp->AIO_CLEANUP_SLEEP_CHAN);
			aio_proc_unlock(entryp->procp);

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
				      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
		} else {
			aio_proc_unlock(entryp->procp);
		}
	}

	if ( (entryp->flags & AIO_CLOSE_WAIT) != 0 ) {
		int		active_requests;

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
			      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

		aio_proc_lock_spin(entryp->procp);
		active_requests = aio_proc_active_requests_for_file( entryp->procp, entryp->aiocb.aio_fildes);
		if ( active_requests < 1 ) {
			/* Can't wakeup_one(); multiple closes might be in progress. */
			wakeup(&entryp->procp->AIO_CLEANUP_SLEEP_CHAN);
			aio_proc_unlock(entryp->procp);

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
				      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
		} else {
			aio_proc_unlock(entryp->procp);
		}
	}

	/*
	 * A thread in aio_suspend() wants to know about completed IOs.  If it checked
	 * the done list before we moved our AIO there, then it already asserted its wait,
	 * and we can wake it up without holding the lock.  If it checked the list after
	 * we did our move, then it has already seen the AIO that we moved.  Either way,
	 * we can do our wakeup without holding the lock.
	 */
	wakeup( (caddr_t) &entryp->procp->AIO_SUSPEND_SLEEP_CHAN );
	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE,
		      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

	/*
	 * free the LIO context if the last lio completed and no thread is
	 * waiting on it.
	 */
	if (lastLioCompleted && (waiter == 0))
		free_lio_context (lio_context);

} /* do_aio_completion */
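/*
 * For reference, the AIO_SUSPEND_SLEEP_CHAN wakeup issued above is what
 * unblocks a caller of aio_suspend(2).  A minimal user space sketch (not
 * kernel code; assumes cb was submitted earlier with aio_read/aio_write):
 *
 *	#include <aio.h>
 *
 *	const struct aiocb *wait_list[1] = { &cb };
 *
 *	// Block until cb completes (or a signal interrupts the wait), then
 *	// collect the result with aio_error()/aio_return().
 *	if (aio_suspend(wait_list, 1, NULL) == 0 && aio_error(&cb) == 0) {
 *		ssize_t nbytes = aio_return(&cb);
 *		// nbytes is the byte count from the completed request
 *	}
 */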
static int
do_aio_read( aio_workq_entry *entryp )
{
	struct fileproc		*fp;
	int			error;
	struct vfs_context	context;

	if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp, 0)) )
		return(error);
	if ( (fp->f_fglob->fg_flag & FREAD) == 0 ) {
		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
		return(EBADF);
	}

	context.vc_thread = entryp->thread;	/* XXX */
	context.vc_ucred = fp->f_fglob->fg_cred;

	error = dofileread(&context, fp,
			   entryp->aiocb.aio_buf,
			   entryp->aiocb.aio_nbytes,
			   entryp->aiocb.aio_offset, FOF_OFFSET,
			   &entryp->returnval);
	fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);

	return( error );

} /* do_aio_read */
static int
do_aio_write( aio_workq_entry *entryp )
{
	struct fileproc		*fp;
	int			error;
	int			flags;
	struct vfs_context	context;

	if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp, 0)) )
		return(error);
	if ( (fp->f_fglob->fg_flag & FWRITE) == 0 ) {
		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
		return(EBADF);
	}

	flags = FOF_PCRED;
	if ( (fp->f_fglob->fg_flag & O_APPEND) == 0 ) {
		flags |= FOF_OFFSET;
	}

	context.vc_thread = entryp->thread;	/* XXX */
	context.vc_ucred = fp->f_fglob->fg_cred;

	/* NB: tell dofilewrite the offset, and to use the proc cred */
	error = dofilewrite(&context,
			    fp,
			    entryp->aiocb.aio_buf,
			    entryp->aiocb.aio_nbytes,
			    entryp->aiocb.aio_offset,
			    flags,
			    &entryp->returnval);

	if (entryp->returnval)
		fp_drop_written(entryp->procp, entryp->aiocb.aio_fildes, fp);
	else
		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);

	return( error );

} /* do_aio_write */
/*
 * aio_active_requests_for_process - return number of active async IO
 * requests for the given process.
 */
static int
aio_active_requests_for_process(proc_t procp )
{
	return( procp->p_aio_active_count );

} /* aio_active_requests_for_process */

/*
 * Called with the proc locked.
 */
static int
aio_proc_active_requests_for_file(proc_t procp, int fd)
{
	int count = 0;
	aio_workq_entry *entryp;

	TAILQ_FOREACH(entryp, &procp->p_aio_activeq, aio_proc_link) {
		if (entryp->aiocb.aio_fildes == fd) {
			count++;
		}
	}

	return count;
} /* aio_proc_active_requests_for_file */
static int
do_aio_fsync( aio_workq_entry *entryp )
{
	struct vfs_context	context;
	struct vnode		*vp;
	struct fileproc		*fp;
	int			sync_flag;
	int			error;

	/*
	 * We are never called unless either AIO_FSYNC or AIO_DSYNC are set.
	 *
	 * If AIO_DSYNC is set, we can tell the lower layers that it is OK
	 * to mark for update the metadata not strictly necessary for data
	 * retrieval, rather than forcing it to disk.
	 *
	 * If AIO_FSYNC is set, we have to also wait until metadata not really
	 * necessary for data retrieval is committed to stable storage (e.g.
	 * atime, mtime, ctime, etc.).
	 *
	 * Metadata necessary for data retrieval must be committed to stable
	 * storage in either case (file length, etc.).
	 */
	if (entryp->flags & AIO_FSYNC)
		sync_flag = MNT_WAIT;
	else
		sync_flag = MNT_DWAIT;

	error = fp_getfvp( entryp->procp, entryp->aiocb.aio_fildes, &fp, &vp);
	if ( error == 0 ) {
		if ( (error = vnode_getwithref(vp)) ) {
			fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
			entryp->returnval = -1;
			return(error);
		}
		context.vc_thread = current_thread();
		context.vc_ucred = fp->f_fglob->fg_cred;

		error = VNOP_FSYNC( vp, sync_flag, &context);

		(void)vnode_put(vp);

		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
	}
	if ( error != 0 )
		entryp->returnval = -1;

	return( error );

} /* do_aio_fsync */
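/*
 * For reference, the MNT_WAIT / MNT_DWAIT split above is driven by the op
 * argument of aio_fsync(2): O_SYNC requests a full fsync (AIO_FSYNC), while
 * O_DSYNC requests a data-only sync (AIO_DSYNC).  A minimal user space
 * sketch (not kernel code; assumes cb.aio_fildes refers to an open file):
 *
 *	#include <aio.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *
 *	// O_SYNC  -> AIO_FSYNC -> MNT_WAIT  (data plus all metadata, e.g. times)
 *	// O_DSYNC -> AIO_DSYNC -> MNT_DWAIT (data plus metadata needed to read it)
 *	if (aio_fsync(O_DSYNC, &cb) == -1)
 *		perror("aio_fsync");
 */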
/*
 * is_already_queued - runs through our queues to see if the given
 * aiocbp / process is there.  Returns TRUE if there is a match
 * on any of our aio queues.
 *
 * Called with proc aio lock held (can be held spin)
 */
static boolean_t
is_already_queued(proc_t procp,
		  user_addr_t aiocbp )
{
	aio_workq_entry		*entryp;
	boolean_t		result;

	result = FALSE;

	/* look for matches on our queue of async IO requests that have completed */
	TAILQ_FOREACH( entryp, &procp->p_aio_doneq, aio_proc_link ) {
		if ( aiocbp == entryp->uaiocbp ) {
			result = TRUE;
			goto ExitThisRoutine;
		}
	}

	/* look for matches on our queue of active async IO requests */
	TAILQ_FOREACH( entryp, &procp->p_aio_activeq, aio_proc_link ) {
		if ( aiocbp == entryp->uaiocbp ) {
			result = TRUE;
			goto ExitThisRoutine;
		}
	}

ExitThisRoutine:
	return( result );

} /* is_already_queued */


static void
free_lio_context(aio_lio_context* context)
{
#if DEBUG
	OSDecrementAtomic(&lio_contexts_alloced);
#endif

	FREE( context, M_TEMP );

} /* free_lio_context */
/*
 * aio initialization
 */
__private_extern__ void
aio_init( void )
{
	int		i;

	aio_lock_grp_attr = lck_grp_attr_alloc_init();
	aio_proc_lock_grp = lck_grp_alloc_init("aio_proc", aio_lock_grp_attr);
	aio_entry_lock_grp = lck_grp_alloc_init("aio_entry", aio_lock_grp_attr);
	aio_queue_lock_grp = lck_grp_alloc_init("aio_queue", aio_lock_grp_attr);
	aio_lock_attr = lck_attr_alloc_init();

	lck_mtx_init(&aio_entry_mtx, aio_entry_lock_grp, aio_lock_attr);
	lck_mtx_init(&aio_proc_mtx, aio_proc_lock_grp, aio_lock_attr);

	aio_anchor.aio_inflight_count = 0;
	aio_anchor.aio_done_count = 0;
	aio_anchor.aio_total_count = 0;
	aio_anchor.aio_num_workqs = AIO_NUM_WORK_QUEUES;

	for (i = 0; i < AIO_NUM_WORK_QUEUES; i++) {
		aio_workq_init(&aio_anchor.aio_async_workqs[i]);
	}

	i = sizeof( aio_workq_entry );
	aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );

	_aio_create_worker_threads( aio_worker_threads );

} /* aio_init */


/*
 * aio worker threads created here.
 */
__private_extern__ void
_aio_create_worker_threads( int num )
{
	int		i;

	/* create some worker threads to handle the async IO requests */
	for ( i = 0; i < num; i++ ) {
		thread_t	myThread;

		if ( KERN_SUCCESS != kernel_thread_start((thread_continue_t)aio_work_thread, NULL, &myThread) ) {
			printf( "%s - failed to create a work thread\n", __FUNCTION__ );
		}
		else
			thread_deallocate(myThread);
	}

	return;

} /* _aio_create_worker_threads */
/*
 * Return the current activation utask
 */
task_t
get_aiotask(void)
{
	return ((struct uthread *)get_bsdthread_info(current_thread()))->uu_aio_task;
}
/*
 * In the case of an aiocb from a
 * 32-bit process we need to expand some longs and pointers to the correct
 * sizes in order to let downstream code always work on the same type of
 * aiocb (in our case that is a user_aiocb).
 */
static void
do_munge_aiocb_user32_to_user( struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
{
	the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
	the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
	the_user_aiocbp->aio_buf = CAST_USER_ADDR_T(my_aiocbp->aio_buf);
	the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
	the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
	the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;

	/* special case here.  since we do not know if sigev_value is an */
	/* int or a ptr we do NOT cast the ptr to a user_addr_t.  This */
	/* means if we send this info back to user space we need to remember */
	/* sigev_value was not expanded for the 32-bit case. */
	/* NOTE - this does NOT affect us since we don't support sigev_value */
	/* yet in the aio context. */
	the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
	the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
	the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
		my_aiocbp->aio_sigevent.sigev_value.sival_int;
	the_user_aiocbp->aio_sigevent.sigev_notify_function =
		CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function);
	the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
		CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_attributes);
}
/* Similar for a 64-bit user process, so that we don't need to satisfy
 * the alignment constraints of the original user64_aiocb.
 */
static void
do_munge_aiocb_user64_to_user( struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
{
	the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
	the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
	the_user_aiocbp->aio_buf = my_aiocbp->aio_buf;
	the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
	the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
	the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;

	the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
	the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
	the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
		my_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int;
	the_user_aiocbp->aio_sigevent.sigev_notify_function =
		my_aiocbp->aio_sigevent.sigev_notify_function;
	the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
		my_aiocbp->aio_sigevent.sigev_notify_attributes;
}
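/*
 * For reference, the two munge routines above are selected by their callers
 * based on the bitness of the requesting process.  A condensed sketch of
 * that dispatch pattern (proc_is64bit() and copyin() are standard xnu
 * interfaces; the exact call sites live earlier in this file):
 *
 *	struct user_aiocb my_aiocb;
 *	int error;
 *
 *	if ( proc_is64bit(procp) ) {
 *		struct user64_aiocb aiocb64;
 *		error = copyin( aiocbp, &aiocb64, sizeof(aiocb64) );
 *		if ( error == 0 )
 *			do_munge_aiocb_user64_to_user( &aiocb64, &my_aiocb );
 *	} else {
 *		struct user32_aiocb aiocb32;
 *		error = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
 *		if ( error == 0 )
 *			do_munge_aiocb_user32_to_user( &aiocb32, &my_aiocb );
 *	}
 */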