/*
 * Copyright (c) 2003-2016 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * 1) ramesh is looking into how to replace taking a reference on
 *    the user's map (vm_map_reference()) since it is believed that
 *    would not hold the process for us.
 * 2) david is looking into a way for us to set the priority of the
 *    worker threads to match that of the user's thread when the
 *    async IO was queued.
 */
/*
 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
 */
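/*
 * For orientation, a minimal userspace sketch of the call sequence this
 * facility services (illustrative only, not part of this file; assumes an
 * already-open readable descriptor "fd" and <aio.h>/<errno.h> in scope):
 *
 *	struct aiocb cb = { 0 };
 *	cb.aio_fildes = fd;
 *	cb.aio_buf    = buffer;
 *	cb.aio_nbytes = sizeof(buffer);
 *	cb.aio_offset = 0;
 *
 *	if (aio_read(&cb) == 0) {                 // queued; serviced by aio_work_thread below
 *		while (aio_error(&cb) == EINPROGRESS)
 *			;                         // or block in aio_suspend()
 *		ssize_t nread = aio_return(&cb);  // reaps the request and frees kernel state
 *	}
 */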
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file_internal.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/vnode_internal.h>
#include <sys/malloc.h>
#include <sys/mount_internal.h>
#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/sysctl.h>
#include <sys/unistd.h>

#include <sys/aio_kern.h>
#include <sys/sysproto.h>

#include <machine/limits.h>

#include <mach/mach_types.h>
#include <kern/kern_types.h>
#include <kern/waitq.h>
#include <kern/zalloc.h>
#include <kern/task.h>
#include <kern/sched_prim.h>

#include <vm/vm_map.h>

#include <libkern/OSAtomic.h>

#include <sys/kdebug.h>
#define AIO_work_queued			1
#define AIO_worker_wake			2
#define AIO_completion_sig		3
#define AIO_completion_cleanup_wait	4
#define AIO_completion_cleanup_wake	5
#define AIO_completion_suspend_wake	6
#define AIO_fsync_delay			7
#define AIO_cancel_async_workq		11
#define AIO_cancel_sync_workq		12
#define AIO_cancel_activeq		13
#define AIO_cancel_doneq		14
#define AIO_error_val			61
#define AIO_error_activeq		62
#define AIO_error_workq			63
#define AIO_return_val			71
#define AIO_return_activeq		72
#define AIO_return_workq		73
#define AIO_exit_sleep			91
#define AIO_close			100
#define AIO_close_sleep			101
#define AIO_suspend			110
#define AIO_suspend_sleep		111
#define AIO_worker_thread		120

#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
/*
 * aio requests queue up on the aio_async_workq or lio_sync_workq (for
 * lio_listio LIO_WAIT). Requests then move to the per-process aio_activeq
 * (proc.aio_activeq) when one of our worker threads starts the IO.
 * And finally, requests move to the per-process aio_doneq (proc.aio_doneq)
 * when the IO request completes. The request remains on aio_doneq until the
 * user process calls aio_return or the process exits; either way, that is our
 * trigger to release aio resources.
 */
typedef struct aio_workq {
    TAILQ_HEAD(, aio_workq_entry)	aioq_entries;
    int					aioq_count;
    lck_mtx_t				aioq_mtx;
    struct waitq			aioq_waitq;
} *aio_workq_t;
#define AIO_NUM_WORK_QUEUES 1
struct aio_anchor_cb {
    volatile int32_t	aio_inflight_count;	/* entries that have been taken from a workq */
    volatile int32_t	aio_done_count;		/* entries on all done queues (proc.aio_doneq) */
    volatile int32_t	aio_total_count;	/* total extant entries */

    /* Hash table of queues here */
    struct aio_workq	aio_async_workqs[AIO_NUM_WORK_QUEUES];
};
typedef struct aio_anchor_cb aio_anchor_cb;
struct aio_lio_context {
    int		io_waiter;
    int		io_issued;
    int		io_completed;
};
typedef struct aio_lio_context aio_lio_context;
/*
 * Notes on aio sleep / wake channels.
 * We currently pick a couple of fields within the proc structure that give us
 * sleep channels that do not collide with any other kernel routines.
 * At this time, for binary compatibility reasons, we cannot create new proc fields.
 */
#define AIO_SUSPEND_SLEEP_CHAN  p_aio_active_count
#define AIO_CLEANUP_SLEEP_CHAN  p_aio_total_count

#define ASSERT_AIO_FROM_PROC(aiop, theproc) 	\
    if ((aiop)->procp != (theproc)) { 		\
	panic("AIO on a proc list that does not belong to that proc.\n"); \
    }
static void		aio_proc_lock(proc_t procp);
static void		aio_proc_lock_spin(proc_t procp);
static void		aio_proc_unlock(proc_t procp);
static lck_mtx_t*	aio_proc_mutex(proc_t procp);
static void		aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp);
static void		aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp);
static int		aio_get_process_count(proc_t procp);
static int		aio_active_requests_for_process(proc_t procp);
static int		aio_proc_active_requests_for_file(proc_t procp, int fd);
static boolean_t	is_already_queued(proc_t procp, user_addr_t aiocbp);
static boolean_t	should_cancel(aio_workq_entry *entryp, user_addr_t aiocbp, int fd);

static void		aio_entry_lock(aio_workq_entry *entryp);
static void		aio_entry_lock_spin(aio_workq_entry *entryp);
static aio_workq_t	aio_entry_workq(aio_workq_entry *entryp);
static lck_mtx_t*	aio_entry_mutex(__unused aio_workq_entry *entryp);
static void		aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
static void		aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
static void		aio_entry_ref_locked(aio_workq_entry *entryp);
static void		aio_entry_unref_locked(aio_workq_entry *entryp);
static void		aio_entry_ref(aio_workq_entry *entryp);
static void		aio_entry_unref(aio_workq_entry *entryp);
static void		aio_entry_update_for_cancel(aio_workq_entry *entryp, boolean_t cancelled,
			    int wait_for_completion, boolean_t disable_notification);
static int		aio_entry_try_workq_remove(aio_workq_entry *entryp);
static boolean_t	aio_delay_fsync_request(aio_workq_entry *entryp);
static int		aio_free_request(aio_workq_entry *entryp);

static void		aio_workq_init(aio_workq_t wq);
static void		aio_workq_lock_spin(aio_workq_t wq);
static void		aio_workq_unlock(aio_workq_t wq);
static lck_mtx_t*	aio_workq_mutex(aio_workq_t wq);

static void		aio_work_thread(void);
static aio_workq_entry	*aio_get_some_work(void);

static int		aio_get_all_queues_count(void);
static int		aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO);
static int		aio_validate(aio_workq_entry *entryp);
static int		aio_increment_total_count(void);
static int		aio_decrement_total_count(void);

static int		do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp, int wait_for_completion, boolean_t disable_notification);
static void		do_aio_completion(aio_workq_entry *entryp);
static int		do_aio_fsync(aio_workq_entry *entryp);
static int		do_aio_read(aio_workq_entry *entryp);
static int		do_aio_write(aio_workq_entry *entryp);
static void		do_munge_aiocb_user32_to_user(struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp);
static void		do_munge_aiocb_user64_to_user(struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp);
static int		lio_create_entry(proc_t procp, user_addr_t aiocbp, void *group_tag,
			    aio_workq_entry **entrypp);
static aio_workq_entry	*aio_create_queue_entry(proc_t procp, user_addr_t aiocbp, void *group_tag, int kindOfIO);
static user_addr_t	*aio_copy_in_list(proc_t procp, user_addr_t aiocblist, int nent);
static void		free_lio_context(aio_lio_context *context);
static void		aio_enqueue_work(proc_t procp, aio_workq_entry *entryp, int proc_locked);

#define ASSERT_AIO_PROC_LOCK_OWNED(p)	lck_mtx_assert(aio_proc_mutex((p)), LCK_MTX_ASSERT_OWNED)
#define ASSERT_AIO_WORKQ_LOCK_OWNED(q)	lck_mtx_assert(aio_workq_mutex((q)), LCK_MTX_ASSERT_OWNED)
#define ASSERT_AIO_ENTRY_LOCK_OWNED(e)	lck_mtx_assert(aio_entry_mutex((e)), LCK_MTX_ASSERT_OWNED)
/*
 * EXTERNAL PROTOTYPES
 */

/* in ...bsd/kern/sys_generic.c */
extern int dofileread(vfs_context_t ctx, struct fileproc *fp,
    user_addr_t bufp, user_size_t nbyte,
    off_t offset, int flags, user_ssize_t *retval);
extern int dofilewrite(vfs_context_t ctx, struct fileproc *fp,
    user_addr_t bufp, user_size_t nbyte, off_t offset,
    int flags, user_ssize_t *retval);
static uint32_t lio_contexts_alloced = 0;

/*
 * aio external global variables.
 */
extern int aio_max_requests;			/* AIO_MAX - configurable */
extern int aio_max_requests_per_process;	/* AIO_PROCESS_MAX - configurable */
extern int aio_worker_threads;			/* AIO_THREAD_COUNT - configurable */

/*
 * aio static variables.
 */
static aio_anchor_cb	aio_anchor;
static lck_grp_t	*aio_proc_lock_grp;
static lck_grp_t	*aio_entry_lock_grp;
static lck_grp_t	*aio_queue_lock_grp;
static lck_attr_t	*aio_lock_attr;
static lck_grp_attr_t	*aio_lock_grp_attr;
static struct zone	*aio_workq_zonep;
static lck_mtx_t	aio_entry_mtx;
static lck_mtx_t	aio_proc_mtx;
static void
aio_entry_lock(__unused aio_workq_entry *entryp)
{
    lck_mtx_lock(&aio_entry_mtx);
}

static void
aio_entry_lock_spin(__unused aio_workq_entry *entryp)
{
    lck_mtx_lock_spin(&aio_entry_mtx);
}

static void
aio_entry_unlock(__unused aio_workq_entry *entryp)
{
    lck_mtx_unlock(&aio_entry_mtx);
}

static aio_workq_t
aio_entry_workq(__unused aio_workq_entry *entryp)
{
    return &aio_anchor.aio_async_workqs[0];
}

static lck_mtx_t*
aio_entry_mutex(__unused aio_workq_entry *entryp)
{
    return &aio_entry_mtx;
}
static void
aio_workq_init(aio_workq_t wq)
{
    TAILQ_INIT(&wq->aioq_entries);
    wq->aioq_count = 0;
    lck_mtx_init(&wq->aioq_mtx, aio_queue_lock_grp, aio_lock_attr);
    waitq_init(&wq->aioq_waitq, SYNC_POLICY_FIFO);
}
/*
 * Can be passed a queue which is locked spin.
 */
static void
aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
{
    ASSERT_AIO_WORKQ_LOCK_OWNED(queue);

    if (entryp->aio_workq_link.tqe_prev == NULL) {
	panic("Trying to remove an entry from a work queue, but it is not on a queue\n");
    }

    TAILQ_REMOVE(&queue->aioq_entries, entryp, aio_workq_link);
    queue->aioq_count--;
    entryp->aio_workq_link.tqe_prev = NULL; /* Not on a workq */

    if (queue->aioq_count < 0) {
	panic("Negative count on a queue.\n");
    }
}
static void
aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
{
    ASSERT_AIO_WORKQ_LOCK_OWNED(queue);

    TAILQ_INSERT_TAIL(&queue->aioq_entries, entryp, aio_workq_link);
    if (queue->aioq_count < 0) {
	panic("Negative count on a queue.\n");
    }
    queue->aioq_count++;
}
static void
aio_proc_lock(proc_t procp)
{
    lck_mtx_lock(aio_proc_mutex(procp));
}

static void
aio_proc_lock_spin(proc_t procp)
{
    lck_mtx_lock_spin(aio_proc_mutex(procp));
}

static void
aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp)
{
    ASSERT_AIO_PROC_LOCK_OWNED(procp);

    TAILQ_REMOVE(&procp->p_aio_activeq, entryp, aio_proc_link);
    TAILQ_INSERT_TAIL(&procp->p_aio_doneq, entryp, aio_proc_link);
    procp->p_aio_active_count--;
    OSIncrementAtomic(&aio_anchor.aio_done_count);
}

static void
aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp)
{
    TAILQ_REMOVE(&procp->p_aio_doneq, entryp, aio_proc_link);
    OSDecrementAtomic(&aio_anchor.aio_done_count);
    aio_decrement_total_count();
    procp->p_aio_total_count--;
}

static void
aio_proc_unlock(proc_t procp)
{
    lck_mtx_unlock(aio_proc_mutex(procp));
}

static lck_mtx_t*
aio_proc_mutex(proc_t procp)
{
    return &procp->p_mlock;
}
static void
aio_entry_ref_locked(aio_workq_entry *entryp)
{
    ASSERT_AIO_ENTRY_LOCK_OWNED(entryp);

    if (entryp->aio_refcount < 0) {
	panic("AIO workq entry with a negative refcount.\n");
    }
    entryp->aio_refcount++;
}


/* Return 1 if you've freed it */
static void
aio_entry_unref_locked(aio_workq_entry *entryp)
{
    ASSERT_AIO_ENTRY_LOCK_OWNED(entryp);

    entryp->aio_refcount--;
    if (entryp->aio_refcount < 0) {
	panic("AIO workq entry with a negative refcount.\n");
    }
}

static void
aio_entry_ref(aio_workq_entry *entryp)
{
    aio_entry_lock_spin(entryp);
    aio_entry_ref_locked(entryp);
    aio_entry_unlock(entryp);
}

static void
aio_entry_unref(aio_workq_entry *entryp)
{
    aio_entry_lock_spin(entryp);
    aio_entry_unref_locked(entryp);

    if ((entryp->aio_refcount == 0) && ((entryp->flags & AIO_DO_FREE) != 0)) {
	aio_entry_unlock(entryp);
	aio_free_request(entryp);
    } else {
	aio_entry_unlock(entryp);
    }
}
static void
aio_entry_update_for_cancel(aio_workq_entry *entryp, boolean_t cancelled, int wait_for_completion, boolean_t disable_notification)
{
    aio_entry_lock_spin(entryp);

    if (cancelled) {
	aio_entry_ref_locked(entryp);
	entryp->errorval = ECANCELED;
	entryp->returnval = -1;
    }

    if (wait_for_completion) {
	entryp->flags |= wait_for_completion; /* flag for special completion processing */
    }

    if (disable_notification) {
	entryp->flags |= AIO_DISABLE; /* Don't want a signal */
    }

    aio_entry_unlock(entryp);
}
static int
aio_entry_try_workq_remove(aio_workq_entry *entryp)
{
    /* Can only be cancelled if it's still on a work queue */
    if (entryp->aio_workq_link.tqe_prev != NULL) {
	aio_workq_t queue;

	/* Will have to check again under the lock */
	queue = aio_entry_workq(entryp);
	aio_workq_lock_spin(queue);
	if (entryp->aio_workq_link.tqe_prev != NULL) {
	    aio_workq_remove_entry_locked(queue, entryp);
	    aio_workq_unlock(queue);
	    return 1;
	} else {
	    aio_workq_unlock(queue);
	}
    }

    return 0;
}
static void
aio_workq_lock_spin(aio_workq_t wq)
{
    lck_mtx_lock_spin(aio_workq_mutex(wq));
}

static void
aio_workq_unlock(aio_workq_t wq)
{
    lck_mtx_unlock(aio_workq_mutex(wq));
}

static lck_mtx_t*
aio_workq_mutex(aio_workq_t wq)
{
    return &wq->aioq_mtx;
}
/*
 * aio_cancel - attempt to cancel one or more async IO requests currently
 * outstanding against file descriptor uap->fd. If uap->aiocbp is not
 * NULL then only one specific IO is cancelled (if possible). If uap->aiocbp
 * is NULL then all outstanding async IO requests for the given file
 * descriptor are cancelled (if possible).
 */
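/*
 * Illustrative userspace call (sketch only, not part of this file): cancel
 * everything still pending against an open descriptor "fd".
 *
 *	int r = aio_cancel(fd, NULL);
 *	if (r == AIO_CANCELED || r == AIO_ALLDONE) {
 *		// nothing left in flight
 *	} else if (r == AIO_NOTCANCELED) {
 *		// some requests are active; poll aio_error() per aiocb
 *	}
 */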
int
aio_cancel(proc_t p, struct aio_cancel_args *uap, int *retval)
{
    struct user_aiocb	my_aiocb;
    int			result;

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START,
	(int)p, (int)uap->aiocbp, 0, 0, 0);

    /* quick check to see if there are any async IO requests queued up */
    if (aio_get_all_queues_count() < 1) {
	result = 0;
	*retval = AIO_ALLDONE;
	goto ExitRoutine;
    }

    result = 0;
    if (uap->aiocbp != USER_ADDR_NULL) {
	if (proc_is64bit(p)) {
	    struct user64_aiocb aiocb64;

	    result = copyin(uap->aiocbp, &aiocb64, sizeof(aiocb64));
	    if (result == 0) {
		do_munge_aiocb_user64_to_user(&aiocb64, &my_aiocb);
	    }
	} else {
	    struct user32_aiocb aiocb32;

	    result = copyin(uap->aiocbp, &aiocb32, sizeof(aiocb32));
	    if (result == 0) {
		do_munge_aiocb_user32_to_user(&aiocb32, &my_aiocb);
	    }
	}

	if (result != 0) {
	    result = EAGAIN;
	    goto ExitRoutine;
	}

	/* NOTE - POSIX standard says a mismatch between the file */
	/* descriptor passed in and the file descriptor embedded in */
	/* the aiocb causes unspecified results. We return EBADF in */
	/* that situation. */
	if (uap->fd != my_aiocb.aio_fildes) {
	    result = EBADF;
	    goto ExitRoutine;
	}
    }

    aio_proc_lock(p);
    result = do_aio_cancel_locked(p, uap->fd, uap->aiocbp, 0, FALSE);
    ASSERT_AIO_PROC_LOCK_OWNED(p);
    aio_proc_unlock(p);

    if (result != -1) {
	*retval = result;
	result = 0;
	goto ExitRoutine;
    }

    result = EBADF;

ExitRoutine:
    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END,
	(int)p, (int)uap->aiocbp, result, 0, 0);

    return result;
}
/*
 * _aio_close - internal function used to clean up async IO requests for
 * a file descriptor that is closing.
 */
__private_extern__ void
_aio_close(proc_t p, int fd)
{
    int error;

    /* quick check to see if there are any async IO requests queued up */
    if (aio_get_all_queues_count() < 1) {
	return;
    }

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_START,
	(int)p, fd, 0, 0, 0);

    /* cancel all async IO requests on our todo queues for this file descriptor */
    aio_proc_lock(p);
    error = do_aio_cancel_locked(p, fd, 0, AIO_CLOSE_WAIT, FALSE);
    ASSERT_AIO_PROC_LOCK_OWNED(p);
    if (error == AIO_NOTCANCELED) {
	/*
	 * AIO_NOTCANCELED is returned when we find an aio request for this process
	 * and file descriptor on the active async IO queue. Active requests cannot
	 * be cancelled so we must wait for them to complete. We will get a special
	 * wake up call on our channel used to sleep for ALL active requests to
	 * complete. This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
	 * when we must wait for all active aio requests.
	 */

	KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) | DBG_FUNC_NONE,
	    (int)p, fd, 0, 0, 0);

	while (aio_proc_active_requests_for_file(p, fd) > 0) {
	    msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_close", 0);
	}
    }
    aio_proc_unlock(p);

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_END,
	(int)p, fd, 0, 0, 0);
}
/*
 * aio_error - return the error status associated with the async IO
 * request referred to by uap->aiocbp. The error status is the errno
 * value that would be set by the corresponding IO request (read, write,
 * fdatasync, or sync).
 */
int
aio_error(proc_t p, struct aio_error_args *uap, int *retval)
{
    aio_workq_entry	*entryp;
    int			error;

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_START,
	(int)p, (int)uap->aiocbp, 0, 0, 0);

    /* see if there are any aios to check */
    if (aio_get_all_queues_count() < 1) {
	return EINVAL;
    }

    aio_proc_lock(p);
    error = EINVAL;

    /* look for a match on our queue of async IO requests that have completed */
    TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
	if (entryp->uaiocbp == uap->aiocbp) {
	    ASSERT_AIO_FROM_PROC(entryp, p);

	    aio_entry_lock_spin(entryp);
	    *retval = entryp->errorval;
	    error = 0;
	    aio_entry_unlock(entryp);
	    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) | DBG_FUNC_NONE,
		(int)p, (int)uap->aiocbp, *retval, 0, 0);
	    goto ExitRoutine;
	}
    }

    /* look for a match on our queue of active async IO requests */
    TAILQ_FOREACH(entryp, &p->p_aio_activeq, aio_proc_link) {
	if (entryp->uaiocbp == uap->aiocbp) {
	    ASSERT_AIO_FROM_PROC(entryp, p);
	    *retval = EINPROGRESS;
	    error = 0;
	    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) | DBG_FUNC_NONE,
		(int)p, (int)uap->aiocbp, *retval, 0, 0);
	    goto ExitRoutine;
	}
    }

ExitRoutine:
    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_END,
	(int)p, (int)uap->aiocbp, error, 0, 0);
    aio_proc_unlock(p);

    return error;
}
/*
 * aio_fsync - asynchronously force all IO operations associated
 * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
 * queued at the time of the call to the synchronized completion state.
 * NOTE - we do not support op O_DSYNC at this point since we do not support the
 * fdatasync() call.
 */
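/*
 * Illustrative userspace call sequence (sketch only, not part of this file;
 * assumes "cb" is an aiocb whose aio_fildes names an open, written file):
 *
 *	if (aio_fsync(O_SYNC, &cb) == 0) {
 *		while (aio_error(&cb) == EINPROGRESS)
 *			;                       // or wait in aio_suspend()
 *		(void)aio_return(&cb);          // 0 on success, -1 on error
 *	}
 */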
int
aio_fsync(proc_t p, struct aio_fsync_args *uap, int *retval)
{
    int		error;
    int		fsync_kind;

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_START,
	(int)p, (int)uap->aiocbp, uap->op, 0, 0);

    *retval = 0;
    /* 0 := O_SYNC for binary backward compatibility with Panther */
    if (uap->op == O_SYNC || uap->op == 0) {
	fsync_kind = AIO_FSYNC;
    } else if (uap->op == O_DSYNC) {
	fsync_kind = AIO_DSYNC;
    } else {
	*retval = -1;
	error = EINVAL;
	goto ExitRoutine;
    }

    error = aio_queue_async_request(p, uap->aiocbp, fsync_kind);
    if (error != 0) {
	*retval = -1;
    }

ExitRoutine:
    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_END,
	(int)p, (int)uap->aiocbp, error, 0, 0);

    return error;
}
/*
 * aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
 * file descriptor (uap->aiocbp->aio_fildes) into the buffer
 * (uap->aiocbp->aio_buf).
 */
int
aio_read(proc_t p, struct aio_read_args *uap, int *retval)
{
    int error;

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_START,
	(int)p, (int)uap->aiocbp, 0, 0, 0);

    *retval = 0;

    error = aio_queue_async_request(p, uap->aiocbp, AIO_READ);
    if (error != 0) {
	*retval = -1;
    }

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_END,
	(int)p, (int)uap->aiocbp, error, 0, 0);

    return error;
}
/*
 * aio_return - return the return status associated with the async IO
 * request referred to by uap->aiocbp. The return status is the value
 * that would be returned by the corresponding IO request (read, write,
 * fdatasync, or sync). This is where we release kernel resources
 * held for the async IO call associated with the given aiocb pointer.
 */
int
aio_return(proc_t p, struct aio_return_args *uap, user_ssize_t *retval)
{
    aio_workq_entry	*entryp;
    int			error = EINVAL;
    boolean_t		proc_lock_held = FALSE;

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_START,
	(int)p, (int)uap->aiocbp, 0, 0, 0);

    /* See if there are any entries to check */
    if (aio_get_all_queues_count() < 1) {
	goto ExitRoutine;
    }

    aio_proc_lock(p);
    proc_lock_held = TRUE;
    *retval = 0;

    /* look for a match on our queue of async IO requests that have completed */
    TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
	ASSERT_AIO_FROM_PROC(entryp, p);
	if (entryp->uaiocbp == uap->aiocbp) {
	    /* Done and valid for aio_return(), pull it off the list */
	    aio_proc_remove_done_locked(p, entryp);

	    /* Drop the proc lock, but keep the entry locked */
	    aio_entry_lock(entryp);
	    aio_proc_unlock(p);
	    proc_lock_held = FALSE;

	    *retval = entryp->returnval;
	    error = 0;

	    /* No references and off all lists, safe to free */
	    if (entryp->aio_refcount == 0) {
		aio_entry_unlock(entryp);
		aio_free_request(entryp);
	    } else {
		/* Whoever has the refcount will have to free it */
		entryp->flags |= AIO_DO_FREE;
		aio_entry_unlock(entryp);
	    }

	    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) | DBG_FUNC_NONE,
		(int)p, (int)uap->aiocbp, *retval, 0, 0);
	    goto ExitRoutine;
	}
    }

    /* look for a match on our queue of active async IO requests */
    TAILQ_FOREACH(entryp, &p->p_aio_activeq, aio_proc_link) {
	ASSERT_AIO_FROM_PROC(entryp, p);
	if (entryp->uaiocbp == uap->aiocbp) {
	    error = EINPROGRESS;
	    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) | DBG_FUNC_NONE,
		(int)p, (int)uap->aiocbp, *retval, 0, 0);
	    goto ExitRoutine;
	}
    }

ExitRoutine:
    if (proc_lock_held) {
	aio_proc_unlock(p);
    }

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_END,
	(int)p, (int)uap->aiocbp, error, 0, 0);

    return error;
}
/*
 * _aio_exec - internal function used to clean up async IO requests for
 * a process that is going away due to exec(). We cancel any async IOs
 * we can and wait for those already active. We also disable signaling
 * for cancelled or active aio requests that complete.
 * This routine MAY block!
 */
__private_extern__ void
_aio_exec(proc_t p)
{
    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_START,
	(int)p, 0, 0, 0, 0);

    _aio_exit(p);

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_END,
	(int)p, 0, 0, 0, 0);
}
/*
 * _aio_exit - internal function used to clean up async IO requests for
 * a process that is terminating (via exit() or exec()). We cancel any async IOs
 * we can and wait for those already active. We also disable signaling
 * for cancelled or active aio requests that complete. This routine MAY block!
 */
__private_extern__ void
_aio_exit(proc_t p)
{
    int			error;
    aio_workq_entry	*entryp;

    /* quick check to see if there are any async IO requests queued up */
    if (aio_get_all_queues_count() < 1) {
	return;
    }

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_START,
	(int)p, 0, 0, 0, 0);

    aio_proc_lock(p);

    /*
     * cancel async IO requests on the todo work queue and wait for those
     * already active to complete.
     */
    error = do_aio_cancel_locked(p, 0, 0, AIO_EXIT_WAIT, TRUE);
    ASSERT_AIO_PROC_LOCK_OWNED(p);
    if (error == AIO_NOTCANCELED) {
	/*
	 * AIO_NOTCANCELED is returned when we find an aio request for this process
	 * on the active async IO queue. Active requests cannot be cancelled so we
	 * must wait for them to complete. We will get a special wake up call on
	 * our channel used to sleep for ALL active requests to complete. This sleep
	 * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
	 * active aio requests.
	 */

	KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) | DBG_FUNC_NONE,
	    (int)p, 0, 0, 0, 0);

	while (p->p_aio_active_count != 0) {
	    msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_exit", 0);
	}
    }

    if (p->p_aio_active_count != 0) {
	panic("Exiting process has %d active AIOs after cancellation has completed.\n", p->p_aio_active_count);
    }

    /* release all aio resources used by this process */
    entryp = TAILQ_FIRST(&p->p_aio_doneq);
    while (entryp != NULL) {
	ASSERT_AIO_FROM_PROC(entryp, p);
	aio_workq_entry		*next_entryp;

	next_entryp = TAILQ_NEXT(entryp, aio_proc_link);
	aio_proc_remove_done_locked(p, entryp);

	/* we cannot free requests that are still completing */
	aio_entry_lock_spin(entryp);
	if (entryp->aio_refcount == 0) {
	    aio_entry_unlock(entryp);
	    aio_free_request(entryp);

	    /* need to start over since aio_doneq may have been */
	    /* changed while we were away. */
	    entryp = TAILQ_FIRST(&p->p_aio_doneq);
	    continue;
	} else {
	    /* whoever has the reference will have to do the free */
	    entryp->flags |= AIO_DO_FREE;
	}

	aio_entry_unlock(entryp);
	entryp = next_entryp;
    }

    aio_proc_unlock(p);

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END,
	(int)p, 0, 0, 0, 0);
}
static boolean_t
should_cancel(aio_workq_entry *entryp, user_addr_t aiocbp, int fd)
{
    if ((aiocbp == USER_ADDR_NULL && fd == 0) ||
	(aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
	(aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes)) {
	return TRUE;
    }

    return FALSE;
}
/*
 * do_aio_cancel_locked - cancel async IO requests (if possible). We get called by
 * aio_cancel, close, and at exit.
 * There are three modes of operation: 1) cancel all async IOs for a process -
 * fd is 0 and aiocbp is NULL 2) cancel all async IOs for a file descriptor - fd
 * is > 0 and aiocbp is NULL 3) cancel one async IO associated with the given
 * aiocbp.
 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
 * target async IO requests, and AIO_ALLDONE if all target async IO requests
 * were already complete.
 * WARNING - do not dereference aiocbp in this routine, it may point to user
 * land data that has not been copied in (when called from aio_cancel()).
 *
 * Called with proc locked, and returns the same way.
 */
static int
do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp,
    int wait_for_completion, boolean_t disable_notification)
{
    ASSERT_AIO_PROC_LOCK_OWNED(p);

    aio_workq_entry	*entryp;
    int			result;

    result = -1;

    /* look for a match on our queue of async todo work. */
    entryp = TAILQ_FIRST(&p->p_aio_activeq);
    while (entryp != NULL) {
	ASSERT_AIO_FROM_PROC(entryp, p);
	aio_workq_entry		*next_entryp;

	next_entryp = TAILQ_NEXT(entryp, aio_proc_link);
	if (!should_cancel(entryp, aiocbp, fd)) {
	    entryp = next_entryp;
	    continue;
	}

	/* Can only be cancelled if it's still on a work queue */
	if (aio_entry_try_workq_remove(entryp) != 0) {
	    /* Have removed from workq. Update entry state and take a ref */
	    aio_entry_update_for_cancel(entryp, TRUE, 0, disable_notification);

	    /* Put on the proc done queue and update counts, then unlock the proc */
	    aio_proc_move_done_locked(p, entryp);
	    aio_proc_unlock(p);

	    /* Now it's officially cancelled. Do the completion */
	    result = AIO_CANCELED;
	    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) | DBG_FUNC_NONE,
		(int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0);
	    do_aio_completion(entryp);

	    /* This will free if the aio_return() has already happened ... */
	    aio_entry_unref(entryp);
	    aio_proc_lock(p);

	    if (aiocbp != USER_ADDR_NULL) {
		return result;
	    }

	    /*
	     * Restart from the head of the proc active queue since it
	     * may have been changed while we were away doing completion
	     * processing.
	     *
	     * Note that if we found an uncancellable AIO before, we will
	     * either find it again or discover that it's been completed,
	     * so resetting the result will not cause us to return success
	     * despite outstanding AIOs.
	     */
	    entryp = TAILQ_FIRST(&p->p_aio_activeq);
	    result = -1; /* As if beginning anew */
	} else {
	    /*
	     * It's been taken off the active queue already, i.e. is in flight.
	     * All we can do is ask for notification.
	     */
	    result = AIO_NOTCANCELED;

	    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE,
		(int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0);

	    /* Mark for waiting and such; will not take a ref if "cancelled" arg is FALSE */
	    aio_entry_update_for_cancel(entryp, FALSE, wait_for_completion, disable_notification);

	    if (aiocbp != USER_ADDR_NULL) {
		return result;
	    }
	    entryp = next_entryp;
	}
    } /* while... */

    /*
     * if we didn't find any matches on the todo or active queues then look for a
     * match on our queue of async IO requests that have completed and if found
     * return AIO_ALLDONE result.
     *
     * Proc AIO lock is still held.
     */
    if (result == -1) {
	TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
	    ASSERT_AIO_FROM_PROC(entryp, p);
	    if (should_cancel(entryp, aiocbp, fd)) {
		result = AIO_ALLDONE;
		KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE,
		    (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0);

		if (aiocbp != USER_ADDR_NULL) {
		    return result;
		}
	    }
	}
    }

    return result;
} /* do_aio_cancel_locked */
/*
 * aio_suspend - suspend the calling thread until at least one of the async
 * IO operations referenced by uap->aiocblist has completed, until a signal
 * interrupts the function, or uap->timeoutp time interval (optional) has
 * elapsed.
 * Returns 0 if one or more async IOs have completed else -1 and errno is
 * set appropriately - EAGAIN if timeout elapses or EINTR if an interrupt
 * occurs.
 */
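/*
 * Illustrative userspace wait (sketch only, not part of this file; assumes
 * "cb" is an aiocb already queued with aio_read()/aio_write()):
 *
 *	const struct aiocb *list[1] = { &cb };
 *	struct timespec timeout = { .tv_sec = 5, .tv_nsec = 0 };
 *
 *	if (aio_suspend(list, 1, &timeout) == -1) {
 *		// errno is EAGAIN on timeout, EINTR if a signal interrupted us
 *	}
 */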
int
aio_suspend(proc_t p, struct aio_suspend_args *uap, int *retval)
{
    __pthread_testcancel(1);
    return aio_suspend_nocancel(p, (struct aio_suspend_nocancel_args *)uap, retval);
}


int
aio_suspend_nocancel(proc_t p, struct aio_suspend_nocancel_args *uap, int *retval)
{
    int			error;
    int			i;
    int			count;
    uint64_t		abstime;
    struct user_timespec ts;
    aio_workq_entry	*entryp;
    user_addr_t		*aiocbpp;

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START,
	(int)p, uap->nent, 0, 0, 0);

    *retval = -1;
    abstime = 0;
    aiocbpp = NULL;

    count = aio_get_all_queues_count();
    if (count < 1) {
	error = EINVAL;
	goto ExitThisRoutine;
    }

    if (uap->nent < 1 || uap->nent > aio_max_requests_per_process) {
	error = EINVAL;
	goto ExitThisRoutine;
    }

    if (uap->timeoutp != USER_ADDR_NULL) {
	if (proc_is64bit(p)) {
	    struct user64_timespec temp;
	    error = copyin(uap->timeoutp, &temp, sizeof(temp));
	    if (error == 0) {
		ts.tv_sec = temp.tv_sec;
		ts.tv_nsec = temp.tv_nsec;
	    }
	} else {
	    struct user32_timespec temp;
	    error = copyin(uap->timeoutp, &temp, sizeof(temp));
	    if (error == 0) {
		ts.tv_sec = temp.tv_sec;
		ts.tv_nsec = temp.tv_nsec;
	    }
	}
	if (error != 0) {
	    error = EAGAIN;
	    goto ExitThisRoutine;
	}

	if (ts.tv_sec < 0 || ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000) {
	    error = EINVAL;
	    goto ExitThisRoutine;
	}

	nanoseconds_to_absolutetime((uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
	    &abstime);
	clock_absolutetime_interval_to_deadline(abstime, &abstime);
    }

    aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent);
    if (aiocbpp == NULL) {
	error = EAGAIN;
	goto ExitThisRoutine;
    }

    /* check list of aio requests to see if any have completed */
check_for_our_aiocbp:
    aio_proc_lock_spin(p);
    for (i = 0; i < uap->nent; i++) {
	user_addr_t	aiocbp;

	/* NULL elements are legal so check for 'em */
	aiocbp = *(aiocbpp + i);
	if (aiocbp == USER_ADDR_NULL) {
	    continue;
	}

	/* return immediately if any aio request in the list is done */
	TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
	    ASSERT_AIO_FROM_PROC(entryp, p);
	    if (entryp->uaiocbp == aiocbp) {
		aio_proc_unlock(p);
		*retval = 0;
		error = 0;
		goto ExitThisRoutine;
	    }
	}
    } /* for ( ; i < uap->nent; ) */

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE,
	(int)p, uap->nent, 0, 0, 0);

    /*
     * wait for an async IO to complete or a signal fires or timeout expires.
     * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
     * interrupts us. If an async IO completes before a signal fires or our
     * timeout expires, we get a wakeup call from aio_work_thread().
     */

    error = msleep1(&p->AIO_SUSPEND_SLEEP_CHAN, aio_proc_mutex(p),
	PCATCH | PWAIT | PDROP, "aio_suspend", abstime); /* XXX better priority? */
    if (error == 0) {
	/*
	 * got our wakeup call from aio_work_thread().
	 * Since we can get a wakeup on this channel from another thread in the
	 * same process we head back up to make sure this is for the correct aiocbp.
	 * If it is the correct aiocbp we will return from where we do the check
	 * (see entryp->uaiocbp == aiocbp after check_for_our_aiocbp label)
	 * else we will fall out and just sleep again.
	 */
	goto check_for_our_aiocbp;
    } else if (error == EWOULDBLOCK) {
	/* our timeout expired */
	error = EAGAIN;
    } else {
	/* we were interrupted */
	error = EINTR;
    }

ExitThisRoutine:
    if (aiocbpp != NULL) {
	FREE(aiocbpp, M_TEMP);
    }

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_END,
	(int)p, uap->nent, error, 0, 0);

    return error;
}
/*
 * aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
 * file descriptor (uap->aiocbp->aio_fildes) from the buffer
 * (uap->aiocbp->aio_buf).
 */
int
aio_write(proc_t p, struct aio_write_args *uap, int *retval)
{
    int error;

    *retval = 0;

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_START,
	(int)p, (int)uap->aiocbp, 0, 0, 0);

    error = aio_queue_async_request(p, uap->aiocbp, AIO_WRITE);
    if (error != 0) {
	*retval = -1;
    }

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_END,
	(int)p, (int)uap->aiocbp, error, 0, 0);

    return error;
}
static user_addr_t *
aio_copy_in_list(proc_t procp, user_addr_t aiocblist, int nent)
{
    user_addr_t	*aiocbpp;
    int		i, result;

    /* we reserve enough space for largest possible pointer size */
    MALLOC(aiocbpp, user_addr_t *, (nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK);
    if (aiocbpp == NULL) {
	goto err;
    }

    /* copyin our aiocb pointers from list */
    result = copyin(aiocblist, aiocbpp,
	proc_is64bit(procp) ? (nent * sizeof(user64_addr_t))
	: (nent * sizeof(user32_addr_t)));
    if (result) {
	FREE(aiocbpp, M_TEMP);
	aiocbpp = NULL;
	goto err;
    }

    /*
     * We depend on a list of user_addr_t's so we need to
     * munge and expand when these pointers came from a
     * 32-bit process.
     */
    if (!proc_is64bit(procp)) {
	/* copy from last to first to deal with overlap */
	user32_addr_t	*my_ptrp = ((user32_addr_t *)aiocbpp) + (nent - 1);
	user_addr_t	*my_addrp = aiocbpp + (nent - 1);

	for (i = 0; i < nent; i++, my_ptrp--, my_addrp--) {
	    *my_addrp = (user_addr_t) (*my_ptrp);
	}
    }

err:
    return aiocbpp;
}
static int
aio_copy_in_sigev(proc_t procp, user_addr_t sigp, struct user_sigevent *sigev)
{
    int	result = 0;

    if (sigp == USER_ADDR_NULL) {
	goto out;
    }

    /*
     * We need to munge aio_sigevent since it contains pointers.
     * Since we do not know if sigev_value is an int or a ptr we do
     * NOT cast the ptr to a user_addr_t. This means if we send
     * this info back to user space we need to remember sigev_value
     * was not expanded for the 32-bit case.
     *
     * Notes: This does NOT affect us since we don't support
     * sigev_value yet in the aio context.
     */
    if (proc_is64bit(procp)) {
	struct user64_sigevent sigevent64;

	result = copyin(sigp, &sigevent64, sizeof(sigevent64));
	if (result == 0) {
	    sigev->sigev_notify = sigevent64.sigev_notify;
	    sigev->sigev_signo = sigevent64.sigev_signo;
	    sigev->sigev_value.size_equivalent.sival_int = sigevent64.sigev_value.size_equivalent.sival_int;
	    sigev->sigev_notify_function = sigevent64.sigev_notify_function;
	    sigev->sigev_notify_attributes = sigevent64.sigev_notify_attributes;
	}
    } else {
	struct user32_sigevent sigevent32;

	result = copyin(sigp, &sigevent32, sizeof(sigevent32));
	if (result == 0) {
	    sigev->sigev_notify = sigevent32.sigev_notify;
	    sigev->sigev_signo = sigevent32.sigev_signo;
	    sigev->sigev_value.size_equivalent.sival_int = sigevent32.sigev_value.sival_int;
	    sigev->sigev_notify_function = CAST_USER_ADDR_T(sigevent32.sigev_notify_function);
	    sigev->sigev_notify_attributes = CAST_USER_ADDR_T(sigevent32.sigev_notify_attributes);
	}
    }

    if (result != 0) {
	result = EAGAIN;
    }

out:
    return result;
}
/*
 * validate user_sigevent. at this point we only support
 * sigev_notify equal to SIGEV_SIGNAL or SIGEV_NONE. this means
 * sigev_value, sigev_notify_function, and sigev_notify_attributes
 * are ignored, since SIGEV_THREAD is unsupported. This is consistent
 * with no [RTS] (RealTime Signal) option group support.
 */
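/*
 * Illustrative userspace setup accepted by this validation (sketch only,
 * not part of this file): request SIGUSR1 on completion of an aiocb "cb".
 *
 *	cb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
 *	cb.aio_sigevent.sigev_signo  = SIGUSR1;
 *	// SIGEV_THREAD (sigev_notify_function) would be rejected here
 */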
static int
aio_sigev_validate(const struct user_sigevent *sigev)
{
    switch (sigev->sigev_notify) {
    case SIGEV_SIGNAL:
    {
	int signum;

	/* make sure we have a valid signal number */
	signum = sigev->sigev_signo;
	if (signum <= 0 || signum >= NSIG ||
	    signum == SIGKILL || signum == SIGSTOP) {
	    return EINVAL;
	}
    }
    break;

    case SIGEV_NONE:
	break;

    case SIGEV_THREAD:
    /* Unsupported [RTS] */

    default:
	return EINVAL;
    }

    return 0;
}
/*
 * Queue up the entry on the aio asynchronous work queue in priority order
 * based on the relative priority of the request. We calculate the relative
 * priority using the nice value of the caller and the value of the request
 * (aio_reqprio).
 *
 * Parameters:	procp			Process queueing the I/O
 *		entryp			The work queue entry being queued
 *
 * Returns:	(void)			No failure modes
 *
 * Notes:	This function is used for both lio_listio and aio
 *
 *	XXX:	At some point, we may have to consider thread priority
 *		rather than process priority, but we don't maintain the
 *		adjusted priority for threads the POSIX way.
 *
 * Called with proc locked.
 */
static void
aio_enqueue_work(proc_t procp, aio_workq_entry *entryp, int proc_locked)
{
    aio_workq_entry	*my_entryp;	/* used for insertion sort */
    aio_workq_t		queue = aio_entry_workq(entryp);

    if (proc_locked == 0) {
	aio_proc_lock(procp);
    }

    ASSERT_AIO_PROC_LOCK_OWNED(procp);

    /* Onto proc queue */
    TAILQ_INSERT_TAIL(&procp->p_aio_activeq, entryp, aio_proc_link);
    procp->p_aio_active_count++;
    procp->p_aio_total_count++;

    /* And work queue */
    aio_workq_lock_spin(queue);
    aio_workq_add_entry_locked(queue, entryp);
    waitq_wakeup64_one(&queue->aioq_waitq, CAST_EVENT64_T(queue),
	THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
    aio_workq_unlock(queue);

    if (proc_locked == 0) {
	aio_proc_unlock(procp);
    }

    /*
     * (1) The nice value is in the range PRIO_MIN..PRIO_MAX [-20..20]
     * (2) The normalized nice value is in the range 0..((2 * NZERO) - 1)
     *     which is [0..39], with 0 not being used. In nice values, the
     *     lower the nice value, the higher the priority.
     * (3) The normalized scheduling priority is the highest nice value
     *     minus the current nice value. In I/O scheduling priority, the
     *     higher the value the lower the priority, so it is the inverse
     *     of the nice value (the higher the number, the higher the I/O
     *     priority).
     * (4) From the normalized scheduling priority, we subtract the
     *     request priority to get the request priority value number;
     *     this means that requests are only capable of depressing their
     *     priority relative to other requests.
     */
    entryp->priority = (((2 * NZERO) - 1) - procp->p_nice);

    /* only permit depressing the priority */
    if (entryp->aiocb.aio_reqprio < 0) {
	entryp->aiocb.aio_reqprio = 0;
    }
    if (entryp->aiocb.aio_reqprio > 0) {
	entryp->priority -= entryp->aiocb.aio_reqprio;
	if (entryp->priority < 0) {
	    entryp->priority = 0;
	}
    }
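    /*
     * Worked example of the calculation above (illustrative comment only):
     * with NZERO == 20, a caller at nice 0 gets a normalized priority of
     * ((2 * 20) - 1) - 0 == 39; an aio_reqprio of 5 then depresses that to
     * 34, and a request can never raise itself above its process's value.
     */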
    /* Insertion sort the entry; lowest ->priority to highest */
    TAILQ_FOREACH(my_entryp, &aio_anchor.aio_async_workq, aio_workq_link) {
	if (entryp->priority <= my_entryp->priority) {
	    TAILQ_INSERT_BEFORE(my_entryp, entryp, aio_workq_link);
	    break;
	}
    }
    if (my_entryp == NULL) {
	TAILQ_INSERT_TAIL(&aio_anchor.aio_async_workq, entryp, aio_workq_link);
    }
}
/*
 * lio_listio - initiate a list of IO requests. We process the list of
 * aiocbs either synchronously (mode == LIO_WAIT) or asynchronously
 * (mode == LIO_NOWAIT).
 *
 * The caller gets error and return status for each aiocb in the list
 * via aio_error and aio_return. We must keep completed requests until
 * released by the aio_return call.
 */
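/*
 * Illustrative userspace call (sketch only, not part of this file; assumes
 * two aiocbs "rd" and "wr" filled in with LIO_READ / LIO_WRITE opcodes):
 *
 *	struct aiocb *list[2] = { &rd, &wr };
 *
 *	if (lio_listio(LIO_WAIT, list, 2, NULL) == 0) {
 *		// both IOs are complete; reap each with aio_return()
 *		ssize_t r = aio_return(&rd);
 *		ssize_t w = aio_return(&wr);
 *	}
 */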
int
lio_listio(proc_t p, struct lio_listio_args *uap, int *retval)
{
    int			i;
    int			call_result;
    int			result;
    int			old_count;
    aio_workq_entry	**entryp_listp;
    user_addr_t		*aiocbpp;
    struct user_sigevent aiosigev;
    aio_lio_context	*lio_context;
    boolean_t		free_context = FALSE;
    uint32_t		*paio_offset;
    uint32_t		*paio_nbytes;

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START,
	(int)p, uap->nent, uap->mode, 0, 0);

    entryp_listp = NULL;
    lio_context = NULL;
    aiocbpp = NULL;
    call_result = -1;
    *retval = -1;

    if (!(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT)) {
	call_result = EINVAL;
	goto ExitRoutine;
    }
    if (uap->nent < 1 || uap->nent > AIO_LISTIO_MAX) {
	call_result = EINVAL;
	goto ExitRoutine;
    }

    /*
     * allocate a list of aio_workq_entry pointers that we will use
     * to queue up all our requests at once while holding our lock.
     */
    MALLOC(entryp_listp, void *, (uap->nent * sizeof(aio_workq_entry *)), M_TEMP, M_WAITOK);
    if (entryp_listp == NULL) {
	call_result = EAGAIN;
	goto ExitRoutine;
    }

    MALLOC(lio_context, aio_lio_context*, sizeof(aio_lio_context), M_TEMP, M_WAITOK);
    if (lio_context == NULL) {
	call_result = EAGAIN;
	goto ExitRoutine;
    }

    OSIncrementAtomic(&lio_contexts_alloced);

    free_context = TRUE;
    bzero(lio_context, sizeof(aio_lio_context));

    aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent);
    if (aiocbpp == NULL) {
	call_result = EAGAIN;
	goto ExitRoutine;
    }

    /*
     * Use sigevent passed in to lio_listio for each of our calls, but
     * only do completion notification after the last request completes.
     */
    bzero(&aiosigev, sizeof(aiosigev));
    /* Only copy in an sigev if the user supplied one */
    if (uap->sigp != USER_ADDR_NULL) {
	call_result = aio_copy_in_sigev(p, uap->sigp, &aiosigev);
	if (call_result) {
	    goto ExitRoutine;
	}
	call_result = aio_sigev_validate(&aiosigev);
	if (call_result) {
	    goto ExitRoutine;
	}
    }

    /* process list of aio requests */
    free_context = FALSE;
    lio_context->io_issued = uap->nent;
    lio_context->io_waiter = uap->mode == LIO_WAIT ? 1 : 0; /* Should it be freed by last AIO */
    for (i = 0; i < uap->nent; i++) {
	user_addr_t		my_aiocbp;
	aio_workq_entry		*entryp;

	*(entryp_listp + i) = NULL;
	my_aiocbp = *(aiocbpp + i);

	/* NULL elements are legal so check for 'em */
	if (my_aiocbp == USER_ADDR_NULL) {
	    aio_proc_lock_spin(p);
	    lio_context->io_issued--;
	    aio_proc_unlock(p);
	    continue;
	}

	/*
	 * We use lio_context to mark IO requests for delayed completion
	 * processing which means we wait until all IO requests in the
	 * group have completed before we either return to the caller
	 * when mode is LIO_WAIT or signal user when mode is LIO_NOWAIT.
	 *
	 * We use the address of the lio_context for this, since it is
	 * unique in the address space.
	 */
	result = lio_create_entry(p, my_aiocbp, lio_context, (entryp_listp + i));
	if (result != 0 && call_result == -1) {
	    call_result = result;
	}

	/* NULL elements are legal so check for 'em */
	entryp = *(entryp_listp + i);
	if (entryp == NULL) {
	    aio_proc_lock_spin(p);
	    lio_context->io_issued--;
	    aio_proc_unlock(p);
	    continue;
	}

	if (uap->mode == LIO_NOWAIT) {
	    /* Set signal handler, if any */
	    entryp->aiocb.aio_sigevent = aiosigev;
	} else {
	    /* flag that this thread blocks pending completion */
	    entryp->flags |= AIO_LIO_NOTIFY;
	}

	/* check our aio limits to throttle bad or rude user land behavior */
	old_count = aio_increment_total_count();

	aio_proc_lock_spin(p);
	if (old_count >= aio_max_requests ||
	    aio_get_process_count(entryp->procp) >= aio_max_requests_per_process ||
	    is_already_queued(entryp->procp, entryp->uaiocbp) == TRUE) {
	    lio_context->io_issued--;
	    aio_proc_unlock(p);

	    aio_decrement_total_count();

	    if (call_result == -1) {
		call_result = EAGAIN;
	    }
	    aio_free_request(entryp);
	    entryp_listp[i] = NULL;
	    continue;
	}

	lck_mtx_convert_spin(aio_proc_mutex(p));
	aio_enqueue_work(p, entryp, 1);
	aio_proc_unlock(p);

	KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_START,
	    (int)p, (int)entryp->uaiocbp, entryp->flags, entryp->aiocb.aio_fildes, 0);
	paio_offset = (uint32_t*) &entryp->aiocb.aio_offset;
	paio_nbytes = (uint32_t*) &entryp->aiocb.aio_nbytes;
	KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_END,
	    paio_offset[0], (sizeof(entryp->aiocb.aio_offset) == sizeof(uint64_t) ? paio_offset[1] : 0),
	    paio_nbytes[0], (sizeof(entryp->aiocb.aio_nbytes) == sizeof(uint64_t) ? paio_nbytes[1] : 0),
	    0);
    }

    aio_proc_lock_spin(p);
    switch (uap->mode) {
    case LIO_WAIT:
	while (lio_context->io_completed < lio_context->io_issued) {
	    result = msleep(lio_context, aio_proc_mutex(p), PCATCH | PRIBIO | PSPIN, "lio_listio", 0);

	    /* If we were interrupted, fail out (even if all finished) */
	    if (result != 0) {
		call_result = EINTR;
		lio_context->io_waiter = 0;
		break;
	    }
	}

	/* If all IOs have finished must free it */
	if (lio_context->io_completed == lio_context->io_issued) {
	    free_context = TRUE;
	}
	break;

    case LIO_NOWAIT:
	/* If no IOs were issued must free it (rdar://problem/45717887) */
	if (lio_context->io_issued == 0) {
	    free_context = TRUE;
	}
	break;
    }
    aio_proc_unlock(p);

    /* call_result == -1 means we had no trouble queueing up requests */
    if (call_result == -1) {
	call_result = 0;
	*retval = 0;
    }

ExitRoutine:
    if (entryp_listp != NULL) {
	FREE(entryp_listp, M_TEMP);
    }
    if (aiocbpp != NULL) {
	FREE(aiocbpp, M_TEMP);
    }
    if (free_context) {
	free_lio_context(lio_context);
    }

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END,
	(int)p, call_result, 0, 0, 0);

    return call_result;
}
/*
 * aio worker thread. this is where all the real work gets done.
 * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
 * after new work is queued up.
 */
__attribute__((noreturn))
static void
aio_work_thread(void)
{
    aio_workq_entry	*entryp;
    int			error;
    vm_map_t		currentmap;
    vm_map_t		oldmap = VM_MAP_NULL;
    task_t		oldaiotask = TASK_NULL;
    struct uthread	*uthreadp = NULL;

    for (;;) {
	/*
	 * returns with the entry ref'ed.
	 * sleeps until work is available.
	 */
	entryp = aio_get_some_work();

	KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START,
	    (int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0);

	/*
	 * Assume the target's address space identity for the duration
	 * of the IO. Note: don't need to have the entryp locked,
	 * because the proc and map don't change until it's freed.
	 */
	currentmap = get_task_map((current_proc())->task);
	if (currentmap != entryp->aio_map) {
	    uthreadp = (struct uthread *) get_bsdthread_info(current_thread());
	    oldaiotask = uthreadp->uu_aio_task;
	    uthreadp->uu_aio_task = entryp->procp->task;
	    oldmap = vm_map_switch(entryp->aio_map);
	}

	if ((entryp->flags & AIO_READ) != 0) {
	    error = do_aio_read(entryp);
	} else if ((entryp->flags & AIO_WRITE) != 0) {
	    error = do_aio_write(entryp);
	} else if ((entryp->flags & (AIO_FSYNC | AIO_DSYNC)) != 0) {
	    error = do_aio_fsync(entryp);
	} else {
	    printf("%s - unknown aio request - flags 0x%02X \n",
		__FUNCTION__, entryp->flags);
	    error = EINVAL;
	}

	/* Restore old map */
	if (currentmap != entryp->aio_map) {
	    (void) vm_map_switch(oldmap);
	    uthreadp->uu_aio_task = oldaiotask;
	}

	KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END,
	    (int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval,
	    entryp->returnval, 0);

	aio_entry_lock_spin(entryp);
	entryp->errorval = error;
	aio_entry_unlock(entryp);

	/* we're done with the IO request so pop it off the active queue and */
	/* push it on the done queue */
	aio_proc_lock(entryp->procp);
	aio_proc_move_done_locked(entryp->procp, entryp);
	aio_proc_unlock(entryp->procp);

	OSDecrementAtomic(&aio_anchor.aio_inflight_count);

	/* remove our reference to the user land map. */
	if (VM_MAP_NULL != entryp->aio_map) {
	    vm_map_t	my_map;

	    my_map = entryp->aio_map;
	    entryp->aio_map = VM_MAP_NULL;
	    vm_map_deallocate(my_map);
	}

	/* Provide notifications */
	do_aio_completion(entryp);

	/* Will free if needed */
	aio_entry_unref(entryp);
    }
} /* aio_work_thread */
/*
 * aio_get_some_work - get the next async IO request that is ready to be executed.
 * aio_fsync complicates matters a bit since we cannot do the fsync until all async
 * IO requests at the time the aio_fsync call came in have completed.
 * NOTE - AIO_LOCK must be held by caller
 */
static aio_workq_entry *
aio_get_some_work( void )
{
    aio_workq_entry	*entryp = NULL;
    aio_workq_t		queue = NULL;

    /* Just one queue for the moment. In the future there will be many. */
    queue = &aio_anchor.aio_async_workqs[0];
    aio_workq_lock_spin(queue);
    if (queue->aioq_count == 0) {
	goto nowork;
    }

    /*
     * Hold the queue lock.
     *
     * pop some work off the work queue and add to our active queue
     * Always start with the queue lock held.
     */
    for (;;) {
	/*
	 * Pull off of work queue. Once it's off, it can't be cancelled,
	 * so we can take our ref once we drop the queue lock.
	 */
	entryp = TAILQ_FIRST(&queue->aioq_entries);

	/*
	 * If there's no work or only fsyncs that need delay, go to sleep
	 * and then start anew from aio_work_thread
	 */
	if (entryp == NULL) {
	    goto nowork;
	}

	aio_workq_remove_entry_locked(queue, entryp);

	aio_workq_unlock(queue);

	/*
	 * Check if it's an fsync that must be delayed. No need to lock the entry;
	 * that flag would have been set at initialization.
	 */
	if ((entryp->flags & AIO_FSYNC) != 0) {
	    /*
	     * Check for unfinished operations on the same file
	     * in this proc's queue.
	     */
	    aio_proc_lock_spin(entryp->procp);
	    if (aio_delay_fsync_request(entryp)) {
		/* It needs to be delayed. Put it back on the end of the work queue */
		KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) | DBG_FUNC_NONE,
		    (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0);

		aio_proc_unlock(entryp->procp);

		aio_workq_lock_spin(queue);
		aio_workq_add_entry_locked(queue, entryp);
		continue;
	    }
	    aio_proc_unlock(entryp->procp);
	}

	break;
    }

    aio_entry_ref(entryp);

    OSIncrementAtomic(&aio_anchor.aio_inflight_count);
    return entryp;

nowork:
    /* We will wake up when someone enqueues something */
    waitq_assert_wait64(&queue->aioq_waitq, CAST_EVENT64_T(queue), THREAD_UNINT, 0);
    aio_workq_unlock(queue);
    thread_block((thread_continue_t)aio_work_thread);

    /* NOT REACHED */
    return NULL;
}
/*
 * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed.
 * A big, simple hammer: only send it off if it's the most recently filed IO which has
 * not been completed.
 */
static boolean_t
aio_delay_fsync_request(aio_workq_entry *entryp)
{
    if (entryp == TAILQ_FIRST(&entryp->procp->p_aio_activeq)) {
	return FALSE;
    }

    return TRUE;
} /* aio_delay_fsync_request */
static aio_workq_entry *
aio_create_queue_entry(proc_t procp, user_addr_t aiocbp, void *group_tag, int kindOfIO)
{
    aio_workq_entry	*entryp;
    int			result = 0;

    entryp = (aio_workq_entry *) zalloc(aio_workq_zonep);
    if (entryp == NULL) {
	result = EAGAIN;
	goto error_exit;
    }

    bzero(entryp, sizeof(*entryp));

    /* fill in the rest of the aio_workq_entry */
    entryp->procp = procp;
    entryp->uaiocbp = aiocbp;
    entryp->flags |= kindOfIO;
    entryp->group_tag = group_tag;
    entryp->aio_map = VM_MAP_NULL;
    entryp->aio_refcount = 0;

    if (proc_is64bit(procp)) {
	struct user64_aiocb aiocb64;

	result = copyin(aiocbp, &aiocb64, sizeof(aiocb64));
	if (result == 0) {
	    do_munge_aiocb_user64_to_user(&aiocb64, &entryp->aiocb);
	}
    } else {
	struct user32_aiocb aiocb32;

	result = copyin(aiocbp, &aiocb32, sizeof(aiocb32));
	if (result == 0) {
	    do_munge_aiocb_user32_to_user(&aiocb32, &entryp->aiocb);
	}
    }

    if (result != 0) {
	result = EAGAIN;
	goto error_exit;
    }

    /* get a reference to the user land map in order to keep it around */
    entryp->aio_map = get_task_map(procp->task);
    vm_map_reference(entryp->aio_map);

    /* do some more validation on the aiocb and embedded file descriptor */
    result = aio_validate(entryp);
    if (result != 0) {
	goto error_exit_with_ref;
    }

    /* get a reference on the current_thread, which is passed in vfs_context. */
    entryp->thread = current_thread();
    thread_reference(entryp->thread);
    return entryp;

error_exit_with_ref:
    if (VM_MAP_NULL != entryp->aio_map) {
	vm_map_deallocate(entryp->aio_map);
    }
error_exit:
    if (result && entryp != NULL) {
	zfree(aio_workq_zonep, entryp);
	entryp = NULL;
    }

    return entryp;
}
/*
 * aio_queue_async_request - queue up an async IO request on our work queue then
 * wake up one of our worker threads to do the actual work. We get a reference
 * to our caller's user land map in order to keep it around while we are
 * processing the request.
 */
static int
aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO)
{
    aio_workq_entry	*entryp;
    int			result;
    int			old_count;
    uint32_t		*paio_offset;
    uint32_t		*paio_nbytes;

    old_count = aio_increment_total_count();
    if (old_count >= aio_max_requests) {
	result = EAGAIN;
	goto error_noalloc;
    }

    entryp = aio_create_queue_entry(procp, aiocbp, 0, kindOfIO);
    if (entryp == NULL) {
	result = EAGAIN;
	goto error_noalloc;
    }

    aio_proc_lock_spin(procp);

    if (is_already_queued(entryp->procp, entryp->uaiocbp) == TRUE) {
	result = EAGAIN;
	goto error_exit;
    }

    /* check our aio limits to throttle bad or rude user land behavior */
    if (aio_get_process_count(procp) >= aio_max_requests_per_process) {
	printf("aio_queue_async_request(): too many in flight for proc: %d.\n", procp->p_aio_total_count);
	result = EAGAIN;
	goto error_exit;
    }

    /* Add the IO to proc and work queues, wake up threads as appropriate */
    lck_mtx_convert_spin(aio_proc_mutex(procp));
    aio_enqueue_work(procp, entryp, 1);

    aio_proc_unlock(procp);

    paio_offset = (uint32_t*) &entryp->aiocb.aio_offset;
    paio_nbytes = (uint32_t*) &entryp->aiocb.aio_nbytes;
    KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_START,
	(int)procp, (int)aiocbp, entryp->flags, entryp->aiocb.aio_fildes, 0);
    KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_END,
	paio_offset[0], (sizeof(entryp->aiocb.aio_offset) == sizeof(uint64_t) ? paio_offset[1] : 0),
	paio_nbytes[0], (sizeof(entryp->aiocb.aio_nbytes) == sizeof(uint64_t) ? paio_nbytes[1] : 0),
	0);

    return 0;

error_exit:
    /*
     * This entry has not been queued up so no worries about
     * unlocked state and aio_map
     */
    aio_proc_unlock(procp);
    aio_free_request(entryp);

error_noalloc:
    aio_decrement_total_count();

    return result;
} /* aio_queue_async_request */
/*
 * Allocate an aio_workq_entry and fill it in. If all goes well return 0
 * and pass the aio_workq_entry pointer back to our caller.
 *
 * Parameters:	procp			The process making the request
 *		aiocbp			The aio context buffer pointer
 *		group_tag		The group tag used to indicate a
 *					group of operations has completed
 *		entrypp			Pointer to the pointer to receive the
 *					address of the created aio_workq_entry
 *
 * Returns:	0			Successfully created
 *		EAGAIN			Try again (usually resource shortage)
 *
 * Notes:	We get a reference to our caller's user land map in order
 *		to keep it around while we are processing the request.
 *
 *		lio_listio calls behave differently at completion: they do
 *		completion notification when all async IO requests have
 *		completed. We use group_tag to tag IO requests that behave
 *		in the delay notification manner.
 *
 *		All synchronous operations are considered to not have a
 *		signal routine associated with them (sigp == USER_ADDR_NULL).
 */
static int
lio_create_entry(proc_t procp, user_addr_t aiocbp, void *group_tag,
    aio_workq_entry **entrypp)
{
    aio_workq_entry	*entryp;
    int			result;

    entryp = aio_create_queue_entry(procp, aiocbp, group_tag, AIO_LIO);
    if (entryp == NULL) {
	result = EAGAIN;
	goto error_exit;
    }

    /*
     * Look for lio_listio LIO_NOP requests and ignore them; this is
     * not really an error, but we need to free our aio_workq_entry.
     */
    if (entryp->aiocb.aio_lio_opcode == LIO_NOP) {
	result = 0;
	goto error_exit;
    }

    *entrypp = entryp;
    return 0;

error_exit:
    if (entryp != NULL) {
	/*
	 * This entry has not been queued up so no worries about
	 * unlocked state and aio_map
	 */
	aio_free_request(entryp);
    }

    return result;
} /* lio_create_entry */
/*
 * aio_free_request - remove our reference on the user land map and
 * free the work queue entry resources. The entry is off all lists
 * and has zero refcount, so no one can have a pointer to it.
 */
static int
aio_free_request(aio_workq_entry *entryp)
{
    /* remove our reference to the user land map. */
    if (VM_MAP_NULL != entryp->aio_map) {
	vm_map_deallocate(entryp->aio_map);
    }

    /* remove our reference to thread which enqueued the request */
    if (NULL != entryp->thread) {
	thread_deallocate(entryp->thread);
    }

    entryp->aio_refcount = -1; /* A bit of poisoning in case of bad refcounting. */

    zfree(aio_workq_zonep, entryp);

    return 0;
} /* aio_free_request */
2141 * validate the aiocb passed in by one of the aio syscalls.
2144 aio_validate( aio_workq_entry *entryp )
2146 	struct fileproc *fp;
2152 	if ((entryp->flags & AIO_LIO) != 0) {
2153 		if (entryp->aiocb.aio_lio_opcode == LIO_READ) {
2154 			entryp->flags |= AIO_READ;
2155 		} else if (entryp->aiocb.aio_lio_opcode == LIO_WRITE) {
2156 			entryp->flags |= AIO_WRITE;
2157 		} else if (entryp->aiocb.aio_lio_opcode == LIO_NOP) {
2165 	if ((entryp->flags & (AIO_WRITE | AIO_FSYNC | AIO_DSYNC)) != 0) {
2169 	if ((entryp->flags & (AIO_READ | AIO_WRITE)) != 0) {
2170 		if (entryp->aiocb.aio_nbytes > INT_MAX ||
2171 		    entryp->aiocb.aio_buf == USER_ADDR_NULL ||
2172 		    entryp->aiocb.aio_offset < 0) {
2177 	result = aio_sigev_validate(&entryp->aiocb.aio_sigevent);
2182 	/* validate the file descriptor and that the file was opened
2183 	 * for the appropriate read / write access.
2185 	proc_fdlock(entryp->procp);
2187 	result = fp_lookup( entryp->procp, entryp->aiocb.aio_fildes, &fp, 1);
2189 	if ((fp->f_fglob->fg_flag & flag) == 0) {
2190 		/* we don't have read or write access */
2192 	} else if (FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_VNODE) {
2193 		/* this is not a file */
2196 		fp->f_flags |= FP_AIOISSUED;
2199 	fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 1);
2204 	proc_fdunlock(entryp->procp);
2207 } /* aio_validate */
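/*
 * Illustrative user-space sketch (not part of this file): an aiocb that
 * satisfies the checks above -- a byte count no larger than INT_MAX, a
 * non-NULL buffer, a non-negative offset, and a descriptor opened for the
 * requested access.  The file name and buffer size are arbitrary example
 * values; validation failures simply surface as an error from aio_read().
 */
#if 0	/* example only; never compiled as part of this file */
#include <aio.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	static char buf[4096];
	struct aiocb cb;
	const struct aiocb *list[1];
	int fd = open("input.dat", O_RDONLY);	/* read access for an AIO_READ request */

	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_buf = buf;			/* must not be NULL */
	cb.aio_nbytes = sizeof(buf);		/* must not exceed INT_MAX */
	cb.aio_offset = 0;			/* must not be negative */

	if (aio_read(&cb) != 0) {
		perror("aio_read");
		return 1;
	}

	list[0] = &cb;
	while (aio_error(&cb) == EINPROGRESS) {
		aio_suspend(list, 1, NULL);	/* block until the request completes */
	}
	printf("transferred %zd bytes\n", aio_return(&cb));
	close(fd);
	return 0;
}
#endif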
2210 aio_increment_total_count()
2212 	return OSIncrementAtomic(&aio_anchor.aio_total_count);
2216 aio_decrement_total_count()
2218 	int old = OSDecrementAtomic(&aio_anchor.aio_total_count);
2220 		panic("Negative total AIO count!\n");
2227 aio_get_process_count(proc_t procp)
2229 	return procp->p_aio_total_count;
2230 } /* aio_get_process_count */
2233 aio_get_all_queues_count( void )
2235 	return aio_anchor.aio_total_count;
2236 } /* aio_get_all_queues_count */
2240 * do_aio_completion.  Handle async IO completion.
2243 do_aio_completion( aio_workq_entry *entryp )
2245 	boolean_t lastLioCompleted = FALSE;
2246 	aio_lio_context *lio_context = NULL;
2249 	lio_context = (aio_lio_context *)entryp->group_tag;
2251 	if (lio_context != NULL) {
2252 		aio_proc_lock_spin(entryp->procp);
2254 		/* Account for this I/O completing. */
2255 		lio_context->io_completed++;
2257 		/* Are we done with this lio context? */
2258 		if (lio_context->io_issued == lio_context->io_completed) {
2259 			lastLioCompleted = TRUE;
2262 		waiter = lio_context->io_waiter;
2264 		/* explicit wakeup of lio_listio() waiting in LIO_WAIT */
2265 		if ((entryp->flags & AIO_LIO_NOTIFY) && (lastLioCompleted) && (waiter != 0)) {
2266 			/* wake up the waiter */
2267 			wakeup(lio_context);
2270 		aio_proc_unlock(entryp->procp);
2273 	if (entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
2274 	    (entryp->flags & AIO_DISABLE) == 0) {
2275 		boolean_t performSignal = FALSE;
2276 		if (lio_context == NULL) {
2277 			performSignal = TRUE;
2280 			 * If this was the last request in the group and a signal
2281 			 * is desired, send one.
2283 			performSignal = lastLioCompleted;
2286 		if (performSignal) {
2287 			KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) | DBG_FUNC_NONE,
2288 			    (int)entryp->procp, (int)entryp->uaiocbp,
2289 			    entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 );
2291 			psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );
2295 	if ((entryp->flags & AIO_EXIT_WAIT) && (entryp->flags & AIO_CLOSE_WAIT)) {
2296 		panic("Close and exit flags set at the same time\n");
2300 	 * need to handle case where a process is trying to exit, exec, or
2301 	 * close and is currently waiting for active aio requests to complete.
2302 	 * If AIO_CLEANUP_WAIT is set then we need to look to see if there are any
2303 	 * other requests in the active queue for this process.  If there are
2304 	 * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel.
2305 	 * If there are some still active then do nothing - we only want to
2306 	 * wakeup when all active aio requests for the process are complete.
2308 	 * Don't need to lock the entry or proc to check the cleanup flag.  It can only be
2309 	 * set for cancellation, while the entryp is still on a proc list; now it's
2310 	 * off, so that flag is already set if it's going to be.
2312 	if ((entryp->flags & AIO_EXIT_WAIT) != 0) {
2313 		int active_requests;
2315 		KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
2316 		    (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
2318 		aio_proc_lock_spin(entryp->procp);
2319 		active_requests = aio_active_requests_for_process( entryp->procp );
2320 		if (active_requests < 1) {
2322 			 * no active aio requests for this process, continue exiting.  In this
2323 			 * case, there should be no one else waiting on the proc in AIO...
2325 			wakeup_one((caddr_t)&entryp->procp->AIO_CLEANUP_SLEEP_CHAN);
2326 			aio_proc_unlock(entryp->procp);
2328 			KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
2329 			    (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
2331 			aio_proc_unlock(entryp->procp);
2335 	if ((entryp->flags & AIO_CLOSE_WAIT) != 0) {
2336 		int active_requests;
2338 		KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
2339 		    (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
2341 		aio_proc_lock_spin(entryp->procp);
2342 		active_requests = aio_proc_active_requests_for_file( entryp->procp, entryp->aiocb.aio_fildes);
2343 		if (active_requests < 1) {
2344 			/* Can't wakeup_one(); multiple closes might be in progress. */
2345 			wakeup(&entryp->procp->AIO_CLEANUP_SLEEP_CHAN);
2346 			aio_proc_unlock(entryp->procp);
2348 			KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
2349 			    (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
2351 			aio_proc_unlock(entryp->procp);
2355 	 * A thread in aio_suspend() wants to know about completed IOs.  If it checked
2356 	 * the done list before we moved our AIO there, then it already asserted its wait,
2357 	 * and we can wake it up without holding the lock.  If it checked the list after
2358 	 * we did our move, then it already has seen the AIO that we moved.  Hence, we
2359 	 * can do our wakeup without holding the lock.
2361 	wakeup((caddr_t) &entryp->procp->AIO_SUSPEND_SLEEP_CHAN);
2362 	KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE,
2363 	    (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
2366 	 * free the LIO context if the last lio completed and no thread is
2369 	if (lastLioCompleted && (waiter == 0)) {
2370 		free_lio_context(lio_context);
2372 } /* do_aio_completion */
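/*
 * Illustrative user-space sketch (not part of this file): requesting the
 * SIGEV_SIGNAL completion path handled above, so the kernel delivers a
 * signal to the process when the request finishes.  The signal number and
 * file name are arbitrary example choices.
 */
#if 0	/* example only; never compiled as part of this file */
#include <aio.h>
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static volatile sig_atomic_t aio_done;

static void
on_aio_signal(int signo)
{
	(void)signo;
	aio_done = 1;			/* async-signal-safe: just record the event */
}

int
main(void)
{
	static char buf[1024];
	struct aiocb cb;
	struct sigaction sa;
	sigset_t block_set, wait_mask;
	int fd = open("input.dat", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = on_aio_signal;
	sigaction(SIGUSR1, &sa, NULL);

	/* block SIGUSR1 so the completion signal cannot race our wait loop */
	sigemptyset(&block_set);
	sigaddset(&block_set, SIGUSR1);
	sigprocmask(SIG_BLOCK, &block_set, &wait_mask);

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_buf = buf;
	cb.aio_nbytes = sizeof(buf);
	cb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;	/* ask for a signal on completion */
	cb.aio_sigevent.sigev_signo = SIGUSR1;

	if (aio_read(&cb) != 0) {
		perror("aio_read");
		return 1;
	}
	while (!aio_done) {
		sigsuspend(&wait_mask);	/* atomically unblock SIGUSR1 and wait */
	}
	printf("read %zd bytes\n", aio_return(&cb));
	close(fd);
	return 0;
}
#endif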
2379 do_aio_read( aio_workq_entry *entryp )
2381 	struct fileproc *fp;
2383 	struct vfs_context context;
2385 	if ((error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp, 0))) {
2388 	if ((fp->f_fglob->fg_flag & FREAD) == 0) {
2389 		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2393 	context.vc_thread = entryp->thread;	/* XXX */
2394 	context.vc_ucred = fp->f_fglob->fg_cred;
2396 	error = dofileread(&context, fp,
2397 	    entryp->aiocb.aio_buf,
2398 	    entryp->aiocb.aio_nbytes,
2399 	    entryp->aiocb.aio_offset, FOF_OFFSET,
2400 	    &entryp->returnval);
2401 	fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2411 do_aio_write( aio_workq_entry *entryp )
2413 	struct fileproc *fp;
2415 	struct vfs_context context;
2417 	if ((error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp, 0))) {
2420 	if ((fp->f_fglob->fg_flag & FWRITE) == 0) {
2421 		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2426 	if ((fp->f_fglob->fg_flag & O_APPEND) == 0) {
2427 		flags |= FOF_OFFSET;
2430 	context.vc_thread = entryp->thread;	/* XXX */
2431 	context.vc_ucred = fp->f_fglob->fg_cred;
2433 	/* NB: tell dofilewrite the offset, and to use the proc cred */
2434 	error = dofilewrite(&context,
2436 	    entryp->aiocb.aio_buf,
2437 	    entryp->aiocb.aio_nbytes,
2438 	    entryp->aiocb.aio_offset,
2440 	    &entryp->returnval);
2442 	if (entryp->returnval) {
2443 		fp_drop_written(entryp->procp, entryp->aiocb.aio_fildes, fp);
2445 		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2449 } /* do_aio_write */
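/*
 * Illustrative user-space sketch (not part of this file): an aio_write()
 * followed by aio_suspend() and aio_return(), matching the worker read/write
 * paths above.  The aiocb's offset is honored only when the descriptor was
 * not opened O_APPEND, mirroring the FOF_OFFSET handling in do_aio_write().
 * The file name is an arbitrary example value.
 */
#if 0	/* example only; never compiled as part of this file */
#include <aio.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	static char msg[] = "hello from POSIX AIO\n";
	struct aiocb cb;
	const struct aiocb *list[1];
	int err;
	int fd = open("output.dat", O_WRONLY | O_CREAT, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_buf = msg;
	cb.aio_nbytes = sizeof(msg) - 1;
	cb.aio_offset = 0;			/* used because O_APPEND is not set */

	if (aio_write(&cb) != 0) {
		perror("aio_write");
		return 1;
	}

	list[0] = &cb;
	if (aio_suspend(list, 1, NULL) != 0) {	/* block until the write completes */
		perror("aio_suspend");
		return 1;
	}

	err = aio_error(&cb);			/* check status before retiring it */
	if (err == 0) {
		printf("wrote %zd bytes\n", aio_return(&cb));
	} else {
		fprintf(stderr, "aio_write failed: %s\n", strerror(err));
	}
	close(fd);
	return 0;
}
#endif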
2453 * aio_active_requests_for_process - return number of active async IO
2454 * requests for the given process.
2457 aio_active_requests_for_process(proc_t procp)
2459 	return procp->p_aio_active_count;
2460 } /* aio_active_requests_for_process */
2463 * Called with the proc locked.
2466 aio_proc_active_requests_for_file(proc_t procp, int fd)
2469 	aio_workq_entry *entryp;
2470 	TAILQ_FOREACH(entryp, &procp->p_aio_activeq, aio_proc_link) {
2471 		if (entryp->aiocb.aio_fildes == fd) {
2477 } /* aio_proc_active_requests_for_file */
2485 do_aio_fsync( aio_workq_entry *entryp )
2487 	struct vfs_context context;
2489 	struct fileproc *fp;
2494 	 * We are never called unless either AIO_FSYNC or AIO_DSYNC are set.
2496 	 * If AIO_DSYNC is set, we can tell the lower layers that it is OK
2497 	 * to mark for update the metadata not strictly necessary for data
2498 	 * retrieval, rather than forcing it to disk.
2500 	 * If AIO_FSYNC is set, we also have to wait until metadata not really
2501 	 * necessary for data retrieval is committed to stable storage (e.g.
2502 	 * atime, mtime, ctime, etc.).
2504 	 * Metadata necessary for data retrieval must be committed to stable
2505 	 * storage in either case (file length, etc.).
2507 	if (entryp->flags & AIO_FSYNC) {
2508 		sync_flag = MNT_WAIT;
2510 		sync_flag = MNT_DWAIT;
2513 	error = fp_getfvp( entryp->procp, entryp->aiocb.aio_fildes, &fp, &vp);
2515 	if ((error = vnode_getwithref(vp))) {
2516 		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2517 		entryp->returnval = -1;
2520 	context.vc_thread = current_thread();
2521 	context.vc_ucred = fp->f_fglob->fg_cred;
2523 	error = VNOP_FSYNC( vp, sync_flag, &context);
2525 	(void)vnode_put(vp);
2527 	fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2530 		entryp->returnval = -1;
2534 } /* do_aio_fsync */
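/*
 * Illustrative user-space sketch (not part of this file): the two aio_fsync()
 * flavors that reach this routine -- O_SYNC corresponds to the
 * AIO_FSYNC/MNT_WAIT case, O_DSYNC to the AIO_DSYNC/MNT_DWAIT case.  The file
 * name and the simple polling wait are arbitrary example choices.
 */
#if 0	/* example only; never compiled as part of this file */
#include <aio.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int
sync_async(int fd, int op)			/* op is O_SYNC or O_DSYNC */
{
	struct aiocb cb;

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;

	if (aio_fsync(op, &cb) != 0) {
		return -1;
	}
	while (aio_error(&cb) == EINPROGRESS) {
		usleep(1000);			/* simple polling wait */
	}
	return (int)aio_return(&cb);
}

int
main(void)
{
	int fd = open("output.dat", O_WRONLY | O_CREAT, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	(void)sync_async(fd, O_DSYNC);	/* data plus metadata needed to read it back */
	(void)sync_async(fd, O_SYNC);	/* additionally flushes timestamps and the like */
	close(fd);
	return 0;
}
#endif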
2538 * is_already_queued - runs through our queues to see if the given
2539 * aiocbp / process is there.  Returns TRUE if there is a match
2540 * on any of our aio queues.
2542 * Called with proc aio lock held (can be held spin)
2545 is_already_queued(proc_t procp,
2546     user_addr_t aiocbp)
2548 	aio_workq_entry *entryp;
2553 	/* look for matches on our queue of async IO requests that have completed */
2554 	TAILQ_FOREACH( entryp, &procp->p_aio_doneq, aio_proc_link) {
2555 		if (aiocbp == entryp->uaiocbp) {
2557 			goto ExitThisRoutine;
2561 	/* look for matches on our queue of active async IO requests */
2562 	TAILQ_FOREACH( entryp, &procp->p_aio_activeq, aio_proc_link) {
2563 		if (aiocbp == entryp->uaiocbp) {
2565 			goto ExitThisRoutine;
2571 } /* is_already_queued */
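/*
 * Illustrative user-space sketch (not part of this file): because of the
 * duplicate check above, an aiocb should not be resubmitted while a prior
 * request using the same control block is still active or waiting to be
 * collected with aio_return().  Both helpers below assume 'cb' was already
 * submitted once with aio_read().
 */
#if 0	/* example only; never compiled as part of this file */
#include <aio.h>
#include <errno.h>
#include <stdio.h>

static void
resubmit_too_early(struct aiocb *cb)
{
	if (aio_read(cb) != 0) {
		/* a duplicate control block is rejected with an error */
		perror("aio_read (duplicate aiocb)");
	}
}

/* Safe pattern: retire the previous request before reusing the aiocb. */
static void
resubmit_safely(struct aiocb *cb)
{
	const struct aiocb *list[1] = { cb };

	while (aio_error(cb) == EINPROGRESS) {
		aio_suspend(list, 1, NULL);	/* wait without busy polling */
	}
	(void)aio_return(cb);			/* retire the old request */
	if (aio_read(cb) != 0) {		/* now the aiocb may be reused */
		perror("aio_read");
	}
}
#endif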
2575 free_lio_context(aio_lio_context* context)
2578 	OSDecrementAtomic(&lio_contexts_alloced);
2581 	FREE( context, M_TEMP );
2582 } /* free_lio_context */
2586 * aio initialization
2588 __private_extern__ void
2593 	aio_lock_grp_attr = lck_grp_attr_alloc_init();
2594 	aio_proc_lock_grp = lck_grp_alloc_init("aio_proc", aio_lock_grp_attr);
2595 	aio_entry_lock_grp = lck_grp_alloc_init("aio_entry", aio_lock_grp_attr);
2596 	aio_queue_lock_grp = lck_grp_alloc_init("aio_queue", aio_lock_grp_attr);
2597 	aio_lock_attr = lck_attr_alloc_init();
2599 	lck_mtx_init(&aio_entry_mtx, aio_entry_lock_grp, aio_lock_attr);
2600 	lck_mtx_init(&aio_proc_mtx, aio_proc_lock_grp, aio_lock_attr);
2602 	aio_anchor.aio_inflight_count = 0;
2603 	aio_anchor.aio_done_count = 0;
2604 	aio_anchor.aio_total_count = 0;
2605 	aio_anchor.aio_num_workqs = AIO_NUM_WORK_QUEUES;
2607 	for (i = 0; i < AIO_NUM_WORK_QUEUES; i++) {
2608 		aio_workq_init(&aio_anchor.aio_async_workqs[i]);
2612 	i = sizeof(aio_workq_entry);
2613 	aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );
2615 	_aio_create_worker_threads( aio_worker_threads );
2620 * aio worker threads created here.
2622 __private_extern__ void
2623 _aio_create_worker_threads( int num )
2627 	/* create some worker threads to handle the async IO requests */
2628 	for (i = 0; i < num; i++) {
2631 		if (KERN_SUCCESS != kernel_thread_start((thread_continue_t)aio_work_thread, NULL, &myThread)) {
2632 			printf( "%s - failed to create a work thread \n", __FUNCTION__ );
2634 			thread_deallocate(myThread);
2639 } /* _aio_create_worker_threads */
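/*
 * Illustrative user-space sketch (not part of this file): the tunables used
 * during initialization (aio_max_requests, aio_max_requests_per_process,
 * aio_worker_threads) are conventionally surfaced through sysctl.  The names
 * below (kern.aiomax, kern.aioprocmax, kern.aiothreads) are the commonly
 * used ones and should be treated as an assumption, not something this file
 * guarantees.
 */
#if 0	/* example only; never compiled as part of this file */
#include <stdio.h>
#include <sys/sysctl.h>
#include <sys/types.h>

static void
show(const char *name)
{
	int value = 0;
	size_t len = sizeof(value);

	if (sysctlbyname(name, &value, &len, NULL, 0) == 0) {
		printf("%-18s %d\n", name, value);
	} else {
		perror(name);
	}
}

int
main(void)
{
	show("kern.aiomax");		/* assumed system-wide request limit */
	show("kern.aioprocmax");	/* assumed per-process request limit */
	show("kern.aiothreads");	/* assumed number of AIO worker threads */
	return 0;
}
#endif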
2642 * Return the current activation utask
2647 	return ((struct uthread *)get_bsdthread_info(current_thread()))->uu_aio_task;
2652 * In the case of an aiocb from a
2653 * 32-bit process we need to expand some longs and pointers to the correct
2654 * sizes in order to let downstream code always work on the same type of
2655 * aiocb (in our case that is a user_aiocb)
2658 do_munge_aiocb_user32_to_user( struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
2660 	the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2661 	the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2662 	the_user_aiocbp->aio_buf = CAST_USER_ADDR_T(my_aiocbp->aio_buf);
2663 	the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2664 	the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2665 	the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
2667 	/* special case here.  Since we do not know if sigev_value is an */
2668 	/* int or a ptr we do NOT cast the ptr to a user_addr_t.  This */
2669 	/* means if we send this info back to user space we need to remember */
2670 	/* sigev_value was not expanded for the 32-bit case. */
2671 	/* NOTE - this does NOT affect us since we don't support sigev_value */
2672 	/* yet in the aio context. */
2674 	the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2675 	the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
2676 	the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2677 	    my_aiocbp->aio_sigevent.sigev_value.sival_int;
2678 	the_user_aiocbp->aio_sigevent.sigev_notify_function =
2679 	    CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function);
2680 	the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2681 	    CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_attributes);
2684 /* Similar for 64-bit user process, so that we don't need to satisfy
2685 * the alignment constraints of the original user64_aiocb
2688 do_munge_aiocb_user64_to_user( struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
2690 	the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2691 	the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2692 	the_user_aiocbp->aio_buf = my_aiocbp->aio_buf;
2693 	the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2694 	the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2695 	the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
2697 	the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2698 	the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
2699 	the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2700 	    my_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int;
2701 	the_user_aiocbp->aio_sigevent.sigev_notify_function =
2702 	    my_aiocbp->aio_sigevent.sigev_notify_function;
2703 	the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2704 	    my_aiocbp->aio_sigevent.sigev_notify_attributes;
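/*
 * Illustrative sketch (not part of this file): the general "munge" pattern
 * the two routines above implement -- widening 32-bit pointers and sizes from
 * a 32-bit caller's layout into the kernel's wider canonical layout.  The
 * struct names and fields below are hypothetical, chosen only to show the
 * widening; they are not the real user32_aiocb / user_aiocb definitions.
 */
#if 0	/* example only; never compiled as part of this file */
#include <stdint.h>
#include <string.h>

struct example_aiocb32 {		/* hypothetical 32-bit wire layout */
	int32_t  fildes;
	uint32_t buf;			/* 32-bit user pointer */
	uint32_t nbytes;		/* 32-bit size */
};

struct example_aiocb_wide {		/* hypothetical canonical wide layout */
	int32_t  fildes;
	uint64_t buf;			/* widened to a 64-bit user address */
	uint64_t nbytes;
};

static void
munge_example(const struct example_aiocb32 *in, struct example_aiocb_wide *out)
{
	memset(out, 0, sizeof(*out));
	out->fildes = in->fildes;
	out->buf = (uint64_t)in->buf;		/* zero-extend the 32-bit pointer */
	out->nbytes = (uint64_t)in->nbytes;	/* zero-extend the 32-bit size */
}
#endif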