git.saurik.com Git - apple/xnu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2003-2016 Apple Inc. All rights reserved.
	3	*
	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
	14	*
	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
	25	*
	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
	27	*/
	28
	29
	30	/*
	31	* todo:
	32	* 1) ramesh is looking into how to replace taking a reference on
	33	* the user's map (vm_map_reference()) since it is believed that
	34	* would not hold the process for us.
	35	* 2) david is looking into a way for us to set the priority of the
	36	* worker threads to match that of the user's thread when the
	37	* async IO was queued.
	38	*/
	39
	40
	41	/*
	42	* This file contains support for the POSIX 1003.1B AIO/LIO facility.
	43	*/
	44
	45	#include <sys/systm.h>
	46	#include <sys/fcntl.h>
	47	#include <sys/file_internal.h>
	48	#include <sys/filedesc.h>
	49	#include <sys/kernel.h>
	50	#include <sys/vnode_internal.h>
	51	#include <sys/malloc.h>
	52	#include <sys/mount_internal.h>
	53	#include <sys/param.h>
	54	#include <sys/proc_internal.h>
	55	#include <sys/sysctl.h>
	56	#include <sys/unistd.h>
	57	#include <sys/user.h>
	58
	59	#include <sys/aio_kern.h>
	60	#include <sys/sysproto.h>
	61
	62	#include <machine/limits.h>
	63
	64	#include <mach/mach_types.h>
	65	#include <kern/kern_types.h>
	66	#include <kern/waitq.h>
	67	#include <kern/zalloc.h>
	68	#include <kern/task.h>
	69	#include <kern/sched_prim.h>
	70
	71	#include <vm/vm_map.h>
	72
	73	#include <libkern/OSAtomic.h>
	74
	75	#include <sys/kdebug.h>
	/*
	 * kdebug sub-codes for the AIO trace points.  Each value is combined with
	 * DBG_BSD_AIO through BSDDBG_CODE() at the KERNEL_DEBUG() call sites.
	 */

	/* work queue and completion events */
	#define AIO_work_queued			1
	#define AIO_worker_wake			2
	#define AIO_completion_sig		3
	#define AIO_completion_cleanup_wait	4
	#define AIO_completion_cleanup_wake	5
	#define AIO_completion_suspend_wake	6
	#define AIO_fsync_delay			7

	/* cancellation */
	#define AIO_cancel			10
	#define AIO_cancel_async_workq		11
	#define AIO_cancel_sync_workq		12
	#define AIO_cancel_activeq		13
	#define AIO_cancel_doneq		14

	/* request submission */
	#define AIO_fsync			20
	#define AIO_read			30
	#define AIO_write			40
	#define AIO_listio			50

	/* aio_error() */
	#define AIO_error			60
	#define AIO_error_val			61
	#define AIO_error_activeq		62
	#define AIO_error_workq			63

	/* aio_return() */
	#define AIO_return			70
	#define AIO_return_val			71
	#define AIO_return_activeq		72
	#define AIO_return_workq		73

	/* exec, exit and descriptor close */
	#define AIO_exec			80
	#define AIO_exit			90
	#define AIO_exit_sleep			91
	#define AIO_close			100
	#define AIO_close_sleep			101

	/* aio_suspend() */
	#define AIO_suspend			110
	#define AIO_suspend_sleep		111

	/* worker thread */
	#define AIO_worker_thread		120

	/* Change to "#if 1" to redefine KERNEL_DEBUG as KERNEL_DEBUG_CONSTANT in this file. */
	#if 0
	#undef KERNEL_DEBUG
	#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
	#endif
	113
	114	/*
	115	* aio requests queue up on the aio_async_workq or lio_sync_workq (for
	116	* lio_listio LIO_WAIT). Requests then move to the per process aio_activeq
	117	* (proc.aio_activeq) when one of our worker threads start the IO.
	118	* And finally, requests move to the per process aio_doneq (proc.aio_doneq)
	119	* when the IO request completes. The request remains on aio_doneq until
	120	* user process calls aio_return or the process exits, either way that is our
	121	* trigger to release aio resources.
	122	*/
	123	typedef struct aio_workq {
	124	TAILQ_HEAD(, aio_workq_entry) aioq_entries;
	125	int aioq_count;
	126	lck_mtx_t aioq_mtx;
	127	struct waitq aioq_waitq;
	128	} *aio_workq_t;
	129
	130	#define AIO_NUM_WORK_QUEUES 1
	131	struct aio_anchor_cb
	132	{
	133	volatile int32_t aio_inflight_count; /* entries that have been taken from a workq */
	134	volatile int32_t aio_done_count; /* entries on all done queues (proc.aio_doneq) */
	135	volatile int32_t aio_total_count; /* total extant entries */
	136
	137	/* Hash table of queues here */
	138	int aio_num_workqs;
	139	struct aio_workq aio_async_workqs[AIO_NUM_WORK_QUEUES];
	140	};
	141	typedef struct aio_anchor_cb aio_anchor_cb;
	142
	/*
	 * Per-lio_listio() bookkeeping shared by the requests of one list call.
	 * NOTE(review): field meanings below are read off the names; confirm
	 * against the lio_listio() code, which is not in this part of the file.
	 */
	struct aio_lio_context {
		int	io_waiter;	/* presumably non-zero while a caller is waiting */
		int	io_issued;	/* presumably the number of requests issued */
		int	io_completed;	/* presumably the number of requests completed */
	};
	typedef struct aio_lio_context aio_lio_context;
	150
	151
	152	/*
	153	* Notes on aio sleep / wake channels.
	154	* We currently pick a couple fields within the proc structure that will allow
	155	* us sleep channels that currently do not collide with any other kernel routines.
	156	* At this time, for binary compatibility reasons, we cannot create new proc fields.
	157	*/
	/* proc field whose address is the sleep channel named AIO_SUSPEND_SLEEP_CHAN */
	#define AIO_SUSPEND_SLEEP_CHAN	p_aio_active_count
	/* proc field whose address _aio_close() and _aio_exit() msleep() on */
	#define AIO_CLEANUP_SLEEP_CHAN	p_aio_total_count
	160
	/*
	 * Panic when an aio entry found on a proc's list is owned by another proc.
	 * Wrapped in do { } while (0) so the macro is a single statement and can
	 * be used safely in an unbraced if/else.
	 */
	#define ASSERT_AIO_FROM_PROC(aiop, theproc)					\
		do {									\
			if ((aiop)->procp != (theproc)) {				\
				panic("AIO on a proc list that does not belong to that proc.\n"); \
			}								\
		} while (0)
	165
	166	/*
	167	* LOCAL PROTOTYPES
	168	*/
	169	static void aio_proc_lock(proc_t procp);
	170	static void aio_proc_lock_spin(proc_t procp);
	171	static void aio_proc_unlock(proc_t procp);
	172	static lck_mtx_t* aio_proc_mutex(proc_t procp);
	173	static void aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp);
	174	static void aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp);
	175	static int aio_get_process_count(proc_t procp );
	176	static int aio_active_requests_for_process(proc_t procp );
	177	static int aio_proc_active_requests_for_file(proc_t procp, int fd);
	178	static boolean_t is_already_queued(proc_t procp, user_addr_t aiocbp );
	179	static boolean_t should_cancel(aio_workq_entry *entryp, user_addr_t aiocbp, int fd);
	180
	181	static void aio_entry_lock(aio_workq_entry *entryp);
	182	static void aio_entry_lock_spin(aio_workq_entry *entryp);
	183	static aio_workq_t aio_entry_workq(aio_workq_entry *entryp);
	184	static lck_mtx_t* aio_entry_mutex(__unused aio_workq_entry *entryp);
	185	static void aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
	186	static void aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
	187	static void aio_entry_ref_locked(aio_workq_entry *entryp);
	188	static void aio_entry_unref_locked(aio_workq_entry *entryp);
	189	static void aio_entry_ref(aio_workq_entry *entryp);
	190	static void aio_entry_unref(aio_workq_entry *entryp);
	191	static void aio_entry_update_for_cancel(aio_workq_entry *entryp, boolean_t cancelled,
	192	int wait_for_completion, boolean_t disable_notification);
	193	static int aio_entry_try_workq_remove(aio_workq_entry *entryp);
	194	static boolean_t aio_delay_fsync_request( aio_workq_entry *entryp );
	195	static int aio_free_request(aio_workq_entry *entryp);
	196
	197	static void aio_workq_init(aio_workq_t wq);
	198	static void aio_workq_lock_spin(aio_workq_t wq);
	199	static void aio_workq_unlock(aio_workq_t wq);
	200	static lck_mtx_t* aio_workq_mutex(aio_workq_t wq);
	201
	202	static void aio_work_thread( void );
	203	static aio_workq_entry *aio_get_some_work( void );
	204
	205	static int aio_get_all_queues_count( void );
	206	static int aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO );
	207	static int aio_validate( aio_workq_entry *entryp );
	208	static int aio_increment_total_count(void);
	209	static int aio_decrement_total_count(void);
	210
	211	static int do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp, int wait_for_completion, boolean_t disable_notification );
	212	static void do_aio_completion( aio_workq_entry *entryp );
	213	static int do_aio_fsync( aio_workq_entry *entryp );
	214	static int do_aio_read( aio_workq_entry *entryp );
	215	static int do_aio_write( aio_workq_entry *entryp );
	/*
	 * Both converters take pointers: the call sites pass the address of the
	 * copied-in user32/user64 aiocb and the address of the destination
	 * struct user_aiocb.
	 */
	static void do_munge_aiocb_user32_to_user( struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
	static void do_munge_aiocb_user64_to_user( struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
	218	static int lio_create_entry(proc_t procp,
	219	user_addr_t aiocbp,
	220	void *group_tag,
	221	aio_workq_entry **entrypp );
	222	static aio_workq_entry *aio_create_queue_entry(proc_t procp,
	223	user_addr_t aiocbp,
	224	void *group_tag,
	225	int kindOfIO);
	226	static user_addr_t *aio_copy_in_list(proc_t procp, user_addr_t aiocblist, int nent);
	227	static void free_lio_context(aio_lio_context* context);
	228	static void aio_enqueue_work( proc_t procp, aio_workq_entry *entryp, int proc_locked);
	229
	/* LCK_MTX_ASSERT_OWNED checks on the proc, work-queue and entry mutexes. */
	#define ASSERT_AIO_PROC_LOCK_OWNED(p)	\
		lck_mtx_assert(aio_proc_mutex((p)), LCK_MTX_ASSERT_OWNED)
	#define ASSERT_AIO_WORKQ_LOCK_OWNED(q)	\
		lck_mtx_assert(aio_workq_mutex((q)), LCK_MTX_ASSERT_OWNED)
	#define ASSERT_AIO_ENTRY_LOCK_OWNED(e)	\
		lck_mtx_assert(aio_entry_mutex((e)), LCK_MTX_ASSERT_OWNED)
	233
	234	/*
	235	* EXTERNAL PROTOTYPES
	236	*/
	237
	238	/* in ...bsd/kern/sys_generic.c */
	239	extern int dofileread(vfs_context_t ctx, struct fileproc *fp,
	240	user_addr_t bufp, user_size_t nbyte,
	241	off_t offset, int flags, user_ssize_t *retval );
	242	extern int dofilewrite(vfs_context_t ctx, struct fileproc *fp,
	243	user_addr_t bufp, user_size_t nbyte, off_t offset,
	244	int flags, user_ssize_t *retval );
	245	#if DEBUG
	246	static uint32_t lio_contexts_alloced = 0;
	247	#endif /* DEBUG */
	248
	249	/*
	250	* aio external global variables.
	251	*/
	252	extern int aio_max_requests; /* AIO_MAX - configurable */
	253	extern int aio_max_requests_per_process; /* AIO_PROCESS_MAX - configurable */
	254	extern int aio_worker_threads; /* AIO_THREAD_COUNT - configurable */
	255
	256
	257	/*
	258	* aio static variables.
	259	*/
	260	static aio_anchor_cb aio_anchor;
	261	static lck_grp_t *aio_proc_lock_grp;
	262	static lck_grp_t *aio_entry_lock_grp;
	263	static lck_grp_t *aio_queue_lock_grp;
	264	static lck_attr_t *aio_lock_attr;
	265	static lck_grp_attr_t *aio_lock_grp_attr;
	266	static struct zone *aio_workq_zonep;
	267	static lck_mtx_t aio_entry_mtx;
	268	static lck_mtx_t aio_proc_mtx;
	269
	270	static void
	271	aio_entry_lock(__unused aio_workq_entry *entryp)
	272	{
	273	lck_mtx_lock(&aio_entry_mtx);
	274	}
	275
	276	static void
	277	aio_entry_lock_spin(__unused aio_workq_entry *entryp)
	278	{
	279	lck_mtx_lock_spin(&aio_entry_mtx);
	280	}
	281
	282	static void
	283	aio_entry_unlock(__unused aio_workq_entry *entryp)
	284	{
	285	lck_mtx_unlock(&aio_entry_mtx);
	286	}
	287
	288	/* Hash */
	289	static aio_workq_t
	290	aio_entry_workq(__unused aio_workq_entry *entryp)
	291	{
	292	return &aio_anchor.aio_async_workqs[0];
	293	}
	294
	295	static lck_mtx_t*
	296	aio_entry_mutex(__unused aio_workq_entry *entryp)
	297	{
	298	return &aio_entry_mtx;
	299	}
	300
	301	static void
	302	aio_workq_init(aio_workq_t wq)
	303	{
	304	TAILQ_INIT(&wq->aioq_entries);
	305	wq->aioq_count = 0;
	306	lck_mtx_init(&wq->aioq_mtx, aio_queue_lock_grp, aio_lock_attr);
	307	waitq_init(&wq->aioq_waitq, SYNC_POLICY_FIFO);
	308	}
	309
	310
	311	/*
	312	* Can be passed a queue which is locked spin.
	313	*/
	314	static void
	315	aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
	316	{
	317	ASSERT_AIO_WORKQ_LOCK_OWNED(queue);
	318
	319	if (entryp->aio_workq_link.tqe_prev == NULL) {
	320	panic("Trying to remove an entry from a work queue, but it is not on a queue\n");
	321	}
	322
	323	TAILQ_REMOVE(&queue->aioq_entries, entryp, aio_workq_link);
	324	queue->aioq_count--;
	325	entryp->aio_workq_link.tqe_prev = NULL; /* Not on a workq */
	326
	327	if (queue->aioq_count < 0) {
	328	panic("Negative count on a queue.\n");
	329	}
	330	}
	331
	332	static void
	333	aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
	334	{
	335	ASSERT_AIO_WORKQ_LOCK_OWNED(queue);
	336
	337	TAILQ_INSERT_TAIL(&queue->aioq_entries, entryp, aio_workq_link);
	338	if (queue->aioq_count < 0) {
	339	panic("Negative count on a queue.\n");
	340	}
	341	queue->aioq_count++;
	342	}
	343
	344	static void
	345	aio_proc_lock(proc_t procp)
	346	{
	347	lck_mtx_lock(aio_proc_mutex(procp));
	348	}
	349
	350	static void
	351	aio_proc_lock_spin(proc_t procp)
	352	{
	353	lck_mtx_lock_spin(aio_proc_mutex(procp));
	354	}
	355
	356	static void
	357	aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp)
	358	{
	359	ASSERT_AIO_PROC_LOCK_OWNED(procp);
	360
	361	TAILQ_REMOVE(&procp->p_aio_activeq, entryp, aio_proc_link );
	362	TAILQ_INSERT_TAIL( &procp->p_aio_doneq, entryp, aio_proc_link);
	363	procp->p_aio_active_count--;
	364	OSIncrementAtomic(&aio_anchor.aio_done_count);
	365	}
	366
	367	static void
	368	aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp)
	369	{
	370	TAILQ_REMOVE(&procp->p_aio_doneq, entryp, aio_proc_link);
	371	OSDecrementAtomic(&aio_anchor.aio_done_count);
	372	aio_decrement_total_count();
	373	procp->p_aio_total_count--;
	374	}
	375
	376	static void
	377	aio_proc_unlock(proc_t procp)
	378	{
	379	lck_mtx_unlock(aio_proc_mutex(procp));
	380	}
	381
	382	static lck_mtx_t*
	383	aio_proc_mutex(proc_t procp)
	384	{
	385	return &procp->p_mlock;
	386	}
	387
	388	static void
	389	aio_entry_ref_locked(aio_workq_entry *entryp)
	390	{
	391	ASSERT_AIO_ENTRY_LOCK_OWNED(entryp);
	392
	393	if (entryp->aio_refcount < 0) {
	394	panic("AIO workq entry with a negative refcount.\n");
	395	}
	396	entryp->aio_refcount++;
	397	}
	398
	399
	400	/* Return 1 if you've freed it */
	401	static void
	402	aio_entry_unref_locked(aio_workq_entry *entryp)
	403	{
	404	ASSERT_AIO_ENTRY_LOCK_OWNED(entryp);
	405
	406	entryp->aio_refcount--;
	407	if (entryp->aio_refcount < 0) {
	408	panic("AIO workq entry with a negative refcount.\n");
	409	}
	410	}
	411
	412	static void
	413	aio_entry_ref(aio_workq_entry *entryp)
	414	{
	415	aio_entry_lock_spin(entryp);
	416	aio_entry_ref_locked(entryp);
	417	aio_entry_unlock(entryp);
	418	}
	419	static void
	420	aio_entry_unref(aio_workq_entry *entryp)
	421	{
	422	aio_entry_lock_spin(entryp);
	423	aio_entry_unref_locked(entryp);
	424
	425	if ((entryp->aio_refcount == 0) && ((entryp->flags & AIO_DO_FREE) != 0)) {
	426	aio_entry_unlock(entryp);
	427	aio_free_request(entryp);
	428	} else {
	429	aio_entry_unlock(entryp);
	430	}
	431
	432	return;
	433	}
	434
	435	static void
	436	aio_entry_update_for_cancel(aio_workq_entry *entryp, boolean_t cancelled, int wait_for_completion, boolean_t disable_notification)
	437	{
	438	aio_entry_lock_spin(entryp);
	439
	440	if (cancelled) {
	441	aio_entry_ref_locked(entryp);
	442	entryp->errorval = ECANCELED;
	443	entryp->returnval = -1;
	444	}
	445
	446	if ( wait_for_completion ) {
	447	entryp->flags \|= wait_for_completion; /* flag for special completion processing */
	448	}
	449
	450	if ( disable_notification ) {
	451	entryp->flags \|= AIO_DISABLE; /* Don't want a signal */
	452	}
	453
	454	aio_entry_unlock(entryp);
	455	}
	456
	457	static int
	458	aio_entry_try_workq_remove(aio_workq_entry *entryp)
	459	{
	460	/* Can only be cancelled if it's still on a work queue */
	461	if (entryp->aio_workq_link.tqe_prev != NULL) {
	462	aio_workq_t queue;
	463
	464	/* Will have to check again under the lock */
	465	queue = aio_entry_workq(entryp);
	466	aio_workq_lock_spin(queue);
	467	if (entryp->aio_workq_link.tqe_prev != NULL) {
	468	aio_workq_remove_entry_locked(queue, entryp);
	469	aio_workq_unlock(queue);
	470	return 1;
	471	} else {
	472	aio_workq_unlock(queue);
	473	}
	474	}
	475
	476	return 0;
	477	}
	478
	479	static void
	480	aio_workq_lock_spin(aio_workq_t wq)
	481	{
	482	lck_mtx_lock_spin(aio_workq_mutex(wq));
	483	}
	484
	485	static void
	486	aio_workq_unlock(aio_workq_t wq)
	487	{
	488	lck_mtx_unlock(aio_workq_mutex(wq));
	489	}
	490
	491	static lck_mtx_t*
	492	aio_workq_mutex(aio_workq_t wq)
	493	{
	494	return &wq->aioq_mtx;
	495	}
	496
	497	/*
	498	* aio_cancel - attempt to cancel one or more async IO requests currently
	499	* outstanding against file descriptor uap->fd. If uap->aiocbp is not
	500	* NULL then only one specific IO is cancelled (if possible). If uap->aiocbp
	501	* is NULL then all outstanding async IO request for the given file
	502	* descriptor are cancelled (if possible).
	503	*/
	504	int
	505	aio_cancel(proc_t p, struct aio_cancel_args uap, int retval )
	506	{
	507	struct user_aiocb my_aiocb;
	508	int result;
	509
	510	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) \| DBG_FUNC_START,
	511	(int)p, (int)uap->aiocbp, 0, 0, 0 );
	512
	513	/* quick check to see if there are any async IO requests queued up */
	514	if (aio_get_all_queues_count() < 1) {
	515	result = 0;
	516	*retval = AIO_ALLDONE;
	517	goto ExitRoutine;
	518	}
	519
	520	*retval = -1;
	521	if ( uap->aiocbp != USER_ADDR_NULL ) {
	522	if ( proc_is64bit(p) ) {
	523	struct user64_aiocb aiocb64;
	524
	525	result = copyin( uap->aiocbp, &aiocb64, sizeof(aiocb64) );
	526	if (result == 0 )
	527	do_munge_aiocb_user64_to_user(&aiocb64, &my_aiocb);
	528
	529	} else {
	530	struct user32_aiocb aiocb32;
	531
	532	result = copyin( uap->aiocbp, &aiocb32, sizeof(aiocb32) );
	533	if ( result == 0 )
	534	do_munge_aiocb_user32_to_user( &aiocb32, &my_aiocb );
	535	}
	536
	537	if ( result != 0 ) {
	538	result = EAGAIN;
	539	goto ExitRoutine;
	540	}
	541
	542	/* NOTE - POSIX standard says a mismatch between the file */
	543	/* descriptor passed in and the file descriptor embedded in */
	544	/* the aiocb causes unspecified results. We return EBADF in */
	545	/* that situation. */
	546	if ( uap->fd != my_aiocb.aio_fildes ) {
	547	result = EBADF;
	548	goto ExitRoutine;
	549	}
	550	}
	551
	552	aio_proc_lock(p);
	553	result = do_aio_cancel_locked( p, uap->fd, uap->aiocbp, 0, FALSE );
	554	ASSERT_AIO_PROC_LOCK_OWNED(p);
	555	aio_proc_unlock(p);
	556
	557	if ( result != -1 ) {
	558	*retval = result;
	559	result = 0;
	560	goto ExitRoutine;
	561	}
	562
	563	result = EBADF;
	564
	565	ExitRoutine:
	566	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) \| DBG_FUNC_END,
	567	(int)p, (int)uap->aiocbp, result, 0, 0 );
	568
	569	return( result );
	570
	571	} /* aio_cancel */
	572
	573
	574	/*
	575	* _aio_close - internal function used to clean up async IO requests for
	576	* a file descriptor that is closing.
	577	* THIS MAY BLOCK.
	578	*/
	579	__private_extern__ void
	580	_aio_close(proc_t p, int fd )
	581	{
	582	int error;
	583
	584	/* quick check to see if there are any async IO requests queued up */
	585	if (aio_get_all_queues_count() < 1) {
	586	return;
	587	}
	588
	589	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) \| DBG_FUNC_START,
	590	(int)p, fd, 0, 0, 0 );
	591
	592	/* cancel all async IO requests on our todo queues for this file descriptor */
	593	aio_proc_lock(p);
	594	error = do_aio_cancel_locked( p, fd, 0, AIO_CLOSE_WAIT, FALSE );
	595	ASSERT_AIO_PROC_LOCK_OWNED(p);
	596	if ( error == AIO_NOTCANCELED ) {
	597	/*
	598	* AIO_NOTCANCELED is returned when we find an aio request for this process
	599	* and file descriptor on the active async IO queue. Active requests cannot
	600	* be cancelled so we must wait for them to complete. We will get a special
	601	* wake up call on our channel used to sleep for ALL active requests to
	602	* complete. This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
	603	* when we must wait for all active aio requests.
	604	*/
	605
	606	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) \| DBG_FUNC_NONE,
	607	(int)p, fd, 0, 0, 0 );
	608
	609	while (aio_proc_active_requests_for_file(p, fd) > 0) {
	610	msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_close", 0 );
	611	}
	612
	613	}
	614
	615	aio_proc_unlock(p);
	616
	617	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) \| DBG_FUNC_END,
	618	(int)p, fd, 0, 0, 0 );
	619
	620	return;
	621
	622	} /* _aio_close */
	623
	624
	625	/*
	626	* aio_error - return the error status associated with the async IO
	627	* request referred to by uap->aiocbp. The error status is the errno
	628	* value that would be set by the corresponding IO request (read, wrtie,
	629	* fdatasync, or sync).
	630	*/
	631	int
	632	aio_error(proc_t p, struct aio_error_args uap, int retval )
	633	{
	634	aio_workq_entry *entryp;
	635	int error;
	636
	637	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) \| DBG_FUNC_START,
	638	(int)p, (int)uap->aiocbp, 0, 0, 0 );
	639
	640	/* see if there are any aios to check */
	641	if (aio_get_all_queues_count() < 1) {
	642	return EINVAL;
	643	}
	644
	645	aio_proc_lock(p);
	646
	647	/* look for a match on our queue of async IO requests that have completed */
	648	TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
	649	if ( entryp->uaiocbp == uap->aiocbp ) {
	650	ASSERT_AIO_FROM_PROC(entryp, p);
	651
	652	aio_entry_lock_spin(entryp);
	653	*retval = entryp->errorval;
	654	error = 0;
	655	aio_entry_unlock(entryp);
	656	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) \| DBG_FUNC_NONE,
	657	(int)p, (int)uap->aiocbp, *retval, 0, 0 );
	658	goto ExitRoutine;
	659	}
	660	}
	661
	662	/* look for a match on our queue of active async IO requests */
	663	TAILQ_FOREACH( entryp, &p->p_aio_activeq, aio_proc_link) {
	664	if ( entryp->uaiocbp == uap->aiocbp ) {
	665	ASSERT_AIO_FROM_PROC(entryp, p);
	666	*retval = EINPROGRESS;
	667	error = 0;
	668	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) \| DBG_FUNC_NONE,
	669	(int)p, (int)uap->aiocbp, *retval, 0, 0 );
	670	goto ExitRoutine;
	671	}
	672	}
	673
	674	error = EINVAL;
	675
	676	ExitRoutine:
	677	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) \| DBG_FUNC_END,
	678	(int)p, (int)uap->aiocbp, error, 0, 0 );
	679	aio_proc_unlock(p);
	680
	681	return( error );
	682
	683	} /* aio_error */
	684
	685
	686	/*
	687	* aio_fsync - asynchronously force all IO operations associated
	688	* with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
	689	* queued at the time of the call to the synchronized completion state.
	690	* NOTE - we do not support op O_DSYNC at this point since we do not support the
	691	* fdatasync() call.
	692	*/
	693	int
	694	aio_fsync(proc_t p, struct aio_fsync_args uap, int retval )
	695	{
	696	int error;
	697	int fsync_kind;
	698
	699	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) \| DBG_FUNC_START,
	700	(int)p, (int)uap->aiocbp, uap->op, 0, 0 );
	701
	702	*retval = 0;
	703	/* 0 := O_SYNC for binary backward compatibility with Panther */
	704	if (uap->op == O_SYNC \|\| uap->op == 0)
	705	fsync_kind = AIO_FSYNC;
	706	else if ( uap->op == O_DSYNC )
	707	fsync_kind = AIO_DSYNC;
	708	else {
	709	*retval = -1;
	710	error = EINVAL;
	711	goto ExitRoutine;
	712	}
	713
	714	error = aio_queue_async_request( p, uap->aiocbp, fsync_kind );
	715	if ( error != 0 )
	716	*retval = -1;
	717
	718	ExitRoutine:
	719	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) \| DBG_FUNC_END,
	720	(int)p, (int)uap->aiocbp, error, 0, 0 );
	721
	722	return( error );
	723
	724	} /* aio_fsync */
	725
	726
	727	/* aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
	728	* file descriptor (uap->aiocbp->aio_fildes) into the buffer
	729	* (uap->aiocbp->aio_buf).
	730	*/
	731	int
	732	aio_read(proc_t p, struct aio_read_args uap, int retval )
	733	{
	734	int error;
	735
	736	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) \| DBG_FUNC_START,
	737	(int)p, (int)uap->aiocbp, 0, 0, 0 );
	738
	739	*retval = 0;
	740
	741	error = aio_queue_async_request( p, uap->aiocbp, AIO_READ );
	742	if ( error != 0 )
	743	*retval = -1;
	744
	745	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) \| DBG_FUNC_END,
	746	(int)p, (int)uap->aiocbp, error, 0, 0 );
	747
	748	return( error );
	749
	750	} /* aio_read */
	751
	752
	753	/*
	754	* aio_return - return the return status associated with the async IO
	755	* request referred to by uap->aiocbp. The return status is the value
	756	* that would be returned by corresponding IO request (read, write,
	757	* fdatasync, or sync). This is where we release kernel resources
	758	* held for async IO call associated with the given aiocb pointer.
	759	*/
	760	int
	761	aio_return(proc_t p, struct aio_return_args uap, user_ssize_t retval )
	762	{
	763	aio_workq_entry *entryp;
	764	int error;
	765	boolean_t proc_lock_held = FALSE;
	766
	767	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) \| DBG_FUNC_START,
	768	(int)p, (int)uap->aiocbp, 0, 0, 0 );
	769
	770	/* See if there are any entries to check */
	771	if (aio_get_all_queues_count() < 1) {
	772	error = EINVAL;
	773	goto ExitRoutine;
	774	}
	775
	776	aio_proc_lock(p);
	777	proc_lock_held = TRUE;
	778	*retval = 0;
	779
	780	/* look for a match on our queue of async IO requests that have completed */
	781	TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
	782	ASSERT_AIO_FROM_PROC(entryp, p);
	783	if ( entryp->uaiocbp == uap->aiocbp ) {
	784	/* Done and valid for aio_return(), pull it off the list */
	785	aio_proc_remove_done_locked(p, entryp);
	786
	787	/* Drop the proc lock, but keep the entry locked */
	788	aio_entry_lock(entryp);
	789	aio_proc_unlock(p);
	790	proc_lock_held = FALSE;
	791
	792	*retval = entryp->returnval;
	793	error = 0;
	794
	795	/* No references and off all lists, safe to free */
	796	if (entryp->aio_refcount == 0) {
	797	aio_entry_unlock(entryp);
	798	aio_free_request(entryp);
	799	}
	800	else {
	801	/* Whoever has the refcount will have to free it */
	802	entryp->flags \|= AIO_DO_FREE;
	803	aio_entry_unlock(entryp);
	804	}
	805
	806
	807	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) \| DBG_FUNC_NONE,
	808	(int)p, (int)uap->aiocbp, *retval, 0, 0 );
	809	goto ExitRoutine;
	810	}
	811	}
	812
	813	/* look for a match on our queue of active async IO requests */
	814	TAILQ_FOREACH( entryp, &p->p_aio_activeq, aio_proc_link) {
	815	ASSERT_AIO_FROM_PROC(entryp, p);
	816	if ( entryp->uaiocbp == uap->aiocbp ) {
	817	error = EINPROGRESS;
	818	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) \| DBG_FUNC_NONE,
	819	(int)p, (int)uap->aiocbp, *retval, 0, 0 );
	820	goto ExitRoutine;
	821	}
	822	}
	823
	824	error = EINVAL;
	825
	826	ExitRoutine:
	827	if (proc_lock_held)
	828	aio_proc_unlock(p);
	829	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) \| DBG_FUNC_END,
	830	(int)p, (int)uap->aiocbp, error, 0, 0 );
	831
	832	return( error );
	833
	834	} /* aio_return */
	835
	836
	837	/*
	838	* _aio_exec - internal function used to clean up async IO requests for
	839	* a process that is going away due to exec(). We cancel any async IOs
	840	* we can and wait for those already active. We also disable signaling
	841	* for cancelled or active aio requests that complete.
	842	* This routine MAY block!
	843	*/
	844	__private_extern__ void
	845	_aio_exec(proc_t p )
	846	{
	847
	848	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) \| DBG_FUNC_START,
	849	(int)p, 0, 0, 0, 0 );
	850
	851	_aio_exit( p );
	852
	853	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) \| DBG_FUNC_END,
	854	(int)p, 0, 0, 0, 0 );
	855
	856	return;
	857
	858	} /* _aio_exec */
	859
	860
	861	/*
	862	* _aio_exit - internal function used to clean up async IO requests for
	863	* a process that is terminating (via exit() or exec() ). We cancel any async IOs
	864	* we can and wait for those already active. We also disable signaling
	865	* for cancelled or active aio requests that complete. This routine MAY block!
	866	*/
	867	__private_extern__ void
	868	_aio_exit(proc_t p )
	869	{
	870	int error;
	871	aio_workq_entry *entryp;
	872
	873
	874	/* quick check to see if there are any async IO requests queued up */
	875	if (aio_get_all_queues_count() < 1) {
	876	return;
	877	}
	878
	879	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) \| DBG_FUNC_START,
	880	(int)p, 0, 0, 0, 0 );
	881
	882	aio_proc_lock(p);
	883
	884	/*
	885	* cancel async IO requests on the todo work queue and wait for those
	886	* already active to complete.
	887	*/
	888	error = do_aio_cancel_locked( p, 0, 0, AIO_EXIT_WAIT, TRUE );
	889	ASSERT_AIO_PROC_LOCK_OWNED(p);
	890	if ( error == AIO_NOTCANCELED ) {
	891	/*
	892	* AIO_NOTCANCELED is returned when we find an aio request for this process
	893	* on the active async IO queue. Active requests cannot be cancelled so we
	894	* must wait for them to complete. We will get a special wake up call on
	895	* our channel used to sleep for ALL active requests to complete. This sleep
	896	* channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
	897	* active aio requests.
	898	*/
	899
	900	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) \| DBG_FUNC_NONE,
	901	(int)p, 0, 0, 0, 0 );
	902
	903	while (p->p_aio_active_count != 0) {
	904	msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_exit", 0 );
	905	}
	906	}
	907
	908	if (p->p_aio_active_count != 0) {
	909	panic("Exiting process has %d active AIOs after cancellation has completed.\n", p->p_aio_active_count);
	910	}
	911
	912	/* release all aio resources used by this process */
	913	entryp = TAILQ_FIRST( &p->p_aio_doneq );
	914	while ( entryp != NULL ) {
	915	ASSERT_AIO_FROM_PROC(entryp, p);
	916	aio_workq_entry *next_entryp;
	917
	918	next_entryp = TAILQ_NEXT( entryp, aio_proc_link);
	919	aio_proc_remove_done_locked(p, entryp);
	920
	921	/* we cannot free requests that are still completing */
	922	aio_entry_lock_spin(entryp);
	923	if (entryp->aio_refcount == 0) {
	924	aio_proc_unlock(p);
	925	aio_entry_unlock(entryp);
	926	aio_free_request(entryp);
	927
	928	/* need to start over since aio_doneq may have been */
	929	/* changed while we were away. */
	930	aio_proc_lock(p);
	931	entryp = TAILQ_FIRST( &p->p_aio_doneq );
	932	continue;
	933	}
	934	else {
	935	/* whoever has the reference will have to do the free */
	936	entryp->flags \|= AIO_DO_FREE;
	937	}
	938
	939	aio_entry_unlock(entryp);
	940	entryp = next_entryp;
	941	}
	942
	943	aio_proc_unlock(p);
	944
	945	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) \| DBG_FUNC_END,
	946	(int)p, 0, 0, 0, 0 );
	947	return;
	948
	949	} /* _aio_exit */
	950
	951
	952	static boolean_t
	953	should_cancel(aio_workq_entry *entryp, user_addr_t aiocbp, int fd)
	954	{
	955	if ( (aiocbp == USER_ADDR_NULL && fd == 0) \|\|
	956	(aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) \|\|
	957	(aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
	958	return TRUE;
	959	}
	960
	961	return FALSE;
	962	}
	963
	964	/*
	965	* do_aio_cancel_locked - cancel async IO requests (if possible). We get called by
	966	* aio_cancel, close, and at exit.
	967	* There are three modes of operation: 1) cancel all async IOs for a process -
	968	* fd is 0 and aiocbp is NULL 2) cancel all async IOs for file descriptor - fd
	969	* is > 0 and aiocbp is NULL 3) cancel one async IO associated with the given
	970	* aiocbp.
	971	* Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
	972	* target async IO requests, AIO_NOTCANCELED if we could not cancel all
	973	* target async IO requests, and AIO_ALLDONE if all target async IO requests
	974	* were already complete.
	975	* WARNING - do not dereference aiocbp in this routine, it may point to user
	976	* land data that has not been copied in (when called from aio_cancel() )
	977	*
	978	* Called with proc locked, and returns the same way.
	979	*/
	980	static int
	981	do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp,
	982	int wait_for_completion, boolean_t disable_notification )
	983	{
	984	ASSERT_AIO_PROC_LOCK_OWNED(p);
	985
	986	aio_workq_entry *entryp;
	987	int result;
	988
	989	result = -1;
	990
	991	/* look for a match on our queue of async todo work. */
	992	entryp = TAILQ_FIRST(&p->p_aio_activeq);
	993	while ( entryp != NULL ) {
	994	ASSERT_AIO_FROM_PROC(entryp, p);
	995	aio_workq_entry *next_entryp;
	996
	997	next_entryp = TAILQ_NEXT( entryp, aio_proc_link);
	998	if (!should_cancel(entryp, aiocbp, fd)) {
	999	entryp = next_entryp;
	1000	continue;
	1001	}
	1002
	1003	/* Can only be cancelled if it's still on a work queue */
	1004	if (aio_entry_try_workq_remove(entryp) != 0) {
	1005	/* Have removed from workq. Update entry state and take a ref */
	1006	aio_entry_update_for_cancel(entryp, TRUE, 0, disable_notification);
	1007
	1008	/* Put on the proc done queue and update counts, then unlock the proc */
	1009	aio_proc_move_done_locked(p, entryp);
	1010	aio_proc_unlock(p);
	1011
	1012	/* Now it's officially cancelled. Do the completion */
	1013	result = AIO_CANCELED;
	1014	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) \| DBG_FUNC_NONE,
	1015	(int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
	1016	do_aio_completion(entryp);
	1017
	1018	/* This will free if the aio_return() has already happened ... */
	1019	aio_entry_unref(entryp);
	1020	aio_proc_lock(p);
	1021
	1022	if ( aiocbp != USER_ADDR_NULL ) {
	1023	return( result );
	1024	}
	1025
	1026	/*
	1027	* Restart from the head of the proc active queue since it
	1028	* may have been changed while we were away doing completion
	1029	* processing.
	1030	*
	1031	* Note that if we found an uncancellable AIO before, we will
	1032	* either find it again or discover that it's been completed,
	1033	* so resetting the result will not cause us to return success
	1034	* despite outstanding AIOs.
	1035	*/
	1036	entryp = TAILQ_FIRST(&p->p_aio_activeq);
	1037	result = -1; /* As if beginning anew */
	1038	} else {
	1039	/*
	1040	* It's been taken off the active queue already, i.e. is in flight.
	1041	* All we can do is ask for notification.
	1042	*/
	1043	result = AIO_NOTCANCELED;
	1044
	1045	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) \| DBG_FUNC_NONE,
	1046	(int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
	1047
	1048	/* Mark for waiting and such; will not take a ref if "cancelled" arg is FALSE */
	1049	aio_entry_update_for_cancel(entryp, FALSE, wait_for_completion, disable_notification);
	1050
	1051	if ( aiocbp != USER_ADDR_NULL ) {
	1052	return( result );
	1053	}
	1054	entryp = next_entryp;
	1055	}
	1056	} /* while... */
	1057
	1058	/*
	1059	* if we didn't find any matches on the todo or active queues then look for a
	1060	* match on our queue of async IO requests that have completed and if found
	1061	* return AIO_ALLDONE result.
	1062	*
	1063	* Proc AIO lock is still held.
	1064	*/
	1065	if ( result == -1 ) {
	1066	TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
	1067	ASSERT_AIO_FROM_PROC(entryp, p);
	1068	if (should_cancel(entryp, aiocbp, fd)) {
	1069	result = AIO_ALLDONE;
	1070	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) \| DBG_FUNC_NONE,
	1071	(int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
	1072
	1073	if ( aiocbp != USER_ADDR_NULL ) {
	1074	return( result );
	1075	}
	1076	}
	1077	}
	1078	}
	1079
	1080	return( result );
	1081
	1082	}
	1083	/* do_aio_cancel_locked */
	1084
	1085
	1086	/*
	1087	* aio_suspend - suspend the calling thread until at least one of the async
	1088	* IO operations referenced by uap->aiocblist has completed, until a signal
	1089	* interrupts the function, or uap->timeoutp time interval (optional) has
	1090	* passed.
	1091	* Returns 0 if one or more async IOs have completed else -1 and errno is
	1092	* set appropriately - EAGAIN if timeout elapses or EINTR if an interrupt
	1093	* woke us up.
	1094	*/
	1095	int
	1096	aio_suspend(proc_t p, struct aio_suspend_args uap, int retval )
	1097	{
	1098	__pthread_testcancel(1);
	1099	return(aio_suspend_nocancel(p, (struct aio_suspend_nocancel_args *)uap, retval));
	1100	}
	1101
	1102
	1103	int
	1104	aio_suspend_nocancel(proc_t p, struct aio_suspend_nocancel_args uap, int retval )
	1105	{
	1106	int error;
	1107	int i, count;
	1108	uint64_t abstime;
	1109	struct user_timespec ts;
	1110	aio_workq_entry *entryp;
	1111	user_addr_t *aiocbpp;
	1112
	1113	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) \| DBG_FUNC_START,
	1114	(int)p, uap->nent, 0, 0, 0 );
	1115
	1116	*retval = -1;
	1117	abstime = 0;
	1118	aiocbpp = NULL;
	1119
	1120	count = aio_get_all_queues_count( );
	1121	if ( count < 1 ) {
	1122	error = EINVAL;
	1123	goto ExitThisRoutine;
	1124	}
	1125
	1126	if ( uap->nent < 1 \|\| uap->nent > aio_max_requests_per_process ) {
	1127	error = EINVAL;
	1128	goto ExitThisRoutine;
	1129	}
	1130
	1131	if ( uap->timeoutp != USER_ADDR_NULL ) {
	1132	if ( proc_is64bit(p) ) {
	1133	struct user64_timespec temp;
	1134	error = copyin( uap->timeoutp, &temp, sizeof(temp) );
	1135	if ( error == 0 ) {
	1136	ts.tv_sec = temp.tv_sec;
	1137	ts.tv_nsec = temp.tv_nsec;
	1138	}
	1139	}
	1140	else {
	1141	struct user32_timespec temp;
	1142	error = copyin( uap->timeoutp, &temp, sizeof(temp) );
	1143	if ( error == 0 ) {
	1144	ts.tv_sec = temp.tv_sec;
	1145	ts.tv_nsec = temp.tv_nsec;
	1146	}
	1147	}
	1148	if ( error != 0 ) {
	1149	error = EAGAIN;
	1150	goto ExitThisRoutine;
	1151	}
	1152
	1153	if ( ts.tv_sec < 0 \|\| ts.tv_nsec < 0 \|\| ts.tv_nsec >= 1000000000 ) {
	1154	error = EINVAL;
	1155	goto ExitThisRoutine;
	1156	}
	1157
	1158	nanoseconds_to_absolutetime( (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
	1159	&abstime );
	1160	clock_absolutetime_interval_to_deadline( abstime, &abstime );
	1161	}
	1162
	1163	aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent);
	1164	if ( aiocbpp == NULL ) {
	1165	error = EAGAIN;
	1166	goto ExitThisRoutine;
	1167	}
	1168
	1169	/* check list of aio requests to see if any have completed */
	1170	check_for_our_aiocbp:
	1171	aio_proc_lock_spin(p);
	1172	for ( i = 0; i < uap->nent; i++ ) {
	1173	user_addr_t aiocbp;
	1174
	1175	/* NULL elements are legal so check for 'em */
	1176	aiocbp = *(aiocbpp + i);
	1177	if ( aiocbp == USER_ADDR_NULL )
	1178	continue;
	1179
	1180	/* return immediately if any aio request in the list is done */
	1181	TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
	1182	ASSERT_AIO_FROM_PROC(entryp, p);
	1183	if ( entryp->uaiocbp == aiocbp ) {
	1184	aio_proc_unlock(p);
	1185	*retval = 0;
	1186	error = 0;
	1187	goto ExitThisRoutine;
	1188	}
	1189	}
	1190	} /* for ( ; i < uap->nent; ) */
	1191
	1192	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) \| DBG_FUNC_NONE,
	1193	(int)p, uap->nent, 0, 0, 0 );
	1194
	1195	/*
	1196	* wait for an async IO to complete or a signal fires or timeout expires.
	1197	* we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
	1198	* interrupts us. If an async IO completes before a signal fires or our
	1199	* timeout expires, we get a wakeup call from aio_work_thread().
	1200	*/
	1201
	1202	error = msleep1(&p->AIO_SUSPEND_SLEEP_CHAN, aio_proc_mutex(p), PCATCH \| PWAIT \| PDROP, "aio_suspend", abstime); /* XXX better priority? */
	1203	if ( error == 0 ) {
	1204	/*
	1205	* got our wakeup call from aio_work_thread().
	1206	* Since we can get a wakeup on this channel from another thread in the
	1207	* same process we head back up to make sure this is for the correct aiocbp.
	1208	* If it is the correct aiocbp we will return from where we do the check
	1209	* (see entryp->uaiocbp == aiocbp after check_for_our_aiocbp label)
	1210	* else we will fall out and just sleep again.
	1211	*/
	1212	goto check_for_our_aiocbp;
	1213	}
	1214	else if ( error == EWOULDBLOCK ) {
	1215	/* our timeout expired */
	1216	error = EAGAIN;
	1217	}
	1218	else {
	1219	/* we were interrupted */
	1220	error = EINTR;
	1221	}
	1222
	1223	ExitThisRoutine:
	1224	if ( aiocbpp != NULL )
	1225	FREE( aiocbpp, M_TEMP );
	1226
	1227	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) \| DBG_FUNC_END,
	1228	(int)p, uap->nent, error, 0, 0 );
	1229
	1230	return( error );
	1231
	1232	} /* aio_suspend */
	1233
	1234
	1235	/* aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
	1236	* file descriptor (uap->aiocbp->aio_fildes) from the buffer
	1237	* (uap->aiocbp->aio_buf).
	1238	*/
	1239
	1240	int
	1241	aio_write(proc_t p, struct aio_write_args uap, int retval )
	1242	{
	1243	int error;
	1244
	1245	*retval = 0;
	1246
	1247	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) \| DBG_FUNC_START,
	1248	(int)p, (int)uap->aiocbp, 0, 0, 0 );
	1249
	1250	error = aio_queue_async_request( p, uap->aiocbp, AIO_WRITE );
	1251	if ( error != 0 )
	1252	*retval = -1;
	1253
	1254	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) \| DBG_FUNC_END,
	1255	(int)p, (int)uap->aiocbp, error, 0, 0 );
	1256
	1257	return( error );
	1258
	1259	} /* aio_write */
	1260
	1261
	1262	static user_addr_t *
	1263	aio_copy_in_list(proc_t procp, user_addr_t aiocblist, int nent)
	1264	{
	1265	user_addr_t *aiocbpp;
	1266	int i, result;
	1267
	1268	/* we reserve enough space for largest possible pointer size */
	1269	MALLOC( aiocbpp, user_addr_t , (nent sizeof(user_addr_t)), M_TEMP, M_WAITOK );
	1270	if ( aiocbpp == NULL )
	1271	goto err;
	1272
	1273	/* copyin our aiocb pointers from list */
	1274	result = copyin( aiocblist, aiocbpp,
	1275	proc_is64bit(procp) ? (nent * sizeof(user64_addr_t))
	1276	: (nent * sizeof(user32_addr_t)) );
	1277	if ( result) {
	1278	FREE( aiocbpp, M_TEMP );
	1279	aiocbpp = NULL;
	1280	goto err;
	1281	}
	1282
	1283	/*
	1284	* We depend on a list of user_addr_t's so we need to
	1285	* munge and expand when these pointers came from a
	1286	* 32-bit process
	1287	*/
	1288	if ( !proc_is64bit(procp) ) {
	1289	/* copy from last to first to deal with overlap */
	1290	user32_addr_t my_ptrp = ((user32_addr_t )aiocbpp) + (nent - 1);
	1291	user_addr_t *my_addrp = aiocbpp + (nent - 1);
	1292
	1293	for (i = 0; i < nent; i++, my_ptrp--, my_addrp--) {
	1294	my_addrp = (user_addr_t) (my_ptrp);
	1295	}
	1296	}
	1297
	1298	err:
	1299	return (aiocbpp);
	1300	}
	1301
	1302
	1303	static int
	1304	aio_copy_in_sigev(proc_t procp, user_addr_t sigp, struct user_sigevent *sigev)
	1305	{
	1306	int result = 0;
	1307
	1308	if (sigp == USER_ADDR_NULL)
	1309	goto out;
	1310
	1311	/*
	1312	* We need to munge aio_sigevent since it contains pointers.
	1313	* Since we do not know if sigev_value is an int or a ptr we do
	1314	* NOT cast the ptr to a user_addr_t. This means if we send
	1315	* this info back to user space we need to remember sigev_value
	1316	* was not expanded for the 32-bit case.
	1317	*
	1318	* Notes: This does NOT affect us since we don't support
	1319	* sigev_value yet in the aio context.
	1320	*/
	1321	if ( proc_is64bit(procp) ) {
	1322	struct user64_sigevent sigevent64;
	1323
	1324	result = copyin( sigp, &sigevent64, sizeof(sigevent64) );
	1325	if ( result == 0 ) {
	1326	sigev->sigev_notify = sigevent64.sigev_notify;
	1327	sigev->sigev_signo = sigevent64.sigev_signo;
	1328	sigev->sigev_value.size_equivalent.sival_int = sigevent64.sigev_value.size_equivalent.sival_int;
	1329	sigev->sigev_notify_function = sigevent64.sigev_notify_function;
	1330	sigev->sigev_notify_attributes = sigevent64.sigev_notify_attributes;
	1331	}
	1332
	1333	} else {
	1334	struct user32_sigevent sigevent32;
	1335
	1336	result = copyin( sigp, &sigevent32, sizeof(sigevent32) );
	1337	if ( result == 0 ) {
	1338	sigev->sigev_notify = sigevent32.sigev_notify;
	1339	sigev->sigev_signo = sigevent32.sigev_signo;
	1340	sigev->sigev_value.size_equivalent.sival_int = sigevent32.sigev_value.sival_int;
	1341	sigev->sigev_notify_function = CAST_USER_ADDR_T(sigevent32.sigev_notify_function);
	1342	sigev->sigev_notify_attributes = CAST_USER_ADDR_T(sigevent32.sigev_notify_attributes);
	1343	}
	1344	}
	1345
	1346	if ( result != 0 ) {
	1347	result = EAGAIN;
	1348	}
	1349
	1350	out:
	1351	return (result);
	1352	}
	1353
	1354	/*
	1355	* aio_enqueue_work
	1356	*
	1357	* Queue up the entry on the aio asynchronous work queue in priority order
	1358	* based on the relative priority of the request. We calculate the relative
	1359	* priority using the nice value of the caller and the value
	1360	*
	1361	* Parameters: procp Process queueing the I/O
	1362	* entryp The work queue entry being queued
	1363	*
	1364	* Returns: (void) No failure modes
	1365	*
	1366	* Notes: This function is used for both lio_listio and aio
	1367	*
	1368	* XXX: At some point, we may have to consider thread priority
	1369	* rather than process priority, but we don't maintain the
	1370	* adjusted priority for threads the POSIX way.
	1371	*
	1372	*
	1373	* Called with proc locked.
	1374	*/
	1375	static void
	1376	aio_enqueue_work( proc_t procp, aio_workq_entry *entryp, int proc_locked)
	1377	{
	1378	#if 0
	1379	aio_workq_entry my_entryp; / used for insertion sort */
	1380	#endif /* 0 */
	1381	aio_workq_t queue = aio_entry_workq(entryp);
	1382
	1383	if (proc_locked == 0) {
	1384	aio_proc_lock(procp);
	1385	}
	1386
	1387	ASSERT_AIO_PROC_LOCK_OWNED(procp);
	1388
	1389	/* Onto proc queue */
	1390	TAILQ_INSERT_TAIL(&procp->p_aio_activeq, entryp, aio_proc_link);
	1391	procp->p_aio_active_count++;
	1392	procp->p_aio_total_count++;
	1393
	1394	/* And work queue */
	1395	aio_workq_lock_spin(queue);
	1396	aio_workq_add_entry_locked(queue, entryp);
	1397	waitq_wakeup64_one(&queue->aioq_waitq, CAST_EVENT64_T(queue),
	1398	THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
	1399	aio_workq_unlock(queue);
	1400
	1401	if (proc_locked == 0) {
	1402	aio_proc_unlock(procp);
	1403	}
	1404
	1405	#if 0
	1406	/*
	1407	* Procedure:
	1408	*
	1409	* (1) The nice value is in the range PRIO_MIN..PRIO_MAX [-20..20]
	1410	* (2) The normalized nice value is in the range 0..((2 * NZERO) - 1)
	1411	* which is [0..39], with 0 not being used. In nice values, the
	1412	* lower the nice value, the higher the priority.
	1413	* (3) The normalized scheduling prioritiy is the highest nice value
	1414	* minus the current nice value. In I/O scheduling priority, the
	1415	* higher the value the lower the priority, so it is the inverse
	1416	* of the nice value (the higher the number, the higher the I/O
	1417	* priority).
	1418	* (4) From the normalized scheduling priority, we subtract the
	1419	* request priority to get the request priority value number;
	1420	* this means that requests are only capable of depressing their
	1421	* priority relative to other requests,
	1422	*/
	1423	entryp->priority = (((2 * NZERO) - 1) - procp->p_nice);
	1424
	1425	/* only premit depressing the priority */
	1426	if (entryp->aiocb.aio_reqprio < 0)
	1427	entryp->aiocb.aio_reqprio = 0;
	1428	if (entryp->aiocb.aio_reqprio > 0) {
	1429	entryp->priority -= entryp->aiocb.aio_reqprio;
	1430	if (entryp->priority < 0)
	1431	entryp->priority = 0;
	1432	}
	1433
	1434	/* Insertion sort the entry; lowest ->priority to highest */
	1435	TAILQ_FOREACH(my_entryp, &aio_anchor.aio_async_workq, aio_workq_link) {
	1436	if ( entryp->priority <= my_entryp->priority) {
	1437	TAILQ_INSERT_BEFORE(my_entryp, entryp, aio_workq_link);
	1438	break;
	1439	}
	1440	}
	1441	if (my_entryp == NULL)
	1442	TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
	1443	#endif /* 0 */
	1444	}
	1445
	1446
	1447	/*
	1448	* lio_listio - initiate a list of IO requests. We process the list of
	1449	* aiocbs either synchronously (mode == LIO_WAIT) or asynchronously
	1450	* (mode == LIO_NOWAIT).
	1451	*
	1452	* The caller gets error and return status for each aiocb in the list
	1453	* via aio_error and aio_return. We must keep completed requests until
	1454	* released by the aio_return call.
	1455	*/
	1456	int
	1457	lio_listio(proc_t p, struct lio_listio_args uap, int retval )
	1458	{
	1459	int i;
	1460	int call_result;
	1461	int result;
	1462	int old_count;
	1463	aio_workq_entry **entryp_listp;
	1464	user_addr_t *aiocbpp;
	1465	struct user_sigevent aiosigev;
	1466	aio_lio_context *lio_context;
	1467	boolean_t free_context = FALSE;
	1468	uint32_t *paio_offset;
	1469	uint32_t *paio_nbytes;
	1470
	1471	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) \| DBG_FUNC_START,
	1472	(int)p, uap->nent, uap->mode, 0, 0 );
	1473
	1474	entryp_listp = NULL;
	1475	lio_context = NULL;
	1476	aiocbpp = NULL;
	1477	call_result = -1;
	1478	*retval = -1;
	1479	if ( !(uap->mode == LIO_NOWAIT \|\| uap->mode == LIO_WAIT) ) {
	1480	call_result = EINVAL;
	1481	goto ExitRoutine;
	1482	}
	1483
	1484	if ( uap->nent < 1 \|\| uap->nent > AIO_LISTIO_MAX ) {
	1485	call_result = EINVAL;
	1486	goto ExitRoutine;
	1487	}
	1488
	1489	/*
	1490	* allocate a list of aio_workq_entry pointers that we will use
	1491	* to queue up all our requests at once while holding our lock.
	1492	*/
	1493	MALLOC( entryp_listp, void , (uap->nent sizeof(aio_workq_entry *)), M_TEMP, M_WAITOK );
	1494	if ( entryp_listp == NULL ) {
	1495	call_result = EAGAIN;
	1496	goto ExitRoutine;
	1497	}
	1498
	1499	MALLOC( lio_context, aio_lio_context*, sizeof(aio_lio_context), M_TEMP, M_WAITOK );
	1500	if ( lio_context == NULL ) {
	1501	call_result = EAGAIN;
	1502	goto ExitRoutine;
	1503	}
	1504
	1505	#if DEBUG
	1506	OSIncrementAtomic(&lio_contexts_alloced);
	1507	#endif /* DEBUG */
	1508
	1509	bzero(lio_context, sizeof(aio_lio_context));
	1510
	1511	aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent);
	1512	if ( aiocbpp == NULL ) {
	1513	call_result = EAGAIN;
	1514	goto ExitRoutine;
	1515	}
	1516
	1517	/*
	1518	* Use sigevent passed in to lio_listio for each of our calls, but
	1519	* only do completion notification after the last request completes.
	1520	*/
	1521	bzero(&aiosigev, sizeof(aiosigev));
	1522	/* Only copy in an sigev if the user supplied one */
	1523	if (uap->sigp != USER_ADDR_NULL) {
	1524	call_result = aio_copy_in_sigev(p, uap->sigp, &aiosigev);
	1525	if ( call_result)
	1526	goto ExitRoutine;
	1527	}
	1528
	1529	/* process list of aio requests */
	1530	lio_context->io_issued = uap->nent;
	1531	lio_context->io_waiter = uap->mode == LIO_WAIT ? 1 : 0; /* Should it be freed by last AIO */
	1532	for ( i = 0; i < uap->nent; i++ ) {
	1533	user_addr_t my_aiocbp;
	1534	aio_workq_entry *entryp;
	1535
	1536	*(entryp_listp + i) = NULL;
	1537	my_aiocbp = *(aiocbpp + i);
	1538
	1539	/* NULL elements are legal so check for 'em */
	1540	if ( my_aiocbp == USER_ADDR_NULL ) {
	1541	aio_proc_lock_spin(p);
	1542	lio_context->io_issued--;
	1543	aio_proc_unlock(p);
	1544	continue;
	1545	}
	1546
	1547	/*
	1548	* We use lio_context to mark IO requests for delayed completion
	1549	* processing which means we wait until all IO requests in the
	1550	* group have completed before we either return to the caller
	1551	* when mode is LIO_WAIT or signal user when mode is LIO_NOWAIT.
	1552	*
	1553	* We use the address of the lio_context for this, since it is
	1554	* unique in the address space.
	1555	*/
	1556	result = lio_create_entry( p, my_aiocbp, lio_context, (entryp_listp + i) );
	1557	if ( result != 0 && call_result == -1 )
	1558	call_result = result;
	1559
	1560	/* NULL elements are legal so check for 'em */
	1561	entryp = *(entryp_listp + i);
	1562	if ( entryp == NULL ) {
	1563	aio_proc_lock_spin(p);
	1564	lio_context->io_issued--;
	1565	aio_proc_unlock(p);
	1566	continue;
	1567	}
	1568
	1569	if ( uap->mode == LIO_NOWAIT ) {
	1570	/* Set signal hander, if any */
	1571	entryp->aiocb.aio_sigevent = aiosigev;
	1572	} else {
	1573	/* flag that this thread blocks pending completion */
	1574	entryp->flags \|= AIO_LIO_NOTIFY;
	1575	}
	1576
	1577	/* check our aio limits to throttle bad or rude user land behavior */
	1578	old_count = aio_increment_total_count();
	1579
	1580	aio_proc_lock_spin(p);
	1581	if ( old_count >= aio_max_requests \|\|
	1582	aio_get_process_count( entryp->procp ) >= aio_max_requests_per_process \|\|
	1583	is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
	1584
	1585	lio_context->io_issued--;
	1586	aio_proc_unlock(p);
	1587
	1588	aio_decrement_total_count();
	1589
	1590	if ( call_result == -1 )
	1591	call_result = EAGAIN;
	1592	aio_free_request(entryp);
	1593	entryp_listp[i] = NULL;
	1594	continue;
	1595	}
	1596
	1597	lck_mtx_convert_spin(aio_proc_mutex(p));
	1598	aio_enqueue_work(p, entryp, 1);
	1599	aio_proc_unlock(p);
	1600
	1601	KERNEL_DEBUG_CONSTANT( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) \| DBG_FUNC_START,
	1602	(int)p, (int)entryp->uaiocbp, entryp->flags, entryp->aiocb.aio_fildes, 0 );
	1603	paio_offset = (uint32_t*) &entryp->aiocb.aio_offset;
	1604	paio_nbytes = (uint32_t*) &entryp->aiocb.aio_nbytes;
	1605	KERNEL_DEBUG_CONSTANT( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) \| DBG_FUNC_END,
	1606	paio_offset[0], (sizeof(entryp->aiocb.aio_offset) == sizeof(uint64_t) ? paio_offset[1] : 0),
	1607	paio_nbytes[0], (sizeof(entryp->aiocb.aio_nbytes) == sizeof(uint64_t) ? paio_nbytes[1] : 0),
	1608	0 );
	1609	}
	1610
	1611	switch(uap->mode) {
	1612	case LIO_WAIT:
	1613	aio_proc_lock_spin(p);
	1614	while (lio_context->io_completed < lio_context->io_issued) {
	1615	result = msleep(lio_context, aio_proc_mutex(p), PCATCH \| PRIBIO \| PSPIN, "lio_listio", 0);
	1616
	1617	/* If we were interrupted, fail out (even if all finished) */
	1618	if (result != 0) {
	1619	call_result = EINTR;
	1620	lio_context->io_waiter = 0;
	1621	break;
	1622	}
	1623	}
	1624
	1625	/* If all IOs have finished must free it */
	1626	if (lio_context->io_completed == lio_context->io_issued) {
	1627	free_context = TRUE;
	1628	}
	1629
	1630	aio_proc_unlock(p);
	1631	break;
	1632
	1633	case LIO_NOWAIT:
	1634	break;
	1635	}
	1636
	1637	/* call_result == -1 means we had no trouble queueing up requests */
	1638	if ( call_result == -1 ) {
	1639	call_result = 0;
	1640	*retval = 0;
	1641	}
	1642
	1643	ExitRoutine:
	1644	if ( entryp_listp != NULL )
	1645	FREE( entryp_listp, M_TEMP );
	1646	if ( aiocbpp != NULL )
	1647	FREE( aiocbpp, M_TEMP );
	1648	if ((lio_context != NULL) && ((lio_context->io_issued == 0) \|\| (free_context == TRUE))) {
	1649	free_lio_context(lio_context);
	1650	}
	1651
	1652	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) \| DBG_FUNC_END,
	1653	(int)p, call_result, 0, 0, 0 );
	1654
	1655	return( call_result );
	1656
	1657	} /* lio_listio */
	1658
	1659
	1660	/*
	1661	* aio worker thread. this is where all the real work gets done.
	1662	* we get a wake up call on sleep channel &aio_anchor.aio_async_workq
	1663	* after new work is queued up.
	1664	*/
	1665	__attribute__((noreturn))
	1666	static void
	1667	aio_work_thread(void)
	1668	{
	1669	aio_workq_entry *entryp;
	1670	int error;
	1671	vm_map_t currentmap;
	1672	vm_map_t oldmap = VM_MAP_NULL;
	1673	task_t oldaiotask = TASK_NULL;
	1674	struct uthread *uthreadp = NULL;
	1675
	1676	for( ;; ) {
	1677	/*
	1678	* returns with the entry ref'ed.
	1679	* sleeps until work is available.
	1680	*/
	1681	entryp = aio_get_some_work();
	1682
	1683	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) \| DBG_FUNC_START,
	1684	(int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0 );
	1685
	1686	/*
	1687	* Assume the target's address space identity for the duration
	1688	* of the IO. Note: don't need to have the entryp locked,
	1689	* because the proc and map don't change until it's freed.
	1690	*/
	1691	currentmap = get_task_map( (current_proc())->task );
	1692	if ( currentmap != entryp->aio_map ) {
	1693	uthreadp = (struct uthread *) get_bsdthread_info(current_thread());
	1694	oldaiotask = uthreadp->uu_aio_task;
	1695	uthreadp->uu_aio_task = entryp->procp->task;
	1696	oldmap = vm_map_switch( entryp->aio_map );
	1697	}
	1698
	1699	if ( (entryp->flags & AIO_READ) != 0 ) {
	1700	error = do_aio_read( entryp );
	1701	}
	1702	else if ( (entryp->flags & AIO_WRITE) != 0 ) {
	1703	error = do_aio_write( entryp );
	1704	}
	1705	else if ( (entryp->flags & (AIO_FSYNC \| AIO_DSYNC)) != 0 ) {
	1706	error = do_aio_fsync( entryp );
	1707	}
	1708	else {
	1709	printf( "%s - unknown aio request - flags 0x%02X \n",
	1710	__FUNCTION__, entryp->flags );
	1711	error = EINVAL;
	1712	}
	1713
	1714	/* Restore old map */
	1715	if ( currentmap != entryp->aio_map ) {
	1716	(void) vm_map_switch( oldmap );
	1717	uthreadp->uu_aio_task = oldaiotask;
	1718	}
	1719
	1720	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) \| DBG_FUNC_END,
	1721	(int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval,
	1722	entryp->returnval, 0 );
	1723
	1724
	1725	/* XXX COUNTS */
	1726	aio_entry_lock_spin(entryp);
	1727	entryp->errorval = error;
	1728	aio_entry_unlock(entryp);
	1729
	1730	/* we're done with the IO request so pop it off the active queue and */
	1731	/* push it on the done queue */
	1732	aio_proc_lock(entryp->procp);
	1733	aio_proc_move_done_locked(entryp->procp, entryp);
	1734	aio_proc_unlock(entryp->procp);
	1735
	1736	OSDecrementAtomic(&aio_anchor.aio_inflight_count);
	1737
	1738	/* remove our reference to the user land map. */
	1739	if ( VM_MAP_NULL != entryp->aio_map ) {
	1740	vm_map_t my_map;
	1741
	1742	my_map = entryp->aio_map;
	1743	entryp->aio_map = VM_MAP_NULL;
	1744	vm_map_deallocate( my_map );
	1745	}
	1746
	1747	/* Provide notifications */
	1748	do_aio_completion( entryp );
	1749
	1750	/* Will free if needed */
	1751	aio_entry_unref(entryp);
	1752
	1753	} /* for ( ;; ) */
	1754
	1755	/* NOT REACHED */
	1756
	1757	} /* aio_work_thread */
	1758
	1759
	1760	/*
	1761	* aio_get_some_work - get the next async IO request that is ready to be executed.
	1762	* aio_fsync complicates matters a bit since we cannot do the fsync until all async
	1763	* IO requests at the time the aio_fsync call came in have completed.
	1764	* NOTE - AIO_LOCK must be held by caller
	1765	*/
	1766	static aio_workq_entry *
	1767	aio_get_some_work( void )
	1768	{
	1769	aio_workq_entry *entryp = NULL;
	1770	aio_workq_t queue = NULL;
	1771
	1772	/* Just one queue for the moment. In the future there will be many. */
	1773	queue = &aio_anchor.aio_async_workqs[0];
	1774	aio_workq_lock_spin(queue);
	1775	if (queue->aioq_count == 0) {
	1776	goto nowork;
	1777	}
	1778
	1779	/*
	1780	* Hold the queue lock.
	1781	*
	1782	* pop some work off the work queue and add to our active queue
	1783	* Always start with the queue lock held.
	1784	*/
	1785	for(;;) {
	1786	/*
	1787	* Pull of of work queue. Once it's off, it can't be cancelled,
	1788	* so we can take our ref once we drop the queue lock.
	1789	*/
	1790	entryp = TAILQ_FIRST(&queue->aioq_entries);
	1791
	1792	/*
	1793	* If there's no work or only fsyncs that need delay, go to sleep
	1794	* and then start anew from aio_work_thread
	1795	*/
	1796	if (entryp == NULL) {
	1797	goto nowork;
	1798	}
	1799
	1800	aio_workq_remove_entry_locked(queue, entryp);
	1801
	1802	aio_workq_unlock(queue);
	1803
	1804	/*
	1805	* Check if it's an fsync that must be delayed. No need to lock the entry;
	1806	* that flag would have been set at initialization.
	1807	*/
	1808	if ( (entryp->flags & AIO_FSYNC) != 0 ) {
	1809	/*
	1810	* Check for unfinished operations on the same file
	1811	* in this proc's queue.
	1812	*/
	1813	aio_proc_lock_spin(entryp->procp);
	1814	if ( aio_delay_fsync_request( entryp ) ) {
	1815	/* It needs to be delayed. Put it back on the end of the work queue */
	1816	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) \| DBG_FUNC_NONE,
	1817	(int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
	1818
	1819	aio_proc_unlock(entryp->procp);
	1820
	1821	aio_workq_lock_spin(queue);
	1822	aio_workq_add_entry_locked(queue, entryp);
	1823	continue;
	1824	}
	1825	aio_proc_unlock(entryp->procp);
	1826	}
	1827
	1828	break;
	1829	}
	1830
	1831	aio_entry_ref(entryp);
	1832
	1833	OSIncrementAtomic(&aio_anchor.aio_inflight_count);
	1834	return( entryp );
	1835
	1836	nowork:
	1837	/* We will wake up when someone enqueues something */
	1838	waitq_assert_wait64(&queue->aioq_waitq, CAST_EVENT64_T(queue), THREAD_UNINT, 0);
	1839	aio_workq_unlock(queue);
	1840	thread_block( (thread_continue_t)aio_work_thread );
	1841
	1842	// notreached
	1843	return NULL;
	1844	}
	1845
	1846	/*
	1847	* aio_delay_fsync_request - look to see if this aio_fsync request should be delayed.
	1848	* A big, simple hammer: only send it off if it's the most recently filed IO which has
	1849	* not been completed.
	1850	*/
	1851	static boolean_t
	1852	aio_delay_fsync_request( aio_workq_entry *entryp )
	1853	{
	1854	if (entryp == TAILQ_FIRST(&entryp->procp->p_aio_activeq)) {
	1855	return FALSE;
	1856	}
	1857
	1858	return TRUE;
	1859	} /* aio_delay_fsync_request */
	1860
	1861	static aio_workq_entry *
	1862	aio_create_queue_entry(proc_t procp, user_addr_t aiocbp, void *group_tag, int kindOfIO)
	1863	{
	1864	aio_workq_entry *entryp;
	1865	int result = 0;
	1866
	1867	entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
	1868	if ( entryp == NULL ) {
	1869	result = EAGAIN;
	1870	goto error_exit;
	1871	}
	1872
	1873	bzero( entryp, sizeof(*entryp) );
	1874
	1875	/* fill in the rest of the aio_workq_entry */
	1876	entryp->procp = procp;
	1877	entryp->uaiocbp = aiocbp;
	1878	entryp->flags \|= kindOfIO;
	1879	entryp->group_tag = group_tag;
	1880	entryp->aio_map = VM_MAP_NULL;
	1881	entryp->aio_refcount = 0;
	1882
	1883	if ( proc_is64bit(procp) ) {
	1884	struct user64_aiocb aiocb64;
	1885
	1886	result = copyin( aiocbp, &aiocb64, sizeof(aiocb64) );
	1887	if (result == 0 )
	1888	do_munge_aiocb_user64_to_user(&aiocb64, &entryp->aiocb);
	1889
	1890	} else {
	1891	struct user32_aiocb aiocb32;
	1892
	1893	result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
	1894	if ( result == 0 )
	1895	do_munge_aiocb_user32_to_user( &aiocb32, &entryp->aiocb );
	1896	}
	1897
	1898	if ( result != 0 ) {
	1899	result = EAGAIN;
	1900	goto error_exit;
	1901	}
	1902
	1903	/* get a reference to the user land map in order to keep it around */
	1904	entryp->aio_map = get_task_map( procp->task );
	1905	vm_map_reference( entryp->aio_map );
	1906
	1907	/* do some more validation on the aiocb and embedded file descriptor */
	1908	result = aio_validate( entryp );
	1909	if ( result != 0 )
	1910	goto error_exit_with_ref;
	1911
	1912	/* get a reference on the current_thread, which is passed in vfs_context. */
	1913	entryp->thread = current_thread();
	1914	thread_reference( entryp->thread );
	1915	return ( entryp );
	1916
	1917	error_exit_with_ref:
	1918	if ( VM_MAP_NULL != entryp->aio_map ) {
	1919	vm_map_deallocate( entryp->aio_map );
	1920	}
	1921	error_exit:
	1922	if ( result && entryp != NULL ) {
	1923	zfree( aio_workq_zonep, entryp );
	1924	entryp = NULL;
	1925	}
	1926
	1927	return ( entryp );
	1928	}
	1929
	1930
	1931	/*
	1932	* aio_queue_async_request - queue up an async IO request on our work queue then
	1933	* wake up one of our worker threads to do the actual work. We get a reference
	1934	* to our caller's user land map in order to keep it around while we are
	1935	* processing the request.
	1936	*/
	1937	static int
	1938	aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO )
	1939	{
	1940	aio_workq_entry *entryp;
	1941	int result;
	1942	int old_count;
	1943	uint32_t *paio_offset;
	1944	uint32_t *paio_nbytes;
	1945
	1946	old_count = aio_increment_total_count();
	1947	if (old_count >= aio_max_requests) {
	1948	result = EAGAIN;
	1949	goto error_noalloc;
	1950	}
	1951
	1952	entryp = aio_create_queue_entry( procp, aiocbp, 0, kindOfIO);
	1953	if ( entryp == NULL ) {
	1954	result = EAGAIN;
	1955	goto error_noalloc;
	1956	}
	1957
	1958
	1959	aio_proc_lock_spin(procp);
	1960
	1961	if ( is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
	1962	result = EAGAIN;
	1963	goto error_exit;
	1964	}
	1965
	1966	/* check our aio limits to throttle bad or rude user land behavior */
	1967	if (aio_get_process_count( procp ) >= aio_max_requests_per_process) {
	1968	printf("aio_queue_async_request(): too many in flight for proc: %d.\n", procp->p_aio_total_count);
	1969	result = EAGAIN;
	1970	goto error_exit;
	1971	}
	1972
	1973	/* Add the IO to proc and work queues, wake up threads as appropriate */
	1974	lck_mtx_convert_spin(aio_proc_mutex(procp));
	1975	aio_enqueue_work(procp, entryp, 1);
	1976
	1977	aio_proc_unlock(procp);
	1978
	1979	paio_offset = (uint32_t*) &entryp->aiocb.aio_offset;
	1980	paio_nbytes = (uint32_t*) &entryp->aiocb.aio_nbytes;
	1981	KERNEL_DEBUG_CONSTANT( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) \| DBG_FUNC_START,
	1982	(int)procp, (int)aiocbp, entryp->flags, entryp->aiocb.aio_fildes, 0 );
	1983	KERNEL_DEBUG_CONSTANT( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) \| DBG_FUNC_END,
	1984	paio_offset[0], (sizeof(entryp->aiocb.aio_offset) == sizeof(uint64_t) ? paio_offset[1] : 0),
	1985	paio_nbytes[0], (sizeof(entryp->aiocb.aio_nbytes) == sizeof(uint64_t) ? paio_nbytes[1] : 0),
	1986	0 );
	1987
	1988	return( 0 );
	1989
	1990	error_exit:
	1991	/*
	1992	* This entry has not been queued up so no worries about
	1993	* unlocked state and aio_map
	1994	*/
	1995	aio_proc_unlock(procp);
	1996	aio_free_request(entryp);
	1997
	1998	error_noalloc:
	1999	aio_decrement_total_count();
	2000
	2001	return( result );
	2002
	2003	} /* aio_queue_async_request */
	2004
	2005
	2006	/*
	2007	* lio_create_entry
	2008	*
	2009	* Allocate an aio_workq_entry and fill it in. If all goes well return 0
	2010	* and pass the aio_workq_entry pointer back to our caller.
	2011	*
	2012	* Parameters: procp The process makign the request
	2013	* aiocbp The aio context buffer pointer
	2014	* group_tag The group tag used to indicate a
	2015	* group of operations has completed
	2016	* entrypp Pointer to the pointer to receive the
	2017	* address of the created aio_workq_entry
	2018	*
	2019	* Returns: 0 Successfully created
	2020	* EAGAIN Try again (usually resource shortage)
	2021	*
	2022	*
	2023	* Notes: We get a reference to our caller's user land map in order
	2024	* to keep it around while we are processing the request.
	2025	*
	2026	* lio_listio calls behave differently at completion they do
	2027	* completion notification when all async IO requests have
	2028	* completed. We use group_tag to tag IO requests that behave
	2029	* in the delay notification manner.
	2030	*
	2031	* All synchronous operations are considered to not have a
	2032	* signal routine associated with them (sigp == USER_ADDR_NULL).
	2033	*/
	2034	static int
	2035	lio_create_entry(proc_t procp, user_addr_t aiocbp, void *group_tag,
	2036	aio_workq_entry **entrypp )
	2037	{
	2038	aio_workq_entry *entryp;
	2039	int result;
	2040
	2041	entryp = aio_create_queue_entry( procp, aiocbp, group_tag, AIO_LIO);
	2042	if ( entryp == NULL ) {
	2043	result = EAGAIN;
	2044	goto error_exit;
	2045	}
	2046
	2047	/*
	2048	* Look for lio_listio LIO_NOP requests and ignore them; this is
	2049	* not really an error, but we need to free our aio_workq_entry.
	2050	*/
	2051	if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
	2052	result = 0;
	2053	goto error_exit;
	2054	}
	2055
	2056	*entrypp = entryp;
	2057	return( 0 );
	2058
	2059	error_exit:
	2060
	2061	if ( entryp != NULL ) {
	2062	/*
	2063	* This entry has not been queued up so no worries about
	2064	* unlocked state and aio_map
	2065	*/
	2066	aio_free_request(entryp);
	2067	}
	2068
	2069	return( result );
	2070
	2071	} /* lio_create_entry */
	2072
	2073
	2074	/*
	2075	* aio_free_request - remove our reference on the user land map and
	2076	* free the work queue entry resources. The entry is off all lists
	2077	* and has zero refcount, so no one can have a pointer to it.
	2078	*/
	2079
	2080	static int
	2081	aio_free_request(aio_workq_entry *entryp)
	2082	{
	2083	/* remove our reference to the user land map. */
	2084	if ( VM_MAP_NULL != entryp->aio_map) {
	2085	vm_map_deallocate(entryp->aio_map);
	2086	}
	2087
	2088	/* remove our reference to thread which enqueued the request */
	2089	if ( NULL != entryp->thread ) {
	2090	thread_deallocate( entryp->thread );
	2091	}
	2092
	2093	entryp->aio_refcount = -1; /* A bit of poisoning in case of bad refcounting. */
	2094
	2095	zfree( aio_workq_zonep, entryp );
	2096
	2097	return( 0 );
	2098
	2099	} /* aio_free_request */
	2100
	2101
	2102	/*
	2103	* aio_validate
	2104	*
	2105	* validate the aiocb passed in by one of the aio syscalls.
	2106	*/
	2107	static int
	2108	aio_validate( aio_workq_entry *entryp )
	2109	{
	2110	struct fileproc *fp;
	2111	int flag;
	2112	int result;
	2113
	2114	result = 0;
	2115
	2116	if ( (entryp->flags & AIO_LIO) != 0 ) {
	2117	if ( entryp->aiocb.aio_lio_opcode == LIO_READ )
	2118	entryp->flags \|= AIO_READ;
	2119	else if ( entryp->aiocb.aio_lio_opcode == LIO_WRITE )
	2120	entryp->flags \|= AIO_WRITE;
	2121	else if ( entryp->aiocb.aio_lio_opcode == LIO_NOP )
	2122	return( 0 );
	2123	else
	2124	return( EINVAL );
	2125	}
	2126
	2127	flag = FREAD;
	2128	if ( (entryp->flags & (AIO_WRITE \| AIO_FSYNC \| AIO_DSYNC)) != 0 ) {
	2129	flag = FWRITE;
	2130	}
	2131
	2132	if ( (entryp->flags & (AIO_READ \| AIO_WRITE)) != 0 ) {
	2133	if ( entryp->aiocb.aio_nbytes > INT_MAX \|\|
	2134	entryp->aiocb.aio_buf == USER_ADDR_NULL \|\|
	2135	entryp->aiocb.aio_offset < 0 )
	2136	return( EINVAL );
	2137	}
	2138
	2139	/*
	2140	* validate aiocb.aio_sigevent. at this point we only support
	2141	* sigev_notify equal to SIGEV_SIGNAL or SIGEV_NONE. this means
	2142	* sigev_value, sigev_notify_function, and sigev_notify_attributes
	2143	* are ignored, since SIGEV_THREAD is unsupported. This is consistent
	2144	* with no [RTS] (RalTime Signal) option group support.
	2145	*/
	2146	switch ( entryp->aiocb.aio_sigevent.sigev_notify ) {
	2147	case SIGEV_SIGNAL:
	2148	{
	2149	int signum;
	2150
	2151	/* make sure we have a valid signal number */
	2152	signum = entryp->aiocb.aio_sigevent.sigev_signo;
	2153	if ( signum <= 0 \|\| signum >= NSIG \|\|
	2154	signum == SIGKILL \|\| signum == SIGSTOP )
	2155	return (EINVAL);
	2156	}
	2157	break;
	2158
	2159	case SIGEV_NONE:
	2160	break;
	2161
	2162	case SIGEV_THREAD:
	2163	/* Unsupported [RTS] */
	2164
	2165	default:
	2166	return (EINVAL);
	2167	}
	2168
	2169	/* validate the file descriptor and that the file was opened
	2170	* for the appropriate read / write access.
	2171	*/
	2172	proc_fdlock(entryp->procp);
	2173
	2174	result = fp_lookup( entryp->procp, entryp->aiocb.aio_fildes, &fp , 1);
	2175	if ( result == 0 ) {
	2176	if ( (fp->f_fglob->fg_flag & flag) == 0 ) {
	2177	/* we don't have read or write access */
	2178	result = EBADF;
	2179	}
	2180	else if ( FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_VNODE ) {
	2181	/* this is not a file */
	2182	result = ESPIPE;
	2183	} else
	2184	fp->f_flags \|= FP_AIOISSUED;
	2185
	2186	fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp , 1);
	2187	}
	2188	else {
	2189	result = EBADF;
	2190	}
	2191
	2192	proc_fdunlock(entryp->procp);
	2193
	2194	return( result );
	2195
	2196	} /* aio_validate */
	2197
	2198	static int
	2199	aio_increment_total_count()
	2200	{
	2201	return OSIncrementAtomic(&aio_anchor.aio_total_count);
	2202	}
	2203
	2204	static int
	2205	aio_decrement_total_count()
	2206	{
	2207	int old = OSDecrementAtomic(&aio_anchor.aio_total_count);
	2208	if (old <= 0) {
	2209	panic("Negative total AIO count!\n");
	2210	}
	2211
	2212	return old;
	2213	}
	2214
	2215	static int
	2216	aio_get_process_count(proc_t procp )
	2217	{
	2218	return procp->p_aio_total_count;
	2219
	2220	} /* aio_get_process_count */
	2221
	2222	static int
	2223	aio_get_all_queues_count( void )
	2224	{
	2225	return aio_anchor.aio_total_count;
	2226
	2227	} /* aio_get_all_queues_count */
	2228
	2229
	2230	/*
	2231	* do_aio_completion. Handle async IO completion.
	2232	*/
	2233	static void
	2234	do_aio_completion( aio_workq_entry *entryp )
	2235	{
	2236
	2237	boolean_t lastLioCompleted = FALSE;
	2238	aio_lio_context *lio_context = NULL;
	2239	int waiter = 0;
	2240
	2241	lio_context = (aio_lio_context *)entryp->group_tag;
	2242
	2243	if (lio_context != NULL) {
	2244
	2245	aio_proc_lock_spin(entryp->procp);
	2246
	2247	/* Account for this I/O completing. */
	2248	lio_context->io_completed++;
	2249
	2250	/* Are we done with this lio context? */
	2251	if (lio_context->io_issued == lio_context->io_completed) {
	2252	lastLioCompleted = TRUE;
	2253	}
	2254
	2255	waiter = lio_context->io_waiter;
	2256
	2257	/* explicit wakeup of lio_listio() waiting in LIO_WAIT */
	2258	if ((entryp->flags & AIO_LIO_NOTIFY) && (lastLioCompleted) && (waiter != 0)) {
	2259	/* wake up the waiter */
	2260	wakeup(lio_context);
	2261	}
	2262
	2263	aio_proc_unlock(entryp->procp);
	2264	}
	2265
	2266	if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
	2267	(entryp->flags & AIO_DISABLE) == 0 ) {
	2268
	2269	boolean_t performSignal = FALSE;
	2270	if (lio_context == NULL) {
	2271	performSignal = TRUE;
	2272	}
	2273	else {
	2274	/*
	2275	* If this was the last request in the group and a signal
	2276	* is desired, send one.
	2277	*/
	2278	performSignal = lastLioCompleted;
	2279	}
	2280
	2281	if (performSignal) {
	2282
	2283	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) \| DBG_FUNC_NONE,
	2284	(int)entryp->procp, (int)entryp->uaiocbp,
	2285	entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 );
	2286
	2287	psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );
	2288	}
	2289	}
	2290
	2291	if ((entryp->flags & AIO_EXIT_WAIT) && (entryp->flags & AIO_CLOSE_WAIT)) {
	2292	panic("Close and exit flags set at the same time\n");
	2293	}
	2294
	2295	/*
	2296	* need to handle case where a process is trying to exit, exec, or
	2297	* close and is currently waiting for active aio requests to complete.
	2298	* If AIO_CLEANUP_WAIT is set then we need to look to see if there are any
	2299	* other requests in the active queue for this process. If there are
	2300	* none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel.
	2301	* If there are some still active then do nothing - we only want to
	2302	* wakeup when all active aio requests for the process are complete.
	2303	*
	2304	* Don't need to lock the entry or proc to check the cleanup flag. It can only be
	2305	* set for cancellation, while the entryp is still on a proc list; now it's
	2306	* off, so that flag is already set if it's going to be.
	2307	*/
	2308	if ( (entryp->flags & AIO_EXIT_WAIT) != 0 ) {
	2309	int active_requests;
	2310
	2311	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) \| DBG_FUNC_NONE,
	2312	(int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
	2313
	2314	aio_proc_lock_spin(entryp->procp);
	2315	active_requests = aio_active_requests_for_process( entryp->procp );
	2316	if ( active_requests < 1 ) {
	2317	/*
	2318	* no active aio requests for this process, continue exiting. In this
	2319	* case, there should be no one else waiting ont he proc in AIO...
	2320	*/
	2321	wakeup_one((caddr_t)&entryp->procp->AIO_CLEANUP_SLEEP_CHAN);
	2322	aio_proc_unlock(entryp->procp);
	2323
	2324	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) \| DBG_FUNC_NONE,
	2325	(int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
	2326	} else {
	2327	aio_proc_unlock(entryp->procp);
	2328	}
	2329	}
	2330
	2331	if ( (entryp->flags & AIO_CLOSE_WAIT) != 0 ) {
	2332	int active_requests;
	2333
	2334	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) \| DBG_FUNC_NONE,
	2335	(int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
	2336
	2337	aio_proc_lock_spin(entryp->procp);
	2338	active_requests = aio_proc_active_requests_for_file( entryp->procp, entryp->aiocb.aio_fildes);
	2339	if ( active_requests < 1 ) {
	2340	/* Can't wakeup_one(); multiple closes might be in progress. */
	2341	wakeup(&entryp->procp->AIO_CLEANUP_SLEEP_CHAN);
	2342	aio_proc_unlock(entryp->procp);
	2343
	2344	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) \| DBG_FUNC_NONE,
	2345	(int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
	2346	} else {
	2347	aio_proc_unlock(entryp->procp);
	2348	}
	2349	}
	2350	/*
	2351	* A thread in aio_suspend() wants to known about completed IOs. If it checked
	2352	* the done list before we moved our AIO there, then it already asserted its wait,
	2353	* and we can wake it up without holding the lock. If it checked the list after
	2354	* we did our move, then it already has seen the AIO that we moved. Herego, we
	2355	* can do our wakeup without holding the lock.
	2356	*/
	2357	wakeup( (caddr_t) &entryp->procp->AIO_SUSPEND_SLEEP_CHAN );
	2358	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) \| DBG_FUNC_NONE,
	2359	(int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
	2360
	2361	/*
	2362	* free the LIO context if the last lio completed and no thread is
	2363	* waiting
	2364	*/
	2365	if (lastLioCompleted && (waiter == 0))
	2366	free_lio_context (lio_context);
	2367
	2368
	2369	} /* do_aio_completion */
	2370
	2371
	2372	/*
	2373	* do_aio_read
	2374	*/
	2375	static int
	2376	do_aio_read( aio_workq_entry *entryp )
	2377	{
	2378	struct fileproc *fp;
	2379	int error;
	2380	struct vfs_context context;
	2381
	2382	if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp , 0)) )
	2383	return(error);
	2384	if ( (fp->f_fglob->fg_flag & FREAD) == 0 ) {
	2385	fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
	2386	return(EBADF);
	2387	}
	2388
	2389	context.vc_thread = entryp->thread; /* XXX */
	2390	context.vc_ucred = fp->f_fglob->fg_cred;
	2391
	2392	error = dofileread(&context, fp,
	2393	entryp->aiocb.aio_buf,
	2394	entryp->aiocb.aio_nbytes,
	2395	entryp->aiocb.aio_offset, FOF_OFFSET,
	2396	&entryp->returnval);
	2397	fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
	2398
	2399	return( error );
	2400
	2401	} /* do_aio_read */
	2402
	2403
	2404	/*
	2405	* do_aio_write
	2406	*/
	2407	static int
	2408	do_aio_write( aio_workq_entry *entryp )
	2409	{
	2410	struct fileproc *fp;
	2411	int error, flags;
	2412	struct vfs_context context;
	2413
	2414	if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp , 0)) )
	2415	return(error);
	2416	if ( (fp->f_fglob->fg_flag & FWRITE) == 0 ) {
	2417	fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
	2418	return(EBADF);
	2419	}
	2420
	2421	flags = FOF_PCRED;
	2422	if ( (fp->f_fglob->fg_flag & O_APPEND) == 0 ) {
	2423	flags \|= FOF_OFFSET;
	2424	}
	2425
	2426	context.vc_thread = entryp->thread; /* XXX */
	2427	context.vc_ucred = fp->f_fglob->fg_cred;
	2428
	2429	/* NB: tell dofilewrite the offset, and to use the proc cred */
	2430	error = dofilewrite(&context,
	2431	fp,
	2432	entryp->aiocb.aio_buf,
	2433	entryp->aiocb.aio_nbytes,
	2434	entryp->aiocb.aio_offset,
	2435	flags,
	2436	&entryp->returnval);
	2437
	2438	if (entryp->returnval)
	2439	fp_drop_written(entryp->procp, entryp->aiocb.aio_fildes, fp);
	2440	else
	2441	fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
	2442
	2443	return( error );
	2444
	2445	} /* do_aio_write */
	2446
	2447
	2448	/*
	2449	* aio_active_requests_for_process - return number of active async IO
	2450	* requests for the given process.
	2451	*/
	2452	static int
	2453	aio_active_requests_for_process(proc_t procp )
	2454	{
	2455	return( procp->p_aio_active_count );
	2456
	2457	} /* aio_active_requests_for_process */
	2458
	2459	/*
	2460	* Called with the proc locked.
	2461	*/
	2462	static int
	2463	aio_proc_active_requests_for_file(proc_t procp, int fd)
	2464	{
	2465	int count = 0;
	2466	aio_workq_entry *entryp;
	2467	TAILQ_FOREACH(entryp, &procp->p_aio_activeq, aio_proc_link) {
	2468	if (entryp->aiocb.aio_fildes == fd) {
	2469	count++;
	2470	}
	2471	}
	2472
	2473	return count;
	2474	} /* aio_active_requests_for_process */
	2475
	2476
	2477
	2478	/*
	2479	* do_aio_fsync
	2480	*/
	2481	static int
	2482	do_aio_fsync( aio_workq_entry *entryp )
	2483	{
	2484	struct vfs_context context;
	2485	struct vnode *vp;
	2486	struct fileproc *fp;
	2487	int sync_flag;
	2488	int error;
	2489
	2490	/*
	2491	* We are never called unless either AIO_FSYNC or AIO_DSYNC are set.
	2492	*
	2493	* If AIO_DSYNC is set, we can tell the lower layers that it is OK
	2494	* to mark for update the metadata not strictly necessary for data
	2495	* retrieval, rather than forcing it to disk.
	2496	*
	2497	* If AIO_FSYNC is set, we have to also wait for metadata not really
	2498	* necessary to data retrival are committed to stable storage (e.g.
	2499	* atime, mtime, ctime, etc.).
	2500	*
	2501	* Metadata necessary for data retrieval ust be committed to stable
	2502	* storage in either case (file length, etc.).
	2503	*/
	2504	if (entryp->flags & AIO_FSYNC)
	2505	sync_flag = MNT_WAIT;
	2506	else
	2507	sync_flag = MNT_DWAIT;
	2508
	2509	error = fp_getfvp( entryp->procp, entryp->aiocb.aio_fildes, &fp, &vp);
	2510	if ( error == 0 ) {
	2511	if ( (error = vnode_getwithref(vp)) ) {
	2512	fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
	2513	entryp->returnval = -1;
	2514	return(error);
	2515	}
	2516	context.vc_thread = current_thread();
	2517	context.vc_ucred = fp->f_fglob->fg_cred;
	2518
	2519	error = VNOP_FSYNC( vp, sync_flag, &context);
	2520
	2521	(void)vnode_put(vp);
	2522
	2523	fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
	2524	}
	2525	if ( error != 0 )
	2526	entryp->returnval = -1;
	2527
	2528	return( error );
	2529
	2530	} /* do_aio_fsync */
	2531
	2532
	2533	/*
	2534	* is_already_queued - runs through our queues to see if the given
	2535	* aiocbp / process is there. Returns TRUE if there is a match
	2536	* on any of our aio queues.
	2537	*
	2538	* Called with proc aio lock held (can be held spin)
	2539	*/
	2540	static boolean_t
	2541	is_already_queued(proc_t procp,
	2542	user_addr_t aiocbp )
	2543	{
	2544	aio_workq_entry *entryp;
	2545	boolean_t result;
	2546
	2547	result = FALSE;
	2548
	2549	/* look for matches on our queue of async IO requests that have completed */
	2550	TAILQ_FOREACH( entryp, &procp->p_aio_doneq, aio_proc_link ) {
	2551	if ( aiocbp == entryp->uaiocbp ) {
	2552	result = TRUE;
	2553	goto ExitThisRoutine;
	2554	}
	2555	}
	2556
	2557	/* look for matches on our queue of active async IO requests */
	2558	TAILQ_FOREACH( entryp, &procp->p_aio_activeq, aio_proc_link ) {
	2559	if ( aiocbp == entryp->uaiocbp ) {
	2560	result = TRUE;
	2561	goto ExitThisRoutine;
	2562	}
	2563	}
	2564
	2565	ExitThisRoutine:
	2566	return( result );
	2567
	2568	} /* is_already_queued */
	2569
	2570
	2571	static void
	2572	free_lio_context(aio_lio_context* context)
	2573	{
	2574
	2575	#if DEBUG
	2576	OSDecrementAtomic(&lio_contexts_alloced);
	2577	#endif /* DEBUG */
	2578
	2579	FREE( context, M_TEMP );
	2580
	2581	} /* free_lio_context */
	2582
	2583
	2584	/*
	2585	* aio initialization
	2586	*/
	2587	__private_extern__ void
	2588	aio_init( void )
	2589	{
	2590	int i;
	2591
	2592	aio_lock_grp_attr = lck_grp_attr_alloc_init();
	2593	aio_proc_lock_grp = lck_grp_alloc_init("aio_proc", aio_lock_grp_attr);;
	2594	aio_entry_lock_grp = lck_grp_alloc_init("aio_entry", aio_lock_grp_attr);;
	2595	aio_queue_lock_grp = lck_grp_alloc_init("aio_queue", aio_lock_grp_attr);;
	2596	aio_lock_attr = lck_attr_alloc_init();
	2597
	2598	lck_mtx_init(&aio_entry_mtx, aio_entry_lock_grp, aio_lock_attr);
	2599	lck_mtx_init(&aio_proc_mtx, aio_proc_lock_grp, aio_lock_attr);
	2600
	2601	aio_anchor.aio_inflight_count = 0;
	2602	aio_anchor.aio_done_count = 0;
	2603	aio_anchor.aio_total_count = 0;
	2604	aio_anchor.aio_num_workqs = AIO_NUM_WORK_QUEUES;
	2605
	2606	for (i = 0; i < AIO_NUM_WORK_QUEUES; i++) {
	2607	aio_workq_init(&aio_anchor.aio_async_workqs[i]);
	2608	}
	2609
	2610
	2611	i = sizeof( aio_workq_entry );
	2612	aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );
	2613
	2614	_aio_create_worker_threads( aio_worker_threads );
	2615
	2616	} /* aio_init */
	2617
	2618
	2619	/*
	2620	* aio worker threads created here.
	2621	*/
	2622	__private_extern__ void
	2623	_aio_create_worker_threads( int num )
	2624	{
	2625	int i;
	2626
	2627	/* create some worker threads to handle the async IO requests */
	2628	for ( i = 0; i < num; i++ ) {
	2629	thread_t myThread;
	2630
	2631	if ( KERN_SUCCESS != kernel_thread_start((thread_continue_t)aio_work_thread, NULL, &myThread) ) {
	2632	printf( "%s - failed to create a work thread \n", __FUNCTION__ );
	2633	}
	2634	else
	2635	thread_deallocate(myThread);
	2636	}
	2637
	2638	return;
	2639
	2640	} /* _aio_create_worker_threads */
	2641
	2642	/*
	2643	* Return the current activation utask
	2644	*/
	2645	task_t
	2646	get_aiotask(void)
	2647	{
	2648	return ((struct uthread *)get_bsdthread_info(current_thread()))->uu_aio_task;
	2649	}
	2650
	2651
	2652	/*
	2653	* In the case of an aiocb from a
	2654	* 32-bit process we need to expand some longs and pointers to the correct
	2655	* sizes in order to let downstream code always work on the same type of
	2656	* aiocb (in our case that is a user_aiocb)
	2657	*/
	2658	static void
	2659	do_munge_aiocb_user32_to_user( struct user32_aiocb my_aiocbp, struct user_aiocb the_user_aiocbp )
	2660	{
	2661	the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
	2662	the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
	2663	the_user_aiocbp->aio_buf = CAST_USER_ADDR_T(my_aiocbp->aio_buf);
	2664	the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
	2665	the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
	2666	the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
	2667
	2668	/* special case here. since we do not know if sigev_value is an */
	2669	/* int or a ptr we do NOT cast the ptr to a user_addr_t. This */
	2670	/* means if we send this info back to user space we need to remember */
	2671	/* sigev_value was not expanded for the 32-bit case. */
	2672	/* NOTE - this does NOT affect us since we don't support sigev_value */
	2673	/* yet in the aio context. */
	2674	//LP64
	2675	the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
	2676	the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
	2677	the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
	2678	my_aiocbp->aio_sigevent.sigev_value.sival_int;
	2679	the_user_aiocbp->aio_sigevent.sigev_notify_function =
	2680	CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function);
	2681	the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
	2682	CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_attributes);
	2683	}
	2684
	2685	/* Similar for 64-bit user process, so that we don't need to satisfy
	2686	* the alignment constraints of the original user64_aiocb
	2687	*/
	2688	static void
	2689	do_munge_aiocb_user64_to_user( struct user64_aiocb my_aiocbp, struct user_aiocb the_user_aiocbp )
	2690	{
	2691	the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
	2692	the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
	2693	the_user_aiocbp->aio_buf = my_aiocbp->aio_buf;
	2694	the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
	2695	the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
	2696	the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
	2697
	2698	the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
	2699	the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
	2700	the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
	2701	my_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int;
	2702	the_user_aiocbp->aio_sigevent.sigev_notify_function =
	2703	my_aiocbp->aio_sigevent.sigev_notify_function;
	2704	the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
	2705	my_aiocbp->aio_sigevent.sigev_notify_attributes;
	2706	}