[apple/xnu.git] / bsd / kern / kern_aio.c (xnu-7195.101.1)
1/*
2 * Copyright (c) 2003-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29
30/*
31 * todo:
32 * 1) ramesh is looking into how to replace taking a reference on
33 * the user's map (vm_map_reference()) since it is believed that
34 * would not hold the process for us.
35 * 2) david is looking into a way for us to set the priority of the
36 * worker threads to match that of the user's thread when the
37 * async IO was queued.
38 */
39
40
41/*
42 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
43 */
44
45#include <sys/systm.h>
46#include <sys/fcntl.h>
47#include <sys/file_internal.h>
48#include <sys/filedesc.h>
49#include <sys/kernel.h>
50#include <sys/vnode_internal.h>
51#include <sys/malloc.h>
52#include <sys/mount_internal.h>
53#include <sys/param.h>
54#include <sys/proc_internal.h>
55#include <sys/sysctl.h>
56#include <sys/unistd.h>
57#include <sys/user.h>
58
59#include <sys/aio_kern.h>
60#include <sys/sysproto.h>
61
62#include <machine/limits.h>
63
64#include <mach/mach_types.h>
65#include <kern/kern_types.h>
66#include <kern/waitq.h>
67#include <kern/zalloc.h>
68#include <kern/task.h>
69#include <kern/sched_prim.h>
70
71#include <vm/vm_map.h>
72
73#include <os/refcnt.h>
74
75#include <sys/kdebug.h>
76#define AIO_work_queued 1
77#define AIO_worker_wake 2
78#define AIO_completion_sig 3
79#define AIO_completion_cleanup_wait 4
80#define AIO_completion_cleanup_wake 5
81#define AIO_completion_suspend_wake 6
82#define AIO_fsync_delay 7
83#define AIO_cancel 10
84#define AIO_cancel_async_workq 11
85#define AIO_cancel_sync_workq 12
86#define AIO_cancel_activeq 13
87#define AIO_cancel_doneq 14
88#define AIO_fsync 20
89#define AIO_read 30
90#define AIO_write 40
91#define AIO_listio 50
92#define AIO_error 60
93#define AIO_error_val 61
94#define AIO_error_activeq 62
95#define AIO_error_workq 63
96#define AIO_return 70
97#define AIO_return_val 71
98#define AIO_return_activeq 72
99#define AIO_return_workq 73
100#define AIO_exec 80
101#define AIO_exit 90
102#define AIO_exit_sleep 91
103#define AIO_close 100
104#define AIO_close_sleep 101
105#define AIO_suspend 110
106#define AIO_suspend_sleep 111
107#define AIO_worker_thread 120
108
109__options_decl(aio_entry_flags_t, uint32_t, {
110 AIO_READ = 0x00000001, /* a read */
111 AIO_WRITE = 0x00000002, /* a write */
112 AIO_FSYNC = 0x00000004, /* aio_fsync with op = O_SYNC */
113 AIO_DSYNC = 0x00000008, /* aio_fsync with op = O_DSYNC (not supported yet) */
114 AIO_LIO = 0x00000010, /* lio_listio generated IO */
115 AIO_LIO_WAIT = 0x00000020, /* lio_listio is waiting on the leader */
116
117 /*
118 * These flags mean that this entry is blocking either:
119 * - close (AIO_CLOSE_WAIT)
120 * - exit or exec (AIO_EXIT_WAIT)
121 *
122 * These flags are mutually exclusive, and the AIO_EXIT_WAIT variant
123 * will also neuter notifications in do_aio_completion_and_unlock().
124 */
125 AIO_CLOSE_WAIT = 0x00004000,
126 AIO_EXIT_WAIT = 0x00008000,
127});
128
129/*! @struct aio_workq_entry
130 *
131 * @discussion
132 * This represents a piece of aio/lio work.
133 *
134 * The ownership rules go as follows:
135 *
136 * - the "proc" owns one refcount on the entry (from creation), while it is
137 * enqueued on the aio_activeq and then the aio_doneq.
138 *
139 * either aio_return() (user read the status) or _aio_exit() (the process
140 * died) will dequeue the entry and consume this ref.
141 *
142 * - the async workqueue owns one refcount once the work is submitted,
143 * which is consumed in do_aio_completion_and_unlock().
144 *
 145 * This ref protects the entry until the end of
146 * do_aio_completion_and_unlock() (when signal delivery happens).
147 *
148 * - lio_listio() for batches picks one of the entries to be the "leader"
149 * of the batch. Each work item will have a refcount on its leader
150 * so that the accounting of the batch completion can be done on the leader
151 * (to be able to decrement lio_pending).
152 *
153 * This ref is consumed in do_aio_completion_and_unlock() as well.
154 *
155 * - lastly, in lio_listio() when the LIO_WAIT behavior is requested,
156 * an extra ref is taken in this syscall as it needs to keep accessing
157 * the leader "lio_pending" field until it hits 0.
158 */
159struct aio_workq_entry {
160 /* queue lock */
161 TAILQ_ENTRY(aio_workq_entry) aio_workq_link;
162
163 /* Proc lock */
164 TAILQ_ENTRY(aio_workq_entry) aio_proc_link; /* p_aio_activeq or p_aio_doneq */
165 user_ssize_t returnval; /* return value from read / write request */
166 errno_t errorval; /* error value from read / write request */
167 os_refcnt_t aio_refcount;
168 aio_entry_flags_t flags;
169
170 int lio_pending; /* pending I/Os in lio group, only on leader */
171 struct aio_workq_entry *lio_leader; /* pointer to the lio leader, can be self */
172
173 /* Initialized and never changed, safe to access */
174 struct proc *procp; /* user proc that queued this request */
175 user_addr_t uaiocbp; /* pointer passed in from user land */
176 struct user_aiocb aiocb; /* copy of aiocb from user land */
177 thread_t thread; /* thread that queued this request */
178
179 /* Initialized, and possibly freed by aio_work_thread() or at free if cancelled */
180 vm_map_t aio_map; /* user land map we have a reference to */
181};
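/*
 * Illustrative walk-through (editorial sketch derived from the ownership
 * rules above; not normative): the references held for a plain aio_read()
 * entry over its lifetime.
 *
 *	aio_create_queue_entry()         refcount == 1  (the "proc" ref)
 *	aio_try_enqueue_work_locked()    refcount == 2  (+1 for the async workq)
 *	do_aio_completion_and_unlock()   refcount == 1  (workq ref consumed)
 *	aio_return() or _aio_exit()      refcount == 0  (entry freed)
 *
 * For an lio_listio() batch, each enqueued entry additionally holds a ref on
 * its leader (possibly itself), and lio_listio(LIO_WAIT) takes one more ref
 * on the leader while it sleeps on lio_pending.
 */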
182
183/*
184 * aio requests queue up on the aio_async_workq or lio_sync_workq (for
185 * lio_listio LIO_WAIT). Requests then move to the per process aio_activeq
 186 * (proc.aio_activeq) when one of our worker threads starts the IO.
 187 * And finally, requests move to the per process aio_doneq (proc.aio_doneq)
 188 * when the IO request completes. The request remains on aio_doneq until the
 189 * user process calls aio_return() or the process exits; either way, that is
 190 * our trigger to release aio resources.
191 */
192typedef struct aio_workq {
193 TAILQ_HEAD(, aio_workq_entry) aioq_entries;
194 lck_spin_t aioq_lock;
195 struct waitq aioq_waitq;
196} *aio_workq_t;
197
198#define AIO_NUM_WORK_QUEUES 1
199struct aio_anchor_cb {
200 os_atomic(int) aio_total_count; /* total extant entries */
201
202 /* Hash table of queues here */
203 int aio_num_workqs;
204 struct aio_workq aio_async_workqs[AIO_NUM_WORK_QUEUES];
205};
206typedef struct aio_anchor_cb aio_anchor_cb;
207
208/*
209 * Notes on aio sleep / wake channels.
 210 * We currently pick a couple of fields within the proc structure that give us
 211 * sleep channels that do not collide with any other kernel routines.
212 * At this time, for binary compatibility reasons, we cannot create new proc fields.
213 */
214#define AIO_SUSPEND_SLEEP_CHAN p_aio_activeq
215#define AIO_CLEANUP_SLEEP_CHAN p_aio_total_count
216
217#define ASSERT_AIO_FROM_PROC(aiop, theproc) \
218 if ((aiop)->procp != (theproc)) { \
219 panic("AIO on a proc list that does not belong to that proc.\n"); \
220 }
221
222/*
223 * LOCAL PROTOTYPES
224 */
225static void aio_proc_lock(proc_t procp);
226static void aio_proc_lock_spin(proc_t procp);
227static void aio_proc_unlock(proc_t procp);
228static lck_mtx_t *aio_proc_mutex(proc_t procp);
229static bool aio_has_active_requests_for_process(proc_t procp);
230static bool aio_proc_has_active_requests_for_file(proc_t procp, int fd);
231static boolean_t is_already_queued(proc_t procp, user_addr_t aiocbp);
232
233static aio_workq_t aio_entry_workq(aio_workq_entry *entryp);
234static void aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
235static void aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
236static void aio_entry_ref(aio_workq_entry *entryp);
237static void aio_entry_unref(aio_workq_entry *entryp);
238static bool aio_entry_try_workq_remove(aio_workq_entry *entryp);
239static boolean_t aio_delay_fsync_request(aio_workq_entry *entryp);
240static void aio_free_request(aio_workq_entry *entryp);
241
242static void aio_workq_init(aio_workq_t wq);
243static void aio_workq_lock_spin(aio_workq_t wq);
244static void aio_workq_unlock(aio_workq_t wq);
245static lck_spin_t *aio_workq_lock(aio_workq_t wq);
246
247static void aio_work_thread(void *arg, wait_result_t wr);
248static aio_workq_entry *aio_get_some_work(void);
249
250static int aio_queue_async_request(proc_t procp, user_addr_t aiocbp, aio_entry_flags_t);
251static int aio_validate(proc_t, aio_workq_entry *entryp);
252
253static int do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp, aio_entry_flags_t);
254static void do_aio_completion_and_unlock(proc_t p, aio_workq_entry *entryp);
255static int do_aio_fsync(aio_workq_entry *entryp);
256static int do_aio_read(aio_workq_entry *entryp);
257static int do_aio_write(aio_workq_entry *entryp);
258static void do_munge_aiocb_user32_to_user(struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp);
259static void do_munge_aiocb_user64_to_user(struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp);
260static aio_workq_entry *aio_create_queue_entry(proc_t procp, user_addr_t aiocbp, aio_entry_flags_t);
261static int aio_copy_in_list(proc_t, user_addr_t, user_addr_t *, int);
262
263#define ASSERT_AIO_PROC_LOCK_OWNED(p) LCK_MTX_ASSERT(aio_proc_mutex(p), LCK_MTX_ASSERT_OWNED)
264#define ASSERT_AIO_WORKQ_LOCK_OWNED(q) LCK_SPIN_ASSERT(aio_workq_lock(q), LCK_ASSERT_OWNED)
265
266/*
267 * EXTERNAL PROTOTYPES
268 */
269
270/* in ...bsd/kern/sys_generic.c */
271extern int dofileread(vfs_context_t ctx, struct fileproc *fp,
272 user_addr_t bufp, user_size_t nbyte,
273 off_t offset, int flags, user_ssize_t *retval);
274extern int dofilewrite(vfs_context_t ctx, struct fileproc *fp,
275 user_addr_t bufp, user_size_t nbyte, off_t offset,
276 int flags, user_ssize_t *retval);
277
278/*
279 * aio external global variables.
280 */
281extern int aio_max_requests; /* AIO_MAX - configurable */
282extern int aio_max_requests_per_process; /* AIO_PROCESS_MAX - configurable */
283extern int aio_worker_threads; /* AIO_THREAD_COUNT - configurable */
284
285
286/*
287 * aio static variables.
288 */
289static aio_anchor_cb aio_anchor = {
290 .aio_num_workqs = AIO_NUM_WORK_QUEUES,
291};
292os_refgrp_decl(static, aio_refgrp, "aio", NULL);
293static LCK_GRP_DECLARE(aio_proc_lock_grp, "aio_proc");
294static LCK_GRP_DECLARE(aio_queue_lock_grp, "aio_queue");
295static LCK_MTX_DECLARE(aio_proc_mtx, &aio_proc_lock_grp);
296
297static ZONE_DECLARE(aio_workq_zonep, "aiowq", sizeof(aio_workq_entry),
298 ZC_ZFREE_CLEARMEM);
299
300/* Hash */
301static aio_workq_t
302aio_entry_workq(__unused aio_workq_entry *entryp)
303{
304 return &aio_anchor.aio_async_workqs[0];
305}
306
307static void
308aio_workq_init(aio_workq_t wq)
309{
310 TAILQ_INIT(&wq->aioq_entries);
311 lck_spin_init(&wq->aioq_lock, &aio_queue_lock_grp, LCK_ATTR_NULL);
312 waitq_init(&wq->aioq_waitq, SYNC_POLICY_FIFO);
313}
314
315
316/*
317 * Can be passed a queue which is locked spin.
318 */
319static void
320aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
321{
322 ASSERT_AIO_WORKQ_LOCK_OWNED(queue);
323
324 if (entryp->aio_workq_link.tqe_prev == NULL) {
325 panic("Trying to remove an entry from a work queue, but it is not on a queue\n");
326 }
327
328 TAILQ_REMOVE(&queue->aioq_entries, entryp, aio_workq_link);
329 entryp->aio_workq_link.tqe_prev = NULL; /* Not on a workq */
330}
331
332static void
333aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
334{
335 ASSERT_AIO_WORKQ_LOCK_OWNED(queue);
336
337 TAILQ_INSERT_TAIL(&queue->aioq_entries, entryp, aio_workq_link);
338}
339
340static void
341aio_proc_lock(proc_t procp)
342{
343 lck_mtx_lock(aio_proc_mutex(procp));
344}
345
346static void
347aio_proc_lock_spin(proc_t procp)
348{
349 lck_mtx_lock_spin(aio_proc_mutex(procp));
350}
351
352static bool
353aio_has_any_work(void)
354{
355 return os_atomic_load(&aio_anchor.aio_total_count, relaxed) != 0;
356}
357
358static bool
359aio_try_proc_insert_active_locked(proc_t procp, aio_workq_entry *entryp)
360{
361 int old, new;
362
363 ASSERT_AIO_PROC_LOCK_OWNED(procp);
364
365 if (procp->p_aio_total_count >= aio_max_requests_per_process) {
366 return false;
367 }
368
369 if (is_already_queued(procp, entryp->uaiocbp)) {
370 return false;
371 }
372
373 os_atomic_rmw_loop(&aio_anchor.aio_total_count, old, new, relaxed, {
374 if (old >= aio_max_requests) {
375 os_atomic_rmw_loop_give_up(return false);
376 }
377 new = old + 1;
378 });
379
380 TAILQ_INSERT_TAIL(&procp->p_aio_activeq, entryp, aio_proc_link);
381 procp->p_aio_total_count++;
382 return true;
383}
384
385static void
386aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp)
387{
388 TAILQ_REMOVE(&procp->p_aio_activeq, entryp, aio_proc_link);
389 TAILQ_INSERT_TAIL(&procp->p_aio_doneq, entryp, aio_proc_link);
390}
391
392static void
393aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp)
394{
395 TAILQ_REMOVE(&procp->p_aio_doneq, entryp, aio_proc_link);
396 entryp->aio_proc_link.tqe_prev = NULL;
397 if (os_atomic_dec_orig(&aio_anchor.aio_total_count, relaxed) <= 0) {
398 panic("Negative total AIO count!\n");
399 }
400 if (procp->p_aio_total_count-- <= 0) {
401 panic("proc %p: p_aio_total_count accounting mismatch", procp);
402 }
403}
404
405static void
406aio_proc_unlock(proc_t procp)
407{
408 lck_mtx_unlock(aio_proc_mutex(procp));
409}
410
411static lck_mtx_t*
412aio_proc_mutex(proc_t procp)
413{
414 return &procp->p_mlock;
415}
416
417static void
418aio_entry_ref(aio_workq_entry *entryp)
419{
420 os_ref_retain(&entryp->aio_refcount);
421}
422
423static void
424aio_entry_unref(aio_workq_entry *entryp)
425{
426 if (os_ref_release(&entryp->aio_refcount) == 0) {
427 aio_free_request(entryp);
428 }
429}
430
431static bool
432aio_entry_try_workq_remove(aio_workq_entry *entryp)
433{
434 /* Can only be cancelled if it's still on a work queue */
435 if (entryp->aio_workq_link.tqe_prev != NULL) {
436 aio_workq_t queue;
437
438 /* Will have to check again under the lock */
439 queue = aio_entry_workq(entryp);
440 aio_workq_lock_spin(queue);
441 if (entryp->aio_workq_link.tqe_prev != NULL) {
442 aio_workq_remove_entry_locked(queue, entryp);
443 aio_workq_unlock(queue);
444 return true;
445 } else {
446 aio_workq_unlock(queue);
447 }
448 }
449
450 return false;
451}
452
453static void
454aio_workq_lock_spin(aio_workq_t wq)
455{
456 lck_spin_lock(aio_workq_lock(wq));
457}
458
459static void
460aio_workq_unlock(aio_workq_t wq)
461{
462 lck_spin_unlock(aio_workq_lock(wq));
463}
464
465static lck_spin_t*
466aio_workq_lock(aio_workq_t wq)
467{
468 return &wq->aioq_lock;
469}
470
471/*
472 * aio_cancel - attempt to cancel one or more async IO requests currently
473 * outstanding against file descriptor uap->fd. If uap->aiocbp is not
474 * NULL then only one specific IO is cancelled (if possible). If uap->aiocbp
475 * is NULL then all outstanding async IO request for the given file
476 * descriptor are cancelled (if possible).
477 */
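/*
 * Illustrative user-space usage (editorial sketch, not part of this file;
 * the helper name drain_fd_aio and the aiocb "cb" are hypothetical): cancel
 * everything outstanding on a descriptor, then drain a request that could
 * not be cancelled.
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *
 *	int
 *	drain_fd_aio(int fd, struct aiocb *cb)
 *	{
 *		switch (aio_cancel(fd, NULL)) {
 *		case AIO_CANCELED:      // every queued request was cancelled
 *		case AIO_ALLDONE:       // nothing was left to cancel
 *			break;
 *		case AIO_NOTCANCELED:   // some requests are still in flight
 *			while (aio_error(cb) == EINPROGRESS) {
 *				;       // poll (or use aio_suspend)
 *			}
 *			break;
 *		default:
 *			return -1;      // EBADF and friends land here
 *		}
 *		return (int)aio_return(cb);     // release the kernel entry
 *	}
 */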
478int
479aio_cancel(proc_t p, struct aio_cancel_args *uap, int *retval)
480{
481 struct user_aiocb my_aiocb;
482 int result;
483
484 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel) | DBG_FUNC_START,
485 VM_KERNEL_ADDRPERM(p), uap->aiocbp, 0, 0, 0);
486
487 /* quick check to see if there are any async IO requests queued up */
488 if (!aio_has_any_work()) {
489 result = 0;
490 *retval = AIO_ALLDONE;
491 goto ExitRoutine;
492 }
493
494 *retval = -1;
495 if (uap->aiocbp != USER_ADDR_NULL) {
496 if (proc_is64bit(p)) {
497 struct user64_aiocb aiocb64;
498
499 result = copyin(uap->aiocbp, &aiocb64, sizeof(aiocb64));
500 if (result == 0) {
501 do_munge_aiocb_user64_to_user(&aiocb64, &my_aiocb);
502 }
503 } else {
504 struct user32_aiocb aiocb32;
505
506 result = copyin(uap->aiocbp, &aiocb32, sizeof(aiocb32));
507 if (result == 0) {
508 do_munge_aiocb_user32_to_user(&aiocb32, &my_aiocb);
509 }
510 }
511
512 if (result != 0) {
513 result = EAGAIN;
514 goto ExitRoutine;
515 }
516
517 /* NOTE - POSIX standard says a mismatch between the file */
518 /* descriptor passed in and the file descriptor embedded in */
519 /* the aiocb causes unspecified results. We return EBADF in */
520 /* that situation. */
521 if (uap->fd != my_aiocb.aio_fildes) {
522 result = EBADF;
523 goto ExitRoutine;
524 }
525 }
526
527 aio_proc_lock(p);
528 result = do_aio_cancel_locked(p, uap->fd, uap->aiocbp, 0);
529 ASSERT_AIO_PROC_LOCK_OWNED(p);
530 aio_proc_unlock(p);
531
532 if (result != -1) {
533 *retval = result;
534 result = 0;
535 goto ExitRoutine;
536 }
537
538 result = EBADF;
539
540ExitRoutine:
541 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel) | DBG_FUNC_END,
542 VM_KERNEL_ADDRPERM(p), uap->aiocbp, result, 0, 0);
543
544 return result;
545}
546
547
548/*
549 * _aio_close - internal function used to clean up async IO requests for
550 * a file descriptor that is closing.
551 * THIS MAY BLOCK.
552 */
553__private_extern__ void
554_aio_close(proc_t p, int fd)
555{
556 int error;
557
558 /* quick check to see if there are any async IO requests queued up */
559 if (!aio_has_any_work()) {
560 return;
561 }
562
563 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_close) | DBG_FUNC_START,
564 VM_KERNEL_ADDRPERM(p), fd, 0, 0, 0);
565
566 /* cancel all async IO requests on our todo queues for this file descriptor */
567 aio_proc_lock(p);
568 error = do_aio_cancel_locked(p, fd, USER_ADDR_NULL, AIO_CLOSE_WAIT);
569 ASSERT_AIO_PROC_LOCK_OWNED(p);
570 if (error == AIO_NOTCANCELED) {
571 /*
572 * AIO_NOTCANCELED is returned when we find an aio request for this process
573 * and file descriptor on the active async IO queue. Active requests cannot
574 * be cancelled so we must wait for them to complete. We will get a special
575 * wake up call on our channel used to sleep for ALL active requests to
576 * complete. This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
577 * when we must wait for all active aio requests.
578 */
579
580 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep) | DBG_FUNC_NONE,
581 VM_KERNEL_ADDRPERM(p), fd, 0, 0, 0);
582
583 while (aio_proc_has_active_requests_for_file(p, fd)) {
584 msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_close", 0);
585 }
586 }
587
588 aio_proc_unlock(p);
589
590 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_close) | DBG_FUNC_END,
591 VM_KERNEL_ADDRPERM(p), fd, 0, 0, 0);
592}
593
594
595/*
596 * aio_error - return the error status associated with the async IO
597 * request referred to by uap->aiocbp. The error status is the errno
 598 * value that would be set by the corresponding IO request (read, write,
599 * fdatasync, or sync).
600 */
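/*
 * Illustrative user-space usage (editorial sketch, not part of this file;
 * the helper name aio_wait_done is hypothetical): the poll-then-reap pattern
 * built on aio_error() and aio_return(). A real program would normally sleep,
 * use aio_suspend(), or request a completion signal instead of spinning.
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *
 *	ssize_t
 *	aio_wait_done(struct aiocb *cb)
 *	{
 *		int err;
 *
 *		while ((err = aio_error(cb)) == EINPROGRESS) {
 *			;                       // still on the active queue
 *		}
 *		if (err != 0) {
 *			errno = err;            // the async request failed
 *			return -1;
 *		}
 *		return aio_return(cb);          // frees the kernel-side entry
 *	}
 */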
601int
602aio_error(proc_t p, struct aio_error_args *uap, int *retval)
603{
604 aio_workq_entry *entryp;
605 int error;
606
607 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_error) | DBG_FUNC_START,
608 VM_KERNEL_ADDRPERM(p), uap->aiocbp, 0, 0, 0);
609
610 /* see if there are any aios to check */
611 if (!aio_has_any_work()) {
612 return EINVAL;
613 }
614
615 aio_proc_lock(p);
616
617 /* look for a match on our queue of async IO requests that have completed */
618 TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
619 if (entryp->uaiocbp == uap->aiocbp) {
620 ASSERT_AIO_FROM_PROC(entryp, p);
621
622 *retval = entryp->errorval;
623 error = 0;
624
625 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val) | DBG_FUNC_NONE,
626 VM_KERNEL_ADDRPERM(p), uap->aiocbp, *retval, 0, 0);
627 goto ExitRoutine;
628 }
629 }
630
631 /* look for a match on our queue of active async IO requests */
632 TAILQ_FOREACH(entryp, &p->p_aio_activeq, aio_proc_link) {
633 if (entryp->uaiocbp == uap->aiocbp) {
634 ASSERT_AIO_FROM_PROC(entryp, p);
635 *retval = EINPROGRESS;
636 error = 0;
637 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq) | DBG_FUNC_NONE,
638 VM_KERNEL_ADDRPERM(p), uap->aiocbp, *retval, 0, 0);
639 goto ExitRoutine;
640 }
641 }
642
643 error = EINVAL;
644
645ExitRoutine:
646 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_error) | DBG_FUNC_END,
647 VM_KERNEL_ADDRPERM(p), uap->aiocbp, error, 0, 0);
648 aio_proc_unlock(p);
649
650 return error;
651}
652
653
654/*
655 * aio_fsync - asynchronously force all IO operations associated
656 * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
657 * queued at the time of the call to the synchronized completion state.
658 * NOTE - we do not support op O_DSYNC at this point since we do not support the
659 * fdatasync() call.
660 */
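/*
 * Illustrative user-space usage (editorial sketch, not part of this file;
 * the helper name queue_fsync is hypothetical): queue an asynchronous fsync
 * of all IO previously submitted on a descriptor. The aiocb must stay valid
 * until the request is reaped with aio_return().
 *
 *	#include <aio.h>
 *	#include <fcntl.h>
 *	#include <string.h>
 *
 *	int
 *	queue_fsync(int fd, struct aiocb *cb)
 *	{
 *		memset(cb, 0, sizeof(*cb));
 *		cb->aio_fildes = fd;
 *		return aio_fsync(O_SYNC, cb);   // 0 if queued, -1 + errno otherwise
 *	}
 */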
661int
662aio_fsync(proc_t p, struct aio_fsync_args *uap, int *retval)
663{
664 aio_entry_flags_t fsync_kind;
665 int error;
666
667 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync) | DBG_FUNC_START,
668 VM_KERNEL_ADDRPERM(p), uap->aiocbp, uap->op, 0, 0);
669
670 *retval = 0;
671 /* 0 := O_SYNC for binary backward compatibility with Panther */
672 if (uap->op == O_SYNC || uap->op == 0) {
673 fsync_kind = AIO_FSYNC;
674 } else if (uap->op == O_DSYNC) {
675 fsync_kind = AIO_DSYNC;
676 } else {
677 *retval = -1;
678 error = EINVAL;
679 goto ExitRoutine;
680 }
681
682 error = aio_queue_async_request(p, uap->aiocbp, fsync_kind);
683 if (error != 0) {
684 *retval = -1;
685 }
686
687ExitRoutine:
688 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync) | DBG_FUNC_END,
689 VM_KERNEL_ADDRPERM(p), uap->aiocbp, error, 0, 0);
690
691 return error;
692}
693
694
695/* aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
696 * file descriptor (uap->aiocbp->aio_fildes) into the buffer
697 * (uap->aiocbp->aio_buf).
698 */
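/*
 * Illustrative user-space usage (editorial sketch, not part of this file;
 * the helper name submit_read and the static buffer are hypothetical):
 * submit one asynchronous read. Both the aiocb and the buffer must remain
 * valid until the request has been reaped with aio_return().
 *
 *	#include <aio.h>
 *	#include <string.h>
 *
 *	static char read_buf[4096];
 *
 *	int
 *	submit_read(int fd, struct aiocb *cb)
 *	{
 *		memset(cb, 0, sizeof(*cb));
 *		cb->aio_fildes = fd;
 *		cb->aio_buf    = read_buf;
 *		cb->aio_nbytes = sizeof(read_buf);
 *		cb->aio_offset = 0;
 *		return aio_read(cb);    // 0 if queued, -1 + errno otherwise
 *	}
 */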
699int
700aio_read(proc_t p, struct aio_read_args *uap, int *retval)
701{
702 int error;
703
704 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_read) | DBG_FUNC_START,
705 VM_KERNEL_ADDRPERM(p), uap->aiocbp, 0, 0, 0);
706
707 *retval = 0;
708
709 error = aio_queue_async_request(p, uap->aiocbp, AIO_READ);
710 if (error != 0) {
711 *retval = -1;
712 }
713
714 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_read) | DBG_FUNC_END,
715 VM_KERNEL_ADDRPERM(p), uap->aiocbp, error, 0, 0);
716
717 return error;
718}
719
720
721/*
722 * aio_return - return the return status associated with the async IO
723 * request referred to by uap->aiocbp. The return status is the value
724 * that would be returned by corresponding IO request (read, write,
725 * fdatasync, or sync). This is where we release kernel resources
726 * held for async IO call associated with the given aiocb pointer.
727 */
728int
729aio_return(proc_t p, struct aio_return_args *uap, user_ssize_t *retval)
730{
731 aio_workq_entry *entryp;
732 int error = EINVAL;
733
734 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_return) | DBG_FUNC_START,
735 VM_KERNEL_ADDRPERM(p), uap->aiocbp, 0, 0, 0);
736
737 /* See if there are any entries to check */
738 if (!aio_has_any_work()) {
739 goto ExitRoutine;
740 }
741
742 aio_proc_lock(p);
743 *retval = 0;
744
745 /* look for a match on our queue of async IO requests that have completed */
746 TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
747 ASSERT_AIO_FROM_PROC(entryp, p);
748 if (entryp->uaiocbp == uap->aiocbp) {
749 /* Done and valid for aio_return(), pull it off the list */
750 aio_proc_remove_done_locked(p, entryp);
751
752 *retval = entryp->returnval;
753 error = 0;
754 aio_proc_unlock(p);
755
756 aio_entry_unref(entryp);
757
758 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val) | DBG_FUNC_NONE,
759 VM_KERNEL_ADDRPERM(p), uap->aiocbp, *retval, 0, 0);
760 goto ExitRoutine;
761 }
762 }
763
764 /* look for a match on our queue of active async IO requests */
765 TAILQ_FOREACH(entryp, &p->p_aio_activeq, aio_proc_link) {
766 ASSERT_AIO_FROM_PROC(entryp, p);
767 if (entryp->uaiocbp == uap->aiocbp) {
768 error = EINPROGRESS;
769 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq) | DBG_FUNC_NONE,
770 VM_KERNEL_ADDRPERM(p), uap->aiocbp, *retval, 0, 0);
771 break;
772 }
773 }
774
775 aio_proc_unlock(p);
776
777ExitRoutine:
778 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_return) | DBG_FUNC_END,
779 VM_KERNEL_ADDRPERM(p), uap->aiocbp, error, 0, 0);
780
781 return error;
782}
783
784
785/*
786 * _aio_exec - internal function used to clean up async IO requests for
787 * a process that is going away due to exec(). We cancel any async IOs
788 * we can and wait for those already active. We also disable signaling
789 * for cancelled or active aio requests that complete.
790 * This routine MAY block!
791 */
792__private_extern__ void
793_aio_exec(proc_t p)
794{
795 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_exec) | DBG_FUNC_START,
796 VM_KERNEL_ADDRPERM(p), 0, 0, 0, 0);
797
798 _aio_exit(p);
799
800 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_exec) | DBG_FUNC_END,
801 VM_KERNEL_ADDRPERM(p), 0, 0, 0, 0);
802}
803
804
805/*
806 * _aio_exit - internal function used to clean up async IO requests for
807 * a process that is terminating (via exit() or exec()). We cancel any async IOs
808 * we can and wait for those already active. We also disable signaling
809 * for cancelled or active aio requests that complete. This routine MAY block!
810 */
811__private_extern__ void
812_aio_exit(proc_t p)
813{
814 TAILQ_HEAD(, aio_workq_entry) tofree = TAILQ_HEAD_INITIALIZER(tofree);
815 aio_workq_entry *entryp, *tmp;
816 int error;
817
818 /* quick check to see if there are any async IO requests queued up */
819 if (!aio_has_any_work()) {
820 return;
821 }
822
823 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_exit) | DBG_FUNC_START,
824 VM_KERNEL_ADDRPERM(p), 0, 0, 0, 0);
825
826 aio_proc_lock(p);
827
828 /*
829 * cancel async IO requests on the todo work queue and wait for those
830 * already active to complete.
831 */
832 error = do_aio_cancel_locked(p, -1, USER_ADDR_NULL, AIO_EXIT_WAIT);
833 ASSERT_AIO_PROC_LOCK_OWNED(p);
834 if (error == AIO_NOTCANCELED) {
835 /*
836 * AIO_NOTCANCELED is returned when we find an aio request for this process
837 * on the active async IO queue. Active requests cannot be cancelled so we
838 * must wait for them to complete. We will get a special wake up call on
839 * our channel used to sleep for ALL active requests to complete. This sleep
840 * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
841 * active aio requests.
842 */
843
844 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep) | DBG_FUNC_NONE,
845 VM_KERNEL_ADDRPERM(p), 0, 0, 0, 0);
846
847 while (aio_has_active_requests_for_process(p)) {
848 msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_exit", 0);
849 }
850 }
851
852 assert(!aio_has_active_requests_for_process(p));
853
854 /* release all aio resources used by this process */
855 TAILQ_FOREACH_SAFE(entryp, &p->p_aio_doneq, aio_proc_link, tmp) {
856 ASSERT_AIO_FROM_PROC(entryp, p);
857
858 aio_proc_remove_done_locked(p, entryp);
859 TAILQ_INSERT_TAIL(&tofree, entryp, aio_proc_link);
860 }
861
862 aio_proc_unlock(p);
863
864 /* free all the entries outside of the aio_proc_lock() */
865 TAILQ_FOREACH_SAFE(entryp, &tofree, aio_proc_link, tmp) {
866 entryp->aio_proc_link.tqe_prev = NULL;
867 aio_entry_unref(entryp);
868 }
869
870 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_exit) | DBG_FUNC_END,
871 VM_KERNEL_ADDRPERM(p), 0, 0, 0, 0);
872}
873
874
875static bool
876should_cancel(aio_workq_entry *entryp, int fd, user_addr_t aiocbp,
877 aio_entry_flags_t reason)
878{
879 if (reason & AIO_EXIT_WAIT) {
880 /* caller is _aio_exit() */
881 return true;
882 }
883 if (fd != entryp->aiocb.aio_fildes) {
884 /* not the file we're looking for */
885 return false;
886 }
887 /*
888 * aio_cancel() or _aio_close() cancel
889 * everything for a given fd when aiocbp is NULL
890 */
891 return aiocbp == USER_ADDR_NULL || entryp->uaiocbp == aiocbp;
892}
893
894/*
895 * do_aio_cancel_locked - cancel async IO requests (if possible). We get called by
 896 * aio_cancel(), close, and exit/exec.
 897 * There are three modes of operation: 1) cancel all async IOs for a process -
 898 * fd is 0 and aiocbp is NULL; 2) cancel all async IOs for a file descriptor -
 899 * fd is > 0 and aiocbp is NULL; 3) cancel one async IO associated with the
 900 * given aiocbp.
901 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
902 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
903 * target async IO requests, and AIO_ALLDONE if all target async IO requests
904 * were already complete.
 905 * WARNING - do not dereference aiocbp in this routine; it may point to user
906 * land data that has not been copied in (when called from aio_cancel())
907 *
908 * Called with proc locked, and returns the same way.
909 */
910static int
911do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp,
912 aio_entry_flags_t reason)
913{
914 bool multiple_matches = (aiocbp == USER_ADDR_NULL);
915 aio_workq_entry *entryp, *tmp;
916 int result;
917
918 ASSERT_AIO_PROC_LOCK_OWNED(p);
919
920 /* look for a match on our queue of async todo work. */
921again:
922 result = -1;
923 TAILQ_FOREACH_SAFE(entryp, &p->p_aio_activeq, aio_proc_link, tmp) {
924 ASSERT_AIO_FROM_PROC(entryp, p);
925
926 if (!should_cancel(entryp, fd, aiocbp, reason)) {
927 continue;
928 }
929
930 if (reason) {
931 /* mark the entry as blocking close or exit/exec */
932 entryp->flags |= reason;
933 if ((entryp->flags & AIO_EXIT_WAIT) && (entryp->flags & AIO_CLOSE_WAIT)) {
934 panic("Close and exit flags set at the same time\n");
935 }
936 }
937
938 /* Can only be cancelled if it's still on a work queue */
939 if (aio_entry_try_workq_remove(entryp)) {
940 entryp->errorval = ECANCELED;
941 entryp->returnval = -1;
942
943 /* Now it's officially cancelled. Do the completion */
944 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq) | DBG_FUNC_NONE,
945 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
946 fd, 0, 0);
947 do_aio_completion_and_unlock(p, entryp);
948
949 aio_proc_lock(p);
950
951 if (multiple_matches) {
952 /*
953 * Restart from the head of the proc active queue since it
954 * may have been changed while we were away doing completion
955 * processing.
956 *
957 * Note that if we found an uncancellable AIO before, we will
958 * either find it again or discover that it's been completed,
959 * so resetting the result will not cause us to return success
960 * despite outstanding AIOs.
961 */
962 goto again;
963 }
964
965 return AIO_CANCELED;
966 }
967
968 /*
969 * It's been taken off the active queue already, i.e. is in flight.
970 * All we can do is ask for notification.
971 */
972 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq) | DBG_FUNC_NONE,
973 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
974 fd, 0, 0);
975
976 result = AIO_NOTCANCELED;
977 if (!multiple_matches) {
978 return result;
979 }
980 }
981
982 /*
983 * if we didn't find any matches on the todo or active queues then look for a
984 * match on our queue of async IO requests that have completed and if found
985 * return AIO_ALLDONE result.
986 *
987 * Proc AIO lock is still held.
988 */
989 if (result == -1) {
990 TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
991 ASSERT_AIO_FROM_PROC(entryp, p);
992 if (should_cancel(entryp, fd, aiocbp, reason)) {
993 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq) | DBG_FUNC_NONE,
994 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
995 fd, 0, 0);
996
997 result = AIO_ALLDONE;
998 if (!multiple_matches) {
999 return result;
1000 }
1001 }
1002 }
1003 }
1004
1005 return result;
1006}
1007
1008
1009/*
1010 * aio_suspend - suspend the calling thread until at least one of the async
1011 * IO operations referenced by uap->aiocblist has completed, until a signal
1012 * interrupts the function, or uap->timeoutp time interval (optional) has
1013 * passed.
1014 * Returns 0 if one or more async IOs have completed else -1 and errno is
1015 * set appropriately - EAGAIN if timeout elapses or EINTR if an interrupt
1016 * woke us up.
1017 */
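/*
 * Illustrative user-space usage (editorial sketch, not part of this file;
 * the helper name wait_for_any is hypothetical): wait up to one second for
 * any request in a list to complete.
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *	#include <time.h>
 *
 *	int
 *	wait_for_any(const struct aiocb *const list[], int nent)
 *	{
 *		struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 *
 *		if (aio_suspend(list, nent, &ts) == -1) {
 *			// errno is EAGAIN on timeout, EINTR on signal
 *			return -1;
 *		}
 *		return 0;       // at least one listed request has completed
 *	}
 */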
1018int
1019aio_suspend(proc_t p, struct aio_suspend_args *uap, int *retval)
1020{
1021 __pthread_testcancel(1);
1022 return aio_suspend_nocancel(p, (struct aio_suspend_nocancel_args *)uap, retval);
1023}
1024
1025
1026int
1027aio_suspend_nocancel(proc_t p, struct aio_suspend_nocancel_args *uap, int *retval)
1028{
1029 int error;
1030 int i;
1031 uint64_t abstime;
1032 struct user_timespec ts;
1033 aio_workq_entry *entryp;
1034 user_addr_t *aiocbpp;
1035 size_t aiocbpp_size;
1036
1037 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend) | DBG_FUNC_START,
1038 VM_KERNEL_ADDRPERM(p), uap->nent, 0, 0, 0);
1039
1040 *retval = -1;
1041 abstime = 0;
1042 aiocbpp = NULL;
1043
1044 if (!aio_has_any_work()) {
1045 error = EINVAL;
1046 goto ExitThisRoutine;
1047 }
1048
1049 if (uap->nent < 1 || uap->nent > aio_max_requests_per_process ||
1050 os_mul_overflow(sizeof(user_addr_t), uap->nent, &aiocbpp_size)) {
1051 error = EINVAL;
1052 goto ExitThisRoutine;
1053 }
1054
1055 if (uap->timeoutp != USER_ADDR_NULL) {
1056 if (proc_is64bit(p)) {
1057 struct user64_timespec temp;
1058 error = copyin(uap->timeoutp, &temp, sizeof(temp));
1059 if (error == 0) {
1060 ts.tv_sec = (user_time_t)temp.tv_sec;
1061 ts.tv_nsec = (user_long_t)temp.tv_nsec;
1062 }
1063 } else {
1064 struct user32_timespec temp;
1065 error = copyin(uap->timeoutp, &temp, sizeof(temp));
1066 if (error == 0) {
1067 ts.tv_sec = temp.tv_sec;
1068 ts.tv_nsec = temp.tv_nsec;
1069 }
1070 }
1071 if (error != 0) {
1072 error = EAGAIN;
1073 goto ExitThisRoutine;
1074 }
1075
1076 if (ts.tv_sec < 0 || ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000) {
1077 error = EINVAL;
1078 goto ExitThisRoutine;
1079 }
1080
1081 nanoseconds_to_absolutetime((uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
1082 &abstime);
1083 clock_absolutetime_interval_to_deadline(abstime, &abstime);
1084 }
1085
1086 aiocbpp = kheap_alloc(KHEAP_TEMP, aiocbpp_size, Z_WAITOK);
1087 if (aiocbpp == NULL || aio_copy_in_list(p, uap->aiocblist, aiocbpp, uap->nent)) {
1088 error = EAGAIN;
1089 goto ExitThisRoutine;
1090 }
1091
1092 /* check list of aio requests to see if any have completed */
1093check_for_our_aiocbp:
1094 aio_proc_lock_spin(p);
1095 for (i = 0; i < uap->nent; i++) {
1096 user_addr_t aiocbp;
1097
1098 /* NULL elements are legal so check for 'em */
1099 aiocbp = *(aiocbpp + i);
1100 if (aiocbp == USER_ADDR_NULL) {
1101 continue;
1102 }
1103
1104 /* return immediately if any aio request in the list is done */
1105 TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
1106 ASSERT_AIO_FROM_PROC(entryp, p);
1107 if (entryp->uaiocbp == aiocbp) {
1108 aio_proc_unlock(p);
1109 *retval = 0;
1110 error = 0;
1111 goto ExitThisRoutine;
1112 }
1113 }
1114 }
1115
1116 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep) | DBG_FUNC_NONE,
1117 VM_KERNEL_ADDRPERM(p), uap->nent, 0, 0, 0);
1118
1119 /*
 1120 * wait for an async IO to complete, a signal to fire, or the timeout to expire.
 1121 * We return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
1122 * interrupts us. If an async IO completes before a signal fires or our
1123 * timeout expires, we get a wakeup call from aio_work_thread().
1124 */
1125
1126 error = msleep1(&p->AIO_SUSPEND_SLEEP_CHAN, aio_proc_mutex(p),
1127 PCATCH | PWAIT | PDROP, "aio_suspend", abstime);
1128 if (error == 0) {
1129 /*
1130 * got our wakeup call from aio_work_thread().
1131 * Since we can get a wakeup on this channel from another thread in the
1132 * same process we head back up to make sure this is for the correct aiocbp.
1133 * If it is the correct aiocbp we will return from where we do the check
1134 * (see entryp->uaiocbp == aiocbp after check_for_our_aiocbp label)
1135 * else we will fall out and just sleep again.
1136 */
1137 goto check_for_our_aiocbp;
1138 } else if (error == EWOULDBLOCK) {
1139 /* our timeout expired */
1140 error = EAGAIN;
1141 } else {
1142 /* we were interrupted */
1143 error = EINTR;
1144 }
1145
1146ExitThisRoutine:
1147 if (aiocbpp != NULL) {
1148 kheap_free(KHEAP_TEMP, aiocbpp, aiocbpp_size);
1149 }
1150
1151 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend) | DBG_FUNC_END,
1152 VM_KERNEL_ADDRPERM(p), uap->nent, error, 0, 0);
1153
1154 return error;
1155}
1156
1157
1158/* aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
1159 * file descriptor (uap->aiocbp->aio_fildes) from the buffer
1160 * (uap->aiocbp->aio_buf).
1161 */
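/*
 * Illustrative user-space usage (editorial sketch, not part of this file;
 * submit_write and write_buf are hypothetical): the write side mirrors the
 * aio_read() example above, with aio_buf/aio_nbytes describing the data to
 * write.
 *
 *	#include <aio.h>
 *	#include <string.h>
 *
 *	static char write_buf[4096];
 *
 *	int
 *	submit_write(int fd, struct aiocb *cb)
 *	{
 *		memset(cb, 0, sizeof(*cb));
 *		cb->aio_fildes = fd;
 *		cb->aio_buf    = write_buf;
 *		cb->aio_nbytes = sizeof(write_buf);
 *		cb->aio_offset = 0;
 *		return aio_write(cb);   // 0 if queued, -1 + errno otherwise
 *	}
 */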
1162
1163int
1164aio_write(proc_t p, struct aio_write_args *uap, int *retval __unused)
1165{
1166 int error;
1167
1168 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_write) | DBG_FUNC_START,
1169 VM_KERNEL_ADDRPERM(p), uap->aiocbp, 0, 0, 0);
1170
1171 error = aio_queue_async_request(p, uap->aiocbp, AIO_WRITE);
1172
1173 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_write) | DBG_FUNC_END,
1174 VM_KERNEL_ADDRPERM(p), uap->aiocbp, error, 0, 0);
1175
1176 return error;
1177}
1178
1179
1180static int
1181aio_copy_in_list(proc_t procp, user_addr_t aiocblist, user_addr_t *aiocbpp,
1182 int nent)
1183{
1184 int result;
1185
1186 /* copyin our aiocb pointers from list */
1187 result = copyin(aiocblist, aiocbpp,
1188 proc_is64bit(procp) ? (nent * sizeof(user64_addr_t))
1189 : (nent * sizeof(user32_addr_t)));
1190 if (result) {
1191 return result;
1192 }
1193
1194 /*
1195 * We depend on a list of user_addr_t's so we need to
1196 * munge and expand when these pointers came from a
1197 * 32-bit process
1198 */
1199 if (!proc_is64bit(procp)) {
1200 /* copy from last to first to deal with overlap */
1201 user32_addr_t *my_ptrp = ((user32_addr_t *)aiocbpp) + (nent - 1);
1202 user_addr_t *my_addrp = aiocbpp + (nent - 1);
1203
1204 for (int i = 0; i < nent; i++, my_ptrp--, my_addrp--) {
1205 *my_addrp = (user_addr_t) (*my_ptrp);
1206 }
1207 }
1208
1209 return 0;
1210}
1211
1212
1213static int
1214aio_copy_in_sigev(proc_t procp, user_addr_t sigp, struct user_sigevent *sigev)
1215{
1216 int result = 0;
1217
1218 if (sigp == USER_ADDR_NULL) {
1219 goto out;
1220 }
1221
1222 /*
1223 * We need to munge aio_sigevent since it contains pointers.
1224 * Since we do not know if sigev_value is an int or a ptr we do
1225 * NOT cast the ptr to a user_addr_t. This means if we send
1226 * this info back to user space we need to remember sigev_value
1227 * was not expanded for the 32-bit case.
1228 *
1229 * Notes: This does NOT affect us since we don't support
1230 * sigev_value yet in the aio context.
1231 */
1232 if (proc_is64bit(procp)) {
1233#if __LP64__
1234 struct user64_sigevent sigevent64;
1235
1236 result = copyin(sigp, &sigevent64, sizeof(sigevent64));
1237 if (result == 0) {
1238 sigev->sigev_notify = sigevent64.sigev_notify;
1239 sigev->sigev_signo = sigevent64.sigev_signo;
1240 sigev->sigev_value.size_equivalent.sival_int = sigevent64.sigev_value.size_equivalent.sival_int;
1241 sigev->sigev_notify_function = sigevent64.sigev_notify_function;
1242 sigev->sigev_notify_attributes = sigevent64.sigev_notify_attributes;
1243 }
1244#else
1245 panic("64bit process on 32bit kernel is not supported");
1246#endif
1247 } else {
1248 struct user32_sigevent sigevent32;
1249
1250 result = copyin(sigp, &sigevent32, sizeof(sigevent32));
1251 if (result == 0) {
1252 sigev->sigev_notify = sigevent32.sigev_notify;
1253 sigev->sigev_signo = sigevent32.sigev_signo;
1254 sigev->sigev_value.size_equivalent.sival_int = sigevent32.sigev_value.sival_int;
1255 sigev->sigev_notify_function = CAST_USER_ADDR_T(sigevent32.sigev_notify_function);
1256 sigev->sigev_notify_attributes = CAST_USER_ADDR_T(sigevent32.sigev_notify_attributes);
1257 }
1258 }
1259
1260 if (result != 0) {
1261 result = EAGAIN;
1262 }
1263
1264out:
1265 return result;
1266}
1267
1268/*
 1269 * validate user_sigevent. At this point we only support
 1270 * sigev_notify equal to SIGEV_SIGNAL or SIGEV_NONE. This means
 1271 * sigev_value, sigev_notify_function, and sigev_notify_attributes
 1272 * are ignored, since SIGEV_THREAD is unsupported. This is consistent
 1273 * with no [RTS] (Realtime Signal) option group support.
1274 */
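/*
 * Illustrative example (editorial sketch, not part of this file; "fd" and
 * "buf" are hypothetical): an aiocb whose sigevent passes the validation
 * below, i.e. completion notification via a plain signal. SIGEV_THREAD is
 * rejected with EINVAL.
 *
 *	struct aiocb cb = {
 *		.aio_fildes = fd,
 *		.aio_buf    = buf,
 *		.aio_nbytes = sizeof(buf),
 *		.aio_sigevent = {
 *			.sigev_notify = SIGEV_SIGNAL,
 *			.sigev_signo  = SIGUSR1,
 *		},
 *	};
 */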
1275static int
1276aio_sigev_validate(const struct user_sigevent *sigev)
1277{
1278 switch (sigev->sigev_notify) {
1279 case SIGEV_SIGNAL:
1280 {
1281 int signum;
1282
1283 /* make sure we have a valid signal number */
1284 signum = sigev->sigev_signo;
1285 if (signum <= 0 || signum >= NSIG ||
1286 signum == SIGKILL || signum == SIGSTOP) {
1287 return EINVAL;
1288 }
1289 }
1290 break;
1291
1292 case SIGEV_NONE:
1293 break;
1294
1295 case SIGEV_THREAD:
1296 /* Unsupported [RTS] */
1297
1298 default:
1299 return EINVAL;
1300 }
1301
1302 return 0;
1303}
1304
1305
1306/*
1307 * aio_try_enqueue_work_locked
1308 *
 1309 * Queue up the entry on the aio asynchronous work queue in priority order
 1310 * based on the relative priority of the request. We calculate the relative
 1311 * priority using the nice value of the caller.
1312 *
1313 * Parameters: procp Process queueing the I/O
1314 * entryp The work queue entry being queued
1315 * leader The work leader if any
1316 *
 1317 * Returns: Whether the enqueue was successful
1318 *
1319 * Notes: This function is used for both lio_listio and aio
1320 *
1321 * XXX: At some point, we may have to consider thread priority
1322 * rather than process priority, but we don't maintain the
1323 * adjusted priority for threads the POSIX way.
1324 *
1325 * Called with proc locked.
1326 */
1327static bool
1328aio_try_enqueue_work_locked(proc_t procp, aio_workq_entry *entryp,
1329 aio_workq_entry *leader)
1330{
1331 aio_workq_t queue = aio_entry_workq(entryp);
1332
1333 ASSERT_AIO_PROC_LOCK_OWNED(procp);
1334
1335 /* Onto proc queue */
1336 if (!aio_try_proc_insert_active_locked(procp, entryp)) {
1337 return false;
1338 }
1339
1340 if (leader) {
1341 aio_entry_ref(leader); /* consumed in do_aio_completion_and_unlock */
1342 leader->lio_pending++;
1343 entryp->lio_leader = leader;
1344 }
1345
1346 /* And work queue */
1347 aio_entry_ref(entryp); /* consumed in do_aio_completion_and_unlock */
1348 aio_workq_lock_spin(queue);
1349 aio_workq_add_entry_locked(queue, entryp);
1350 waitq_wakeup64_one(&queue->aioq_waitq, CAST_EVENT64_T(queue),
1351 THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
1352 aio_workq_unlock(queue);
1353
1354 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued) | DBG_FUNC_START,
1355 VM_KERNEL_ADDRPERM(procp), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1356 entryp->flags, entryp->aiocb.aio_fildes, 0);
1357 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued) | DBG_FUNC_END,
1358 entryp->aiocb.aio_offset, 0, entryp->aiocb.aio_nbytes, 0, 0);
1359 return true;
1360}
1361
1362
1363/*
1364 * lio_listio - initiate a list of IO requests. We process the list of
1365 * aiocbs either synchronously (mode == LIO_WAIT) or asynchronously
1366 * (mode == LIO_NOWAIT).
1367 *
1368 * The caller gets error and return status for each aiocb in the list
1369 * via aio_error and aio_return. We must keep completed requests until
1370 * released by the aio_return call.
1371 */
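/*
 * Illustrative user-space usage (editorial sketch, not part of this file;
 * the helper name read_pair is hypothetical): submit two reads as one batch
 * and wait for both with LIO_WAIT. Each aiocb must still be reaped
 * individually with aio_return().
 *
 *	#include <aio.h>
 *
 *	int
 *	read_pair(struct aiocb *a, struct aiocb *b)
 *	{
 *		struct aiocb *list[2] = { a, b };
 *
 *		a->aio_lio_opcode = LIO_READ;
 *		b->aio_lio_opcode = LIO_READ;
 *		// 0 when both complete, -1 + errno otherwise (EINTR, EAGAIN, ...)
 *		return lio_listio(LIO_WAIT, list, 2, NULL);
 *	}
 */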
1372int
1373lio_listio(proc_t p, struct lio_listio_args *uap, int *retval __unused)
1374{
1375 aio_workq_entry *entries[AIO_LISTIO_MAX] = { };
1376 user_addr_t aiocbpp[AIO_LISTIO_MAX];
1377 struct user_sigevent aiosigev = { };
1378 int result = 0;
1379 int lio_count = 0;
1380
1381 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_listio) | DBG_FUNC_START,
1382 VM_KERNEL_ADDRPERM(p), uap->nent, uap->mode, 0, 0);
1383
1384 if (!(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT)) {
1385 result = EINVAL;
1386 goto ExitRoutine;
1387 }
1388
1389 if (uap->nent < 1 || uap->nent > AIO_LISTIO_MAX) {
1390 result = EINVAL;
1391 goto ExitRoutine;
1392 }
1393
1394 /*
1395 * Use sigevent passed in to lio_listio for each of our calls, but
1396 * only do completion notification after the last request completes.
1397 */
1398 if (uap->sigp != USER_ADDR_NULL) {
1399 result = aio_copy_in_sigev(p, uap->sigp, &aiosigev);
1400 if (result) {
1401 goto ExitRoutine;
1402 }
1403 result = aio_sigev_validate(&aiosigev);
1404 if (result) {
1405 goto ExitRoutine;
1406 }
1407 }
1408
1409 if (aio_copy_in_list(p, uap->aiocblist, aiocbpp, uap->nent)) {
1410 result = EAGAIN;
1411 goto ExitRoutine;
1412 }
1413
1414 /*
1415 * allocate/parse all entries
1416 */
1417 for (int i = 0; i < uap->nent; i++) {
1418 aio_workq_entry *entryp;
1419
1420 /* NULL elements are legal so check for 'em */
1421 if (aiocbpp[i] == USER_ADDR_NULL) {
1422 continue;
1423 }
1424
1425 entryp = aio_create_queue_entry(p, aiocbpp[i], AIO_LIO);
1426 if (entryp == NULL) {
1427 result = EAGAIN;
1428 goto ExitRoutine;
1429 }
1430
1431 /*
1432 * This refcount is cleaned up on exit if the entry
1433 * isn't submitted
1434 */
1435 entries[lio_count++] = entryp;
1436 if (uap->mode == LIO_NOWAIT) {
 1437 /* Set signal handler, if any */
1438 entryp->aiocb.aio_sigevent = aiosigev;
1439 }
1440 }
1441
1442 if (lio_count == 0) {
1443 /* There's nothing to submit */
1444 goto ExitRoutine;
1445 }
1446
1447 /*
 1448 * Past this point we're committed and will not bail out
1449 *
1450 * - keep a reference on the leader for LIO_WAIT
1451 * - perform the submissions and optionally wait
1452 */
1453
1454 aio_workq_entry *leader = entries[0];
1455 if (uap->mode == LIO_WAIT) {
1456 aio_entry_ref(leader); /* consumed below */
1457 }
1458
1459 aio_proc_lock_spin(p);
1460
1461 for (int i = 0; i < lio_count; i++) {
1462 if (aio_try_enqueue_work_locked(p, entries[i], leader)) {
1463 entries[i] = NULL; /* the entry was submitted */
1464 } else {
1465 result = EAGAIN;
1466 }
1467 }
1468
1469 if (uap->mode == LIO_WAIT && result == 0) {
1470 leader->flags |= AIO_LIO_WAIT;
1471
1472 while (leader->lio_pending) {
1473 /* If we were interrupted, fail out (even if all finished) */
1474 if (msleep(leader, aio_proc_mutex(p),
1475 PCATCH | PRIBIO | PSPIN, "lio_listio", 0) != 0) {
1476 result = EINTR;
1477 break;
1478 }
1479 }
1480
1481 leader->flags &= ~AIO_LIO_WAIT;
1482 }
1483
1484 aio_proc_unlock(p);
1485
1486 if (uap->mode == LIO_WAIT) {
1487 aio_entry_unref(leader);
1488 }
1489
1490ExitRoutine:
1491 /* Consume unsubmitted entries */
1492 for (int i = 0; i < lio_count; i++) {
1493 if (entries[i]) {
1494 aio_entry_unref(entries[i]);
1495 }
1496 }
1497
1498 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_listio) | DBG_FUNC_END,
1499 VM_KERNEL_ADDRPERM(p), result, 0, 0, 0);
1500
1501 return result;
1502}
1503
1504
1505/*
1506 * aio worker thread. this is where all the real work gets done.
1507 * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
1508 * after new work is queued up.
1509 */
1510__attribute__((noreturn))
1511static void
1512aio_work_thread(void *arg __unused, wait_result_t wr __unused)
1513{
1514 aio_workq_entry *entryp;
1515 int error;
1516 vm_map_t currentmap;
1517 vm_map_t oldmap = VM_MAP_NULL;
1518 task_t oldaiotask = TASK_NULL;
1519 struct uthread *uthreadp = NULL;
1520 proc_t p = NULL;
1521
1522 for (;;) {
1523 /*
1524 * returns with the entry ref'ed.
1525 * sleeps until work is available.
1526 */
1527 entryp = aio_get_some_work();
1528 p = entryp->procp;
1529
1530 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread) | DBG_FUNC_START,
1531 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1532 entryp->flags, 0, 0);
1533
1534 /*
1535 * Assume the target's address space identity for the duration
1536 * of the IO. Note: don't need to have the entryp locked,
1537 * because the proc and map don't change until it's freed.
1538 */
1539 currentmap = get_task_map((current_proc())->task);
1540 if (currentmap != entryp->aio_map) {
1541 uthreadp = (struct uthread *) get_bsdthread_info(current_thread());
1542 oldaiotask = uthreadp->uu_aio_task;
1543 /*
1544 * workq entries at this stage cause _aio_exec() and _aio_exit() to
1545 * block until we hit `do_aio_completion_and_unlock()` below,
1546 * which means that it is safe to dereference p->task without
1547 * holding a lock or taking references.
1548 */
1549 uthreadp->uu_aio_task = p->task;
1550 oldmap = vm_map_switch(entryp->aio_map);
1551 }
1552
1553 if ((entryp->flags & AIO_READ) != 0) {
1554 error = do_aio_read(entryp);
1555 } else if ((entryp->flags & AIO_WRITE) != 0) {
1556 error = do_aio_write(entryp);
1557 } else if ((entryp->flags & (AIO_FSYNC | AIO_DSYNC)) != 0) {
1558 error = do_aio_fsync(entryp);
1559 } else {
1560 error = EINVAL;
1561 }
1562
1563 /* Restore old map */
1564 if (currentmap != entryp->aio_map) {
1565 vm_map_switch(oldmap);
1566 uthreadp->uu_aio_task = oldaiotask;
1567 }
1568
1569 /* liberate unused map */
1570 vm_map_deallocate(entryp->aio_map);
1571 entryp->aio_map = VM_MAP_NULL;
1572
 1573 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread) | DBG_FUNC_END,
1574 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1575 entryp->errorval, entryp->returnval, 0);
1576
1577 /* we're done with the IO request so pop it off the active queue and */
1578 /* push it on the done queue */
1579 aio_proc_lock(p);
1580 entryp->errorval = error;
1581 do_aio_completion_and_unlock(p, entryp);
1582 }
1583}
1584
1585
1586/*
1587 * aio_get_some_work - get the next async IO request that is ready to be executed.
1588 * aio_fsync complicates matters a bit since we cannot do the fsync until all async
1589 * IO requests at the time the aio_fsync call came in have completed.
1590 * NOTE - AIO_LOCK must be held by caller
1591 */
1592static aio_workq_entry *
1593aio_get_some_work(void)
1594{
1595 aio_workq_entry *entryp = NULL;
1596 aio_workq_t queue = NULL;
1597
1598 /* Just one queue for the moment. In the future there will be many. */
1599 queue = &aio_anchor.aio_async_workqs[0];
1600 aio_workq_lock_spin(queue);
1601
1602 /*
1603 * Hold the queue lock.
1604 *
1605 * pop some work off the work queue and add to our active queue
1606 * Always start with the queue lock held.
1607 */
1608 while ((entryp = TAILQ_FIRST(&queue->aioq_entries))) {
1609 /*
 1610 * Pull off of the work queue. Once it's off, it can't be cancelled,
1611 * so we can take our ref once we drop the queue lock.
1612 */
1613
1614 aio_workq_remove_entry_locked(queue, entryp);
1615
1616 aio_workq_unlock(queue);
1617
1618 /*
1619 * Check if it's an fsync that must be delayed. No need to lock the entry;
1620 * that flag would have been set at initialization.
1621 */
1622 if ((entryp->flags & AIO_FSYNC) != 0) {
1623 /*
1624 * Check for unfinished operations on the same file
1625 * in this proc's queue.
1626 */
1627 aio_proc_lock_spin(entryp->procp);
1628 if (aio_delay_fsync_request(entryp)) {
1629 /* It needs to be delayed. Put it back on the end of the work queue */
1630 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay) | DBG_FUNC_NONE,
 1631 VM_KERNEL_ADDRPERM(entryp->procp), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1632 0, 0, 0);
1633
1634 aio_proc_unlock(entryp->procp);
1635
1636 aio_workq_lock_spin(queue);
1637 aio_workq_add_entry_locked(queue, entryp);
1638 continue;
1639 }
1640 aio_proc_unlock(entryp->procp);
1641 }
1642
1643 return entryp;
1644 }
1645
1646 /* We will wake up when someone enqueues something */
1647 waitq_assert_wait64(&queue->aioq_waitq, CAST_EVENT64_T(queue), THREAD_UNINT, 0);
1648 aio_workq_unlock(queue);
1649 thread_block(aio_work_thread);
1650
1651 __builtin_unreachable();
1652}
1653
1654/*
1655 * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed.
 1656 * A big, simple hammer: only send it off if it is the oldest IO filed by this
 1657 * process that has not yet completed (i.e. everything queued before it is done).
1658 */
1659static boolean_t
1660aio_delay_fsync_request(aio_workq_entry *entryp)
1661{
1662 if (proc_in_teardown(entryp->procp)) {
1663 /*
1664 * we can't delay FSYNCS when in teardown as it will confuse _aio_exit,
1665 * if it was dequeued, then we must now commit to it
1666 */
1667 return FALSE;
1668 }
1669
1670 if (entryp == TAILQ_FIRST(&entryp->procp->p_aio_activeq)) {
1671 return FALSE;
1672 }
1673
1674 return TRUE;
1675}
1676
1677static aio_workq_entry *
1678aio_create_queue_entry(proc_t procp, user_addr_t aiocbp, aio_entry_flags_t flags)
1679{
1680 aio_workq_entry *entryp;
1681
1682 entryp = zalloc_flags(aio_workq_zonep, Z_WAITOK | Z_ZERO);
1683 entryp->procp = procp;
1684 entryp->uaiocbp = aiocbp;
1685 entryp->flags = flags;
1686 /* consumed in aio_return or _aio_exit */
1687 os_ref_init(&entryp->aio_refcount, &aio_refgrp);
1688
1689 if (proc_is64bit(procp)) {
1690 struct user64_aiocb aiocb64;
1691
1692 if (copyin(aiocbp, &aiocb64, sizeof(aiocb64)) != 0) {
1693 goto error_exit;
1694 }
1695 do_munge_aiocb_user64_to_user(&aiocb64, &entryp->aiocb);
1696 } else {
1697 struct user32_aiocb aiocb32;
1698
1699 if (copyin(aiocbp, &aiocb32, sizeof(aiocb32)) != 0) {
1700 goto error_exit;
1701 }
1702 do_munge_aiocb_user32_to_user(&aiocb32, &entryp->aiocb);
1703 }
1704
1705 /* do some more validation on the aiocb and embedded file descriptor */
1706 if (aio_validate(procp, entryp) != 0) {
1707 goto error_exit;
1708 }
1709
1710 /* get a reference to the user land map in order to keep it around */
1711 entryp->aio_map = get_task_map(procp->task);
1712 vm_map_reference(entryp->aio_map);
1713
1714 /* get a reference on the current_thread, which is passed in vfs_context. */
1715 entryp->thread = current_thread();
1716 thread_reference(entryp->thread);
1717 return entryp;
1718
1719error_exit:
1720 zfree(aio_workq_zonep, entryp);
1721 return NULL;
1722}
1723
1724
1725/*
1726 * aio_queue_async_request - queue up an async IO request on our work queue then
1727 * wake up one of our worker threads to do the actual work. We get a reference
1728 * to our caller's user land map in order to keep it around while we are
1729 * processing the request.
1730 */
1731static int
1732aio_queue_async_request(proc_t procp, user_addr_t aiocbp,
1733 aio_entry_flags_t flags)
1734{
1735 aio_workq_entry *entryp;
1736 int result;
1737
1738 entryp = aio_create_queue_entry(procp, aiocbp, flags);
1739 if (entryp == NULL) {
1740 result = EAGAIN;
1741 goto error_noalloc;
1742 }
1743
1744 aio_proc_lock_spin(procp);
1745 if (!aio_try_enqueue_work_locked(procp, entryp, NULL)) {
1746 result = EAGAIN;
1747 goto error_exit;
1748 }
1749 aio_proc_unlock(procp);
1750 return 0;
1751
1752error_exit:
1753 /*
1754 * This entry has not been queued up so no worries about
1755 * unlocked state and aio_map
1756 */
1757 aio_proc_unlock(procp);
1758 aio_free_request(entryp);
1759error_noalloc:
1760 return result;
1761}
1762
1763
1764/*
1765 * aio_free_request - remove our reference on the user land map and
1766 * free the work queue entry resources. The entry is off all lists
1767 * and has zero refcount, so no one can have a pointer to it.
1768 */
1769static void
1770aio_free_request(aio_workq_entry *entryp)
1771{
1772 if (entryp->aio_proc_link.tqe_prev || entryp->aio_workq_link.tqe_prev) {
1773 panic("aio_workq_entry %p being freed while still enqueued", entryp);
1774 }
1775
1776 /* remove our reference to the user land map. */
1777 if (VM_MAP_NULL != entryp->aio_map) {
1778 vm_map_deallocate(entryp->aio_map);
1779 }
1780
1781 /* remove our reference to thread which enqueued the request */
1782 if (NULL != entryp->thread) {
1783 thread_deallocate(entryp->thread);
1784 }
1785
1786 zfree(aio_workq_zonep, entryp);
1787}
1788
1789
1790/*
1791 * aio_validate
1792 *
1793 * validate the aiocb passed in by one of the aio syscalls.
1794 */
1795static int
1796aio_validate(proc_t p, aio_workq_entry *entryp)
1797{
1798 struct fileproc *fp;
1799 int flag;
1800 int result;
1801
1802 result = 0;
1803
1804 if ((entryp->flags & AIO_LIO) != 0) {
1805 if (entryp->aiocb.aio_lio_opcode == LIO_READ) {
1806 entryp->flags |= AIO_READ;
1807 } else if (entryp->aiocb.aio_lio_opcode == LIO_WRITE) {
1808 entryp->flags |= AIO_WRITE;
1809 } else if (entryp->aiocb.aio_lio_opcode == LIO_NOP) {
1810 return 0;
1811 } else {
1812 return EINVAL;
1813 }
1814 }
1815
1816 flag = FREAD;
1817 if ((entryp->flags & (AIO_WRITE | AIO_FSYNC | AIO_DSYNC)) != 0) {
1818 flag = FWRITE;
1819 }
1820
1821 if ((entryp->flags & (AIO_READ | AIO_WRITE)) != 0) {
1822 if (entryp->aiocb.aio_nbytes > INT_MAX ||
1823 entryp->aiocb.aio_buf == USER_ADDR_NULL ||
1824 entryp->aiocb.aio_offset < 0) {
1825 return EINVAL;
1826 }
1827 }
1828
1829 result = aio_sigev_validate(&entryp->aiocb.aio_sigevent);
1830 if (result) {
1831 return result;
1832 }
1833
1834 /* validate the file descriptor and that the file was opened
1835 * for the appropriate read / write access.
1836 */
1837 proc_fdlock(p);
1838
1839 fp = fp_get_noref_locked(p, entryp->aiocb.aio_fildes);
1840 if (fp == NULL) {
1841 result = EBADF;
1842 } else if ((fp->fp_glob->fg_flag & flag) == 0) {
1843 /* we don't have read or write access */
1844 result = EBADF;
1845 } else if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_VNODE) {
1846 /* this is not a file */
1847 result = ESPIPE;
1848 } else {
1849 fp->fp_flags |= FP_AIOISSUED;
1850 }
1851
1852 proc_fdunlock(p);
1853
1854 return result;
1855}
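
/*
 * Illustrative user-space sketch (under #if 0, never compiled): a two-entry
 * lio_listio() submission. The aio_lio_opcode values LIO_READ / LIO_WRITE /
 * LIO_NOP set here are what the AIO_LIO branch of aio_validate() above maps
 * onto the AIO_READ / AIO_WRITE entry flags. The descriptors rfd and wfd are
 * assumed to already be open with the matching access modes.
 */
#if 0
#include <aio.h>
#include <string.h>

int
example_lio(int rfd, int wfd, char *rbuf, char *wbuf, size_t len)
{
	struct aiocb rcb, wcb;
	struct aiocb *list[2] = { &rcb, &wcb };

	memset(&rcb, 0, sizeof(rcb));
	rcb.aio_fildes = rfd;
	rcb.aio_buf = rbuf;
	rcb.aio_nbytes = len;
	rcb.aio_lio_opcode = LIO_READ;

	memset(&wcb, 0, sizeof(wcb));
	wcb.aio_fildes = wfd;
	wcb.aio_buf = wbuf;
	wcb.aio_nbytes = len;
	wcb.aio_lio_opcode = LIO_WRITE;

	/* LIO_WAIT blocks until every entry in the list has completed */
	return lio_listio(LIO_WAIT, list, 2, NULL);
}
#endif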
1856
1857/*
1858 * do_aio_completion_and_unlock - handle async IO completion.
1859 */
1860static void
1861do_aio_completion_and_unlock(proc_t p, aio_workq_entry *entryp)
1862{
1863 aio_workq_entry *leader = entryp->lio_leader;
1864 int lio_pending = 0;
1865 bool do_signal = false;
1866
1867 ASSERT_AIO_PROC_LOCK_OWNED(p);
1868
1869 aio_proc_move_done_locked(p, entryp);
1870
1871 if (leader) {
1872 lio_pending = --leader->lio_pending;
1873 if (lio_pending < 0) {
1874 panic("lio_pending accounting mistake");
1875 }
1876 if (lio_pending == 0 && (leader->flags & AIO_LIO_WAIT)) {
1877 wakeup(leader);
1878 }
1879 entryp->lio_leader = NULL; /* no dangling pointers please */
1880 }
1881
1882 /*
1883 * We need to handle the case where a process is trying to exit, exec, or
1884 * close and is currently waiting for active aio requests to complete.
1885 * If AIO_CLEANUP_WAIT is set then we need to look to see if there are any
1886 * other requests in the active queue for this process. If there are
1887 * none, then wake up using the AIO_CLEANUP_SLEEP_CHAN tsleep channel.
1888 * If there are some still active then do nothing - we only want to
1889 * wake up when all active aio requests for the process are complete.
1890 */
1891 if (__improbable(entryp->flags & AIO_EXIT_WAIT)) {
1892 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait) | DBG_FUNC_NONE,
1893 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1894 0, 0, 0);
1895
1896 if (!aio_has_active_requests_for_process(p)) {
1897 /*
1898 * no active aio requests for this process, continue exiting. In this
1899 * case, there should be no one else waiting on the proc in AIO...
1900 */
1901 wakeup_one((caddr_t)&p->AIO_CLEANUP_SLEEP_CHAN);
1902
1903 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake) | DBG_FUNC_NONE,
1904 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1905 0, 0, 0);
1906 }
1907 } else if (entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
1908 /*
1909 * If this was the last request in the group, or was not part of
1910 * a group, and a signal is desired, send one.
1911 */
1912 do_signal = (lio_pending == 0);
1913 }
1914
1915 if (__improbable(entryp->flags & AIO_CLOSE_WAIT)) {
1916 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait) | DBG_FUNC_NONE,
1917 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1918 0, 0, 0);
1919
1920 if (!aio_proc_has_active_requests_for_file(p, entryp->aiocb.aio_fildes)) {
1921 /* Can't wakeup_one(); multiple closes might be in progress. */
1922 wakeup(&p->AIO_CLEANUP_SLEEP_CHAN);
1923
1924 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake) | DBG_FUNC_NONE,
1925 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1926 0, 0, 0);
1927 }
1928 }
1929
1930 aio_proc_unlock(p);
1931
1932 if (do_signal) {
1933 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig) | DBG_FUNC_NONE,
1934 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1935 entryp->aiocb.aio_sigevent.sigev_signo, 0, 0);
1936
1937 psignal(p, entryp->aiocb.aio_sigevent.sigev_signo);
1938 }
1939
1940 /*
1941 * A thread in aio_suspend() wants to know about completed IOs. If it checked
1942 * the done list before we moved our AIO there, then it already asserted its wait,
1943 * and we can wake it up without holding the lock. If it checked the list after
1944 * we did our move, then it has already seen the AIO that we moved. Either way,
1945 * we can do our wakeup without holding the lock.
1946 */
1947 wakeup(&p->AIO_SUSPEND_SLEEP_CHAN);
1948 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake) | DBG_FUNC_NONE,
1949 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp), 0, 0, 0);
1950
1951 aio_entry_unref(entryp); /* see aio_try_enqueue_work_locked */
1952 if (leader) {
1953 aio_entry_unref(leader); /* see lio_listio */
1954 }
1955}
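
/*
 * Illustrative user-space sketch (under #if 0, never compiled): the two ways
 * a caller observes the completion that do_aio_completion_and_unlock()
 * reports - a queued signal (SIGEV_SIGNAL, delivered by the psignal() above)
 * and a blocking aio_suspend() (woken via AIO_SUSPEND_SLEEP_CHAN). The
 * choice of SIGUSR1 and the empty handler are arbitrary.
 */
#if 0
#include <aio.h>
#include <signal.h>
#include <string.h>

static void
on_aio_done(int signo)
{
	(void)signo;    /* real code would do only async-signal-safe work here */
}

int
example_completion_wait(int fd, char *buf, size_t len)
{
	struct aiocb cb;
	const struct aiocb *list[1] = { &cb };

	signal(SIGUSR1, on_aio_done);

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_buf = buf;
	cb.aio_nbytes = len;
	cb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
	cb.aio_sigevent.sigev_signo = SIGUSR1;

	if (aio_read(&cb) != 0) {
		return -1;
	}

	/* block until the request reaches the done queue */
	if (aio_suspend(list, 1, NULL) != 0) {
		return -1;
	}
	return (int)aio_return(&cb);
}
#endif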
1956
1957
1958/*
1959 * do_aio_read
1960 */
1961static int
1962do_aio_read(aio_workq_entry *entryp)
1963{
1964 struct proc *p = entryp->procp;
1965 struct fileproc *fp;
1966 int error;
1967
1968 if ((error = fp_lookup(p, entryp->aiocb.aio_fildes, &fp, 0))) {
1969 return error;
1970 }
1971
1972 if (fp->fp_glob->fg_flag & FREAD) {
1973 struct vfs_context context = {
1974 .vc_thread = entryp->thread, /* XXX */
1975 .vc_ucred = fp->fp_glob->fg_cred,
1976 };
1977
1978 error = dofileread(&context, fp,
1979 entryp->aiocb.aio_buf,
1980 entryp->aiocb.aio_nbytes,
1981 entryp->aiocb.aio_offset, FOF_OFFSET,
1982 &entryp->returnval);
1983 } else {
1984 error = EBADF;
1985 }
1986
1987 fp_drop(p, entryp->aiocb.aio_fildes, fp, 0);
1988 return error;
1989}
1990
1991
1992/*
1993 * do_aio_write
1994 */
1995static int
1996do_aio_write(aio_workq_entry *entryp)
1997{
1998 struct proc *p = entryp->procp;
1999 struct fileproc *fp;
2000 int error;
2001
2002 if ((error = fp_lookup(p, entryp->aiocb.aio_fildes, &fp, 0))) {
2003 return error;
2004 }
2005
2006 if (fp->fp_glob->fg_flag & FWRITE) {
2007 struct vfs_context context = {
2008 .vc_thread = entryp->thread, /* XXX */
2009 .vc_ucred = fp->fp_glob->fg_cred,
2010 };
2011 int flags = FOF_PCRED;
2012
2013 if ((fp->fp_glob->fg_flag & O_APPEND) == 0) {
2014 flags |= FOF_OFFSET;
2015 }
2016
2017 /* NB: tell dofilewrite the offset, and to use the proc cred */
2018 error = dofilewrite(&context,
2019 fp,
2020 entryp->aiocb.aio_buf,
2021 entryp->aiocb.aio_nbytes,
2022 entryp->aiocb.aio_offset,
2023 flags,
2024 &entryp->returnval);
2025 } else {
2026 error = EBADF;
2027 }
2028
2029 fp_drop(p, entryp->aiocb.aio_fildes, fp, 0);
2030 return error;
2031}
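
/*
 * Illustrative user-space sketch (under #if 0, never compiled): an aio_write()
 * against a descriptor opened with O_APPEND. Because do_aio_write() above
 * omits FOF_OFFSET in that case, the supplied aio_offset is not used and the
 * data is appended at end-of-file. The log path is hypothetical.
 */
#if 0
#include <aio.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int
example_append_write(const char *msg)
{
	struct aiocb cb;
	const struct aiocb *list[1] = { &cb };
	int fd = open("/tmp/example.log", O_WRONLY | O_APPEND | O_CREAT, 0644);

	if (fd < 0) {
		return -1;
	}

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_buf = (volatile void *)msg;      /* aio_buf is volatile void *, cast drops const */
	cb.aio_nbytes = strlen(msg);
	cb.aio_offset = 0;                      /* ignored because of O_APPEND */

	if (aio_write(&cb) != 0) {
		close(fd);
		return -1;
	}

	(void)aio_suspend(list, 1, NULL);       /* wait so cb stays valid until completion */
	ssize_t n = aio_return(&cb);
	close(fd);
	return n < 0 ? -1 : 0;
}
#endif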
2032
2033
2034/*
2035 * aio_has_active_requests_for_process - return whether the process has active
2036 * requests pending.
2037 */
2038static bool
2039aio_has_active_requests_for_process(proc_t procp)
2040{
2041 return !TAILQ_EMPTY(&procp->p_aio_activeq);
2042}
2043
2044/*
2045 * Called with the proc locked.
2046 */
2047static bool
2048aio_proc_has_active_requests_for_file(proc_t procp, int fd)
2049{
2050 aio_workq_entry *entryp;
2051
2052 TAILQ_FOREACH(entryp, &procp->p_aio_activeq, aio_proc_link) {
2053 if (entryp->aiocb.aio_fildes == fd) {
2054 return true;
2055 }
2056 }
2057
2058 return false;
2059}
2060
2061
2062/*
2063 * do_aio_fsync
2064 */
2065static int
2066do_aio_fsync(aio_workq_entry *entryp)
2067{
2068 struct proc *p = entryp->procp;
2069 struct vnode *vp;
2070 struct fileproc *fp;
2071 int sync_flag;
2072 int error;
2073
2074 /*
2075 * We are never called unless either AIO_FSYNC or AIO_DSYNC is set.
2076 *
2077 * If AIO_DSYNC is set, we can tell the lower layers that it is OK
2078 * to mark for update the metadata not strictly necessary for data
2079 * retrieval, rather than forcing it to disk.
2080 *
2081 * If AIO_FSYNC is set, we also have to wait until metadata not strictly
2082 * necessary for data retrieval has been committed to stable storage (e.g.
2083 * atime, mtime, ctime, etc.).
2084 *
2085 * Metadata necessary for data retrieval must be committed to stable
2086 * storage in either case (file length, etc.).
2087 */
2088 if (entryp->flags & AIO_FSYNC) {
2089 sync_flag = MNT_WAIT;
2090 } else {
2091 sync_flag = MNT_DWAIT;
2092 }
2093
2094 error = fp_get_ftype(p, entryp->aiocb.aio_fildes, DTYPE_VNODE, ENOTSUP, &fp);
2095 if (error != 0) {
2096 entryp->returnval = -1;
2097 return error;
2098 }
2099 vp = fp->fp_glob->fg_data;
2100
2101 if ((error = vnode_getwithref(vp)) == 0) {
2102 struct vfs_context context = {
2103 .vc_thread = entryp->thread, /* XXX */
2104 .vc_ucred = fp->fp_glob->fg_cred,
2105 };
2106
2107 error = VNOP_FSYNC(vp, sync_flag, &context);
2108
2109 (void)vnode_put(vp);
2110 } else {
2111 entryp->returnval = -1;
2112 }
2113
2114 fp_drop(p, entryp->aiocb.aio_fildes, fp, 0);
2115 return error;
2116}
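
/*
 * Illustrative user-space sketch (under #if 0, never compiled): the op
 * argument of aio_fsync() chooses between the two cases handled in
 * do_aio_fsync() above - O_SYNC for a full fsync (AIO_FSYNC / MNT_WAIT) and,
 * where available, O_DSYNC for data plus only the metadata needed to
 * retrieve it (AIO_DSYNC / MNT_DWAIT). The mapping from op to those flags is
 * done by the syscall entry point, which is not part of this excerpt.
 */
#if 0
#include <aio.h>
#include <fcntl.h>
#include <string.h>

int
example_aio_fsync(int fd, int data_only)
{
	struct aiocb cb;

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;

	/*
	 * Returns 0 once the sync has been queued, not when it has completed;
	 * completion is observed with aio_error() / aio_return() as usual.
	 */
	return aio_fsync(data_only ? O_DSYNC : O_SYNC, &cb);
}
#endif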
2117
2118
2119/*
2120 * is_already_queued - runs through our queues to see if the given
2121 * aiocbp / process is there. Returns TRUE if there is a match
2122 * on any of our aio queues.
2123 *
2124 * Called with proc aio lock held (can be held spin)
2125 */
2126static boolean_t
2127is_already_queued(proc_t procp, user_addr_t aiocbp)
2128{
2129 aio_workq_entry *entryp;
2130 boolean_t result;
2131
2132 result = FALSE;
2133
2134 /* look for matches on our queue of async IO requests that have completed */
2135 TAILQ_FOREACH(entryp, &procp->p_aio_doneq, aio_proc_link) {
2136 if (aiocbp == entryp->uaiocbp) {
2137 result = TRUE;
2138 goto ExitThisRoutine;
2139 }
2140 }
2141
2142 /* look for matches on our queue of active async IO requests */
2143 TAILQ_FOREACH(entryp, &procp->p_aio_activeq, aio_proc_link) {
2144 if (aiocbp == entryp->uaiocbp) {
2145 result = TRUE;
2146 goto ExitThisRoutine;
2147 }
2148 }
2149
2150ExitThisRoutine:
2151 return result;
2152}
2153
2154
2155/*
2156 * aio initialization
2157 */
2158__private_extern__ void
2159aio_init(void)
2160{
2161 for (int i = 0; i < AIO_NUM_WORK_QUEUES; i++) {
2162 aio_workq_init(&aio_anchor.aio_async_workqs[i]);
2163 }
2164
2165 _aio_create_worker_threads(aio_worker_threads);
2166}
2167
2168
2169/*
2170 * aio worker threads created here.
2171 */
2172__private_extern__ void
2173_aio_create_worker_threads(int num)
2174{
2175 int i;
2176
2177 /* create some worker threads to handle the async IO requests */
2178 for (i = 0; i < num; i++) {
2179 thread_t myThread;
2180
2181 if (KERN_SUCCESS != kernel_thread_start(aio_work_thread, NULL, &myThread)) {
2182 printf("%s - failed to create a work thread \n", __FUNCTION__);
2183 } else {
2184 thread_deallocate(myThread);
2185 }
2186 }
2187}
2188
2189/*
2190 * Return the current activation utask
2191 */
2192task_t
2193get_aiotask(void)
2194{
2195 return ((struct uthread *)get_bsdthread_info(current_thread()))->uu_aio_task;
2196}
2197
2198
2199/*
2200 * In the case of an aiocb from a
2201 * 32-bit process we need to expand some longs and pointers to the correct
2202 * sizes in order to let downstream code always work on the same type of
2203 * aiocb (in our case that is a user_aiocb)
2204 */
2205static void
2206do_munge_aiocb_user32_to_user(struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp)
2207{
2208 the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2209 the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2210 the_user_aiocbp->aio_buf = CAST_USER_ADDR_T(my_aiocbp->aio_buf);
2211 the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2212 the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2213 the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
2214
2215 /* special case here. since we do not know if sigev_value is an */
2216 /* int or a ptr we do NOT cast the ptr to a user_addr_t. This */
2217 /* means if we send this info back to user space we need to remember */
2218 /* sigev_value was not expanded for the 32-bit case. */
2219 /* NOTE - this does NOT affect us since we don't support sigev_value */
2220 /* yet in the aio context. */
2221 //LP64
2222 the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2223 the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
2224 the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2225 my_aiocbp->aio_sigevent.sigev_value.sival_int;
2226 the_user_aiocbp->aio_sigevent.sigev_notify_function =
2227 CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function);
2228 the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2229 CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_attributes);
2230}
2231
2232/* Similar for 64-bit user process, so that we don't need to satisfy
2233 * the alignment constraints of the original user64_aiocb
2234 */
2235#if !__LP64__
2236__dead2
2237#endif
2238static void
2239do_munge_aiocb_user64_to_user(struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp)
2240{
2241#if __LP64__
2242 the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2243 the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2244 the_user_aiocbp->aio_buf = my_aiocbp->aio_buf;
2245 the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2246 the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2247 the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
2248
2249 the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2250 the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
2251 the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2252 my_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int;
2253 the_user_aiocbp->aio_sigevent.sigev_notify_function =
2254 my_aiocbp->aio_sigevent.sigev_notify_function;
2255 the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2256 my_aiocbp->aio_sigevent.sigev_notify_attributes;
2257#else
2258#pragma unused(my_aiocbp, the_user_aiocbp)
2259 panic("64bit process on 32bit kernel is not supported");
2260#endif
2261}