/*
 * Copyright (c) 2003-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * todo:
 * 1) ramesh is looking into how to replace taking a reference on
 *    the user's map (vm_map_reference()) since it is believed that
 *    would not hold the process for us.
 * 2) david is looking into a way for us to set the priority of the
 *    worker threads to match that of the user's thread when the
 *    async IO was queued.
 */

/*
 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
 */

#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file_internal.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/vnode_internal.h>
#include <sys/malloc.h>
#include <sys/mount_internal.h>
#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/sysctl.h>
#include <sys/unistd.h>
#include <sys/user.h>

#include <sys/aio_kern.h>
#include <sys/sysproto.h>

#include <machine/limits.h>

#include <mach/mach_types.h>
#include <kern/kern_types.h>
#include <kern/waitq.h>
#include <kern/zalloc.h>
#include <kern/task.h>
#include <kern/sched_prim.h>

#include <vm/vm_map.h>

#include <os/refcnt.h>

#include <sys/kdebug.h>
#define AIO_work_queued                 1
#define AIO_worker_wake                 2
#define AIO_completion_sig              3
#define AIO_completion_cleanup_wait     4
#define AIO_completion_cleanup_wake     5
#define AIO_completion_suspend_wake     6
#define AIO_fsync_delay                 7
#define AIO_cancel                      10
#define AIO_cancel_async_workq          11
#define AIO_cancel_sync_workq           12
#define AIO_cancel_activeq              13
#define AIO_cancel_doneq                14
#define AIO_fsync                       20
#define AIO_read                        30
#define AIO_write                       40
#define AIO_listio                      50
#define AIO_error                       60
#define AIO_error_val                   61
#define AIO_error_activeq               62
#define AIO_error_workq                 63
#define AIO_return                      70
#define AIO_return_val                  71
#define AIO_return_activeq              72
#define AIO_return_workq                73
#define AIO_exec                        80
#define AIO_exit                        90
#define AIO_exit_sleep                  91
#define AIO_close                       100
#define AIO_close_sleep                 101
#define AIO_suspend                     110
#define AIO_suspend_sleep               111
#define AIO_worker_thread               120

__options_decl(aio_entry_flags_t, uint32_t, {
    AIO_READ        = 0x00000001, /* a read */
    AIO_WRITE       = 0x00000002, /* a write */
    AIO_FSYNC       = 0x00000004, /* aio_fsync with op = O_SYNC */
    AIO_DSYNC       = 0x00000008, /* aio_fsync with op = O_DSYNC (not supported yet) */
    AIO_LIO         = 0x00000010, /* lio_listio generated IO */
    AIO_LIO_WAIT    = 0x00000020, /* lio_listio is waiting on the leader */

    /*
     * These flags mean that this entry is blocking either:
     * - close (AIO_CLOSE_WAIT)
     * - exit or exec (AIO_EXIT_WAIT)
     *
     * These flags are mutually exclusive, and the AIO_EXIT_WAIT variant
     * will also neuter notifications in do_aio_completion_and_unlock().
     */
    AIO_CLOSE_WAIT  = 0x00004000,
    AIO_EXIT_WAIT   = 0x00008000,
});

/*! @struct aio_workq_entry
 *
 * @discussion
 * This represents a piece of aio/lio work.
 *
 * The ownership rules go as follows:
 *
 * - the "proc" owns one refcount on the entry (from creation), while it is
 *   enqueued on the aio_activeq and then the aio_doneq.
 *
 *   Either aio_return() (the user read the status) or _aio_exit() (the
 *   process died) will dequeue the entry and consume this ref.
 *
 * - the async workqueue owns one refcount once the work is submitted,
 *   which is consumed in do_aio_completion_and_unlock().
 *
 *   This ref protects the entry through the end of
 *   do_aio_completion_and_unlock() (when signal delivery happens).
 *
 * - lio_listio() for batches picks one of the entries to be the "leader"
 *   of the batch. Each work item will have a refcount on its leader
 *   so that the accounting of the batch completion can be done on the leader
 *   (to be able to decrement lio_pending).
 *
 *   This ref is consumed in do_aio_completion_and_unlock() as well.
 *
 * - lastly, in lio_listio() when the LIO_WAIT behavior is requested,
 *   an extra ref is taken in this syscall as it needs to keep accessing
 *   the leader "lio_pending" field until it hits 0.
 */
struct aio_workq_entry {
    /* queue lock */
    TAILQ_ENTRY(aio_workq_entry) aio_workq_link;

    /* Proc lock */
    TAILQ_ENTRY(aio_workq_entry) aio_proc_link; /* p_aio_activeq or p_aio_doneq */
    user_ssize_t      returnval;     /* return value from read / write request */
    errno_t           errorval;      /* error value from read / write request */
    os_refcnt_t       aio_refcount;
    aio_entry_flags_t flags;

    int               lio_pending;   /* pending I/Os in lio group, only on leader */
    struct aio_workq_entry *lio_leader; /* pointer to the lio leader, can be self */

    /* Initialized and never changed, safe to access */
    struct proc      *procp;         /* user proc that queued this request */
    user_addr_t       uaiocbp;       /* pointer passed in from user land */
    struct user_aiocb aiocb;         /* copy of aiocb from user land */
    thread_t          thread;        /* thread that queued this request */

    /* Initialized, and possibly freed by aio_work_thread() or at free if cancelled */
    vm_map_t          aio_map;       /* user land map we have a reference to */
};
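
/*
 * Illustrative sketch only (not part of the original sources): the refcount
 * pairing described above, expressed with the helpers defined later in this
 * file. Roughly:
 *
 *      entryp = aio_create_queue_entry(p, aiocbp, flags);  // creation ref, owned by the proc
 *      aio_entry_ref(entryp);                              // workq ref, taken in aio_try_enqueue_work_locked()
 *      ...
 *      aio_entry_unref(entryp);  // workq ref dropped at the end of do_aio_completion_and_unlock()
 *      aio_entry_unref(entryp);  // proc ref dropped by aio_return() or _aio_exit()
 *
 * The lio leader ref and the extra LIO_WAIT ref follow the same
 * retain/release pattern around the leader's lio_pending accounting.
 */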

/*
 * aio requests queue up on the aio_async_workq or lio_sync_workq (for
 * lio_listio LIO_WAIT). Requests then move to the per process aio_activeq
 * (proc.aio_activeq) when one of our worker threads starts the IO.
 * And finally, requests move to the per process aio_doneq (proc.aio_doneq)
 * when the IO request completes. The request remains on aio_doneq until
 * the user process calls aio_return or the process exits; either way that is
 * our trigger to release aio resources.
 */
typedef struct aio_workq {
    TAILQ_HEAD(, aio_workq_entry) aioq_entries;
    lck_spin_t      aioq_lock;
    struct waitq    aioq_waitq;
} *aio_workq_t;

#define AIO_NUM_WORK_QUEUES 1
struct aio_anchor_cb {
    os_atomic(int)   aio_total_count;  /* total extant entries */

    /* Hash table of queues here */
    int              aio_num_workqs;
    struct aio_workq aio_async_workqs[AIO_NUM_WORK_QUEUES];
};
typedef struct aio_anchor_cb aio_anchor_cb;


/*
 * Notes on aio sleep / wake channels.
 * We currently pick a couple of fields within the proc structure that give us
 * sleep channels that do not collide with any other kernel routines.
 * At this time, for binary compatibility reasons, we cannot create new proc fields.
 */
#define AIO_SUSPEND_SLEEP_CHAN  p_aio_activeq
#define AIO_CLEANUP_SLEEP_CHAN  p_aio_total_count

#define ASSERT_AIO_FROM_PROC(aiop, theproc)     \
    if ((aiop)->procp != (theproc)) {           \
        panic("AIO on a proc list that does not belong to that proc.\n"); \
    }

/*
 * LOCAL PROTOTYPES
 */
static void             aio_proc_lock(proc_t procp);
static void             aio_proc_lock_spin(proc_t procp);
static void             aio_proc_unlock(proc_t procp);
static lck_mtx_t       *aio_proc_mutex(proc_t procp);
static bool             aio_has_active_requests_for_process(proc_t procp);
static bool             aio_proc_has_active_requests_for_file(proc_t procp, int fd);
static boolean_t        is_already_queued(proc_t procp, user_addr_t aiocbp);

static aio_workq_t      aio_entry_workq(aio_workq_entry *entryp);
static void             aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
static void             aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
static void             aio_entry_ref(aio_workq_entry *entryp);
static void             aio_entry_unref(aio_workq_entry *entryp);
static bool             aio_entry_try_workq_remove(aio_workq_entry *entryp);
static boolean_t        aio_delay_fsync_request(aio_workq_entry *entryp);
static void             aio_free_request(aio_workq_entry *entryp);

static void             aio_workq_init(aio_workq_t wq);
static void             aio_workq_lock_spin(aio_workq_t wq);
static void             aio_workq_unlock(aio_workq_t wq);
static lck_spin_t      *aio_workq_lock(aio_workq_t wq);

static void             aio_work_thread(void *arg, wait_result_t wr);
static aio_workq_entry *aio_get_some_work(void);

static int              aio_queue_async_request(proc_t procp, user_addr_t aiocbp, aio_entry_flags_t);
static int              aio_validate(proc_t, aio_workq_entry *entryp);

static int              do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp, aio_entry_flags_t);
static void             do_aio_completion_and_unlock(proc_t p, aio_workq_entry *entryp);
static int              do_aio_fsync(aio_workq_entry *entryp);
static int              do_aio_read(aio_workq_entry *entryp);
static int              do_aio_write(aio_workq_entry *entryp);
static void             do_munge_aiocb_user32_to_user(struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp);
static void             do_munge_aiocb_user64_to_user(struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp);
static aio_workq_entry *aio_create_queue_entry(proc_t procp, user_addr_t aiocbp, aio_entry_flags_t);
static int              aio_copy_in_list(proc_t, user_addr_t, user_addr_t *, int);

#define ASSERT_AIO_PROC_LOCK_OWNED(p)   LCK_MTX_ASSERT(aio_proc_mutex(p), LCK_MTX_ASSERT_OWNED)
#define ASSERT_AIO_WORKQ_LOCK_OWNED(q)  LCK_SPIN_ASSERT(aio_workq_lock(q), LCK_ASSERT_OWNED)

/*
 * EXTERNAL PROTOTYPES
 */

/* in ...bsd/kern/sys_generic.c */
extern int dofileread(vfs_context_t ctx, struct fileproc *fp,
    user_addr_t bufp, user_size_t nbyte,
    off_t offset, int flags, user_ssize_t *retval);
extern int dofilewrite(vfs_context_t ctx, struct fileproc *fp,
    user_addr_t bufp, user_size_t nbyte, off_t offset,
    int flags, user_ssize_t *retval);

/*
 * aio external global variables.
 */
extern int aio_max_requests;             /* AIO_MAX - configurable */
extern int aio_max_requests_per_process; /* AIO_PROCESS_MAX - configurable */
extern int aio_worker_threads;           /* AIO_THREAD_COUNT - configurable */


/*
 * aio static variables.
 */
static aio_anchor_cb aio_anchor = {
    .aio_num_workqs = AIO_NUM_WORK_QUEUES,
};
os_refgrp_decl(static, aio_refgrp, "aio", NULL);
static LCK_GRP_DECLARE(aio_proc_lock_grp, "aio_proc");
static LCK_GRP_DECLARE(aio_queue_lock_grp, "aio_queue");
static LCK_MTX_DECLARE(aio_proc_mtx, &aio_proc_lock_grp);

static ZONE_DECLARE(aio_workq_zonep, "aiowq", sizeof(aio_workq_entry),
    ZC_ZFREE_CLEARMEM);

/* Hash */
static aio_workq_t
aio_entry_workq(__unused aio_workq_entry *entryp)
{
    return &aio_anchor.aio_async_workqs[0];
}
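
/*
 * Illustrative sketch only (not in the original sources): should
 * AIO_NUM_WORK_QUEUES ever grow past 1, this hash would presumably key off
 * something stable in the entry, for example:
 *
 *      return &aio_anchor.aio_async_workqs[
 *          os_hash_kernel_pointer(entryp->procp) % AIO_NUM_WORK_QUEUES];
 *
 * The keying choice and hash function here are assumptions; today every
 * entry funnels into queue 0.
 */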

static void
aio_workq_init(aio_workq_t wq)
{
    TAILQ_INIT(&wq->aioq_entries);
    lck_spin_init(&wq->aioq_lock, &aio_queue_lock_grp, LCK_ATTR_NULL);
    waitq_init(&wq->aioq_waitq, SYNC_POLICY_FIFO);
}


/*
 * Can be passed a queue which is locked spin.
 */
static void
aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
{
    ASSERT_AIO_WORKQ_LOCK_OWNED(queue);

    if (entryp->aio_workq_link.tqe_prev == NULL) {
        panic("Trying to remove an entry from a work queue, but it is not on a queue\n");
    }

    TAILQ_REMOVE(&queue->aioq_entries, entryp, aio_workq_link);
    entryp->aio_workq_link.tqe_prev = NULL; /* Not on a workq */
}

static void
aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
{
    ASSERT_AIO_WORKQ_LOCK_OWNED(queue);

    TAILQ_INSERT_TAIL(&queue->aioq_entries, entryp, aio_workq_link);
}

static void
aio_proc_lock(proc_t procp)
{
    lck_mtx_lock(aio_proc_mutex(procp));
}

static void
aio_proc_lock_spin(proc_t procp)
{
    lck_mtx_lock_spin(aio_proc_mutex(procp));
}

static bool
aio_has_any_work(void)
{
    return os_atomic_load(&aio_anchor.aio_total_count, relaxed) != 0;
}

static bool
aio_try_proc_insert_active_locked(proc_t procp, aio_workq_entry *entryp)
{
    int old, new;

    ASSERT_AIO_PROC_LOCK_OWNED(procp);

    if (procp->p_aio_total_count >= aio_max_requests_per_process) {
        return false;
    }

    if (is_already_queued(procp, entryp->uaiocbp)) {
        return false;
    }

    os_atomic_rmw_loop(&aio_anchor.aio_total_count, old, new, relaxed, {
        if (old >= aio_max_requests) {
            os_atomic_rmw_loop_give_up(return false);
        }
        new = old + 1;
    });

    TAILQ_INSERT_TAIL(&procp->p_aio_activeq, entryp, aio_proc_link);
    procp->p_aio_total_count++;
    return true;
}

static void
aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp)
{
    TAILQ_REMOVE(&procp->p_aio_activeq, entryp, aio_proc_link);
    TAILQ_INSERT_TAIL(&procp->p_aio_doneq, entryp, aio_proc_link);
}

static void
aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp)
{
    TAILQ_REMOVE(&procp->p_aio_doneq, entryp, aio_proc_link);
    entryp->aio_proc_link.tqe_prev = NULL;
    if (os_atomic_dec_orig(&aio_anchor.aio_total_count, relaxed) <= 0) {
        panic("Negative total AIO count!\n");
    }
    if (procp->p_aio_total_count-- <= 0) {
        panic("proc %p: p_aio_total_count accounting mismatch", procp);
    }
}

static void
aio_proc_unlock(proc_t procp)
{
    lck_mtx_unlock(aio_proc_mutex(procp));
}

static lck_mtx_t*
aio_proc_mutex(proc_t procp)
{
    return &procp->p_mlock;
}

static void
aio_entry_ref(aio_workq_entry *entryp)
{
    os_ref_retain(&entryp->aio_refcount);
}

static void
aio_entry_unref(aio_workq_entry *entryp)
{
    if (os_ref_release(&entryp->aio_refcount) == 0) {
        aio_free_request(entryp);
    }
}

static bool
aio_entry_try_workq_remove(aio_workq_entry *entryp)
{
    /* Can only be cancelled if it's still on a work queue */
    if (entryp->aio_workq_link.tqe_prev != NULL) {
        aio_workq_t queue;

        /* Will have to check again under the lock */
        queue = aio_entry_workq(entryp);
        aio_workq_lock_spin(queue);
        if (entryp->aio_workq_link.tqe_prev != NULL) {
            aio_workq_remove_entry_locked(queue, entryp);
            aio_workq_unlock(queue);
            return true;
        } else {
            aio_workq_unlock(queue);
        }
    }

    return false;
}

static void
aio_workq_lock_spin(aio_workq_t wq)
{
    lck_spin_lock(aio_workq_lock(wq));
}

static void
aio_workq_unlock(aio_workq_t wq)
{
    lck_spin_unlock(aio_workq_lock(wq));
}

static lck_spin_t*
aio_workq_lock(aio_workq_t wq)
{
    return &wq->aioq_lock;
}

/*
 * aio_cancel - attempt to cancel one or more async IO requests currently
 * outstanding against file descriptor uap->fd. If uap->aiocbp is not
 * NULL then only one specific IO is cancelled (if possible). If uap->aiocbp
 * is NULL then all outstanding async IO requests for the given file
 * descriptor are cancelled (if possible).
 */
int
aio_cancel(proc_t p, struct aio_cancel_args *uap, int *retval)
{
    struct user_aiocb my_aiocb;
    int               result;

    KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel) | DBG_FUNC_START,
        VM_KERNEL_ADDRPERM(p), uap->aiocbp, 0, 0, 0);

    /* quick check to see if there are any async IO requests queued up */
    if (!aio_has_any_work()) {
        result = 0;
        *retval = AIO_ALLDONE;
        goto ExitRoutine;
    }

    *retval = -1;
    if (uap->aiocbp != USER_ADDR_NULL) {
        if (proc_is64bit(p)) {
            struct user64_aiocb aiocb64;

            result = copyin(uap->aiocbp, &aiocb64, sizeof(aiocb64));
            if (result == 0) {
                do_munge_aiocb_user64_to_user(&aiocb64, &my_aiocb);
            }
        } else {
            struct user32_aiocb aiocb32;

            result = copyin(uap->aiocbp, &aiocb32, sizeof(aiocb32));
            if (result == 0) {
                do_munge_aiocb_user32_to_user(&aiocb32, &my_aiocb);
            }
        }

        if (result != 0) {
            result = EAGAIN;
            goto ExitRoutine;
        }

        /* NOTE - POSIX standard says a mismatch between the file */
        /* descriptor passed in and the file descriptor embedded in */
        /* the aiocb causes unspecified results. We return EBADF in */
        /* that situation. */
        if (uap->fd != my_aiocb.aio_fildes) {
            result = EBADF;
            goto ExitRoutine;
        }
    }

    aio_proc_lock(p);
    result = do_aio_cancel_locked(p, uap->fd, uap->aiocbp, 0);
    ASSERT_AIO_PROC_LOCK_OWNED(p);
    aio_proc_unlock(p);

    if (result != -1) {
        *retval = result;
        result = 0;
        goto ExitRoutine;
    }

    result = EBADF;

ExitRoutine:
    KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel) | DBG_FUNC_END,
        VM_KERNEL_ADDRPERM(p), uap->aiocbp, result, 0, 0);

    return result;
}

/*
 * _aio_close - internal function used to clean up async IO requests for
 * a file descriptor that is closing.
 * THIS MAY BLOCK.
 */
__private_extern__ void
_aio_close(proc_t p, int fd)
{
    int error;

    /* quick check to see if there are any async IO requests queued up */
    if (!aio_has_any_work()) {
        return;
    }

    KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_close) | DBG_FUNC_START,
        VM_KERNEL_ADDRPERM(p), fd, 0, 0, 0);

    /* cancel all async IO requests on our todo queues for this file descriptor */
    aio_proc_lock(p);
    error = do_aio_cancel_locked(p, fd, USER_ADDR_NULL, AIO_CLOSE_WAIT);
    ASSERT_AIO_PROC_LOCK_OWNED(p);
    if (error == AIO_NOTCANCELED) {
        /*
         * AIO_NOTCANCELED is returned when we find an aio request for this process
         * and file descriptor on the active async IO queue. Active requests cannot
         * be cancelled so we must wait for them to complete. We will get a special
         * wake up call on our channel used to sleep for ALL active requests to
         * complete. This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
         * when we must wait for all active aio requests.
         */

        KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep) | DBG_FUNC_NONE,
            VM_KERNEL_ADDRPERM(p), fd, 0, 0, 0);

        while (aio_proc_has_active_requests_for_file(p, fd)) {
            msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_close", 0);
        }
    }

    aio_proc_unlock(p);

    KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_close) | DBG_FUNC_END,
        VM_KERNEL_ADDRPERM(p), fd, 0, 0, 0);
}

/*
 * aio_error - return the error status associated with the async IO
 * request referred to by uap->aiocbp. The error status is the errno
 * value that would be set by the corresponding IO request (read, write,
 * fdatasync, or sync).
 */
int
aio_error(proc_t p, struct aio_error_args *uap, int *retval)
{
    aio_workq_entry *entryp;
    int              error;

    KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_error) | DBG_FUNC_START,
        VM_KERNEL_ADDRPERM(p), uap->aiocbp, 0, 0, 0);

    /* see if there are any aios to check */
    if (!aio_has_any_work()) {
        return EINVAL;
    }

    aio_proc_lock(p);

    /* look for a match on our queue of async IO requests that have completed */
    TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
        if (entryp->uaiocbp == uap->aiocbp) {
            ASSERT_AIO_FROM_PROC(entryp, p);

            *retval = entryp->errorval;
            error = 0;

            KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val) | DBG_FUNC_NONE,
                VM_KERNEL_ADDRPERM(p), uap->aiocbp, *retval, 0, 0);
            goto ExitRoutine;
        }
    }

    /* look for a match on our queue of active async IO requests */
    TAILQ_FOREACH(entryp, &p->p_aio_activeq, aio_proc_link) {
        if (entryp->uaiocbp == uap->aiocbp) {
            ASSERT_AIO_FROM_PROC(entryp, p);
            *retval = EINPROGRESS;
            error = 0;
            KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq) | DBG_FUNC_NONE,
                VM_KERNEL_ADDRPERM(p), uap->aiocbp, *retval, 0, 0);
            goto ExitRoutine;
        }
    }

    error = EINVAL;

ExitRoutine:
    KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_error) | DBG_FUNC_END,
        VM_KERNEL_ADDRPERM(p), uap->aiocbp, error, 0, 0);
    aio_proc_unlock(p);

    return error;
}

/*
 * aio_fsync - asynchronously force all IO operations associated
 * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
 * queued at the time of the call to the synchronized completion state.
 * NOTE - we do not support op O_DSYNC at this point since we do not support the
 * fdatasync() call.
 */
int
aio_fsync(proc_t p, struct aio_fsync_args *uap, int *retval)
{
    aio_entry_flags_t fsync_kind;
    int               error;

    KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync) | DBG_FUNC_START,
        VM_KERNEL_ADDRPERM(p), uap->aiocbp, uap->op, 0, 0);

    *retval = 0;
    /* 0 := O_SYNC for binary backward compatibility with Panther */
    if (uap->op == O_SYNC || uap->op == 0) {
        fsync_kind = AIO_FSYNC;
    } else if (uap->op == O_DSYNC) {
        fsync_kind = AIO_DSYNC;
    } else {
        *retval = -1;
        error = EINVAL;
        goto ExitRoutine;
    }

    error = aio_queue_async_request(p, uap->aiocbp, fsync_kind);
    if (error != 0) {
        *retval = -1;
    }

ExitRoutine:
    KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync) | DBG_FUNC_END,
        VM_KERNEL_ADDRPERM(p), uap->aiocbp, error, 0, 0);

    return error;
}

/* aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
 * file descriptor (uap->aiocbp->aio_fildes) into the buffer
 * (uap->aiocbp->aio_buf).
 */
int
aio_read(proc_t p, struct aio_read_args *uap, int *retval)
{
    int error;

    KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_read) | DBG_FUNC_START,
        VM_KERNEL_ADDRPERM(p), uap->aiocbp, 0, 0, 0);

    *retval = 0;

    error = aio_queue_async_request(p, uap->aiocbp, AIO_READ);
    if (error != 0) {
        *retval = -1;
    }

    KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_read) | DBG_FUNC_END,
        VM_KERNEL_ADDRPERM(p), uap->aiocbp, error, 0, 0);

    return error;
}

/*
 * aio_return - return the return status associated with the async IO
 * request referred to by uap->aiocbp. The return status is the value
 * that would be returned by the corresponding IO request (read, write,
 * fdatasync, or sync). This is where we release kernel resources
 * held for the async IO call associated with the given aiocb pointer.
 */
int
aio_return(proc_t p, struct aio_return_args *uap, user_ssize_t *retval)
{
    aio_workq_entry *entryp;
    int              error = EINVAL;

    KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_return) | DBG_FUNC_START,
        VM_KERNEL_ADDRPERM(p), uap->aiocbp, 0, 0, 0);

    /* See if there are any entries to check */
    if (!aio_has_any_work()) {
        goto ExitRoutine;
    }

    aio_proc_lock(p);
    *retval = 0;

    /* look for a match on our queue of async IO requests that have completed */
    TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
        ASSERT_AIO_FROM_PROC(entryp, p);
        if (entryp->uaiocbp == uap->aiocbp) {
            /* Done and valid for aio_return(), pull it off the list */
            aio_proc_remove_done_locked(p, entryp);

            *retval = entryp->returnval;
            error = 0;
            aio_proc_unlock(p);

            aio_entry_unref(entryp);

            KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val) | DBG_FUNC_NONE,
                VM_KERNEL_ADDRPERM(p), uap->aiocbp, *retval, 0, 0);
            goto ExitRoutine;
        }
    }

    /* look for a match on our queue of active async IO requests */
    TAILQ_FOREACH(entryp, &p->p_aio_activeq, aio_proc_link) {
        ASSERT_AIO_FROM_PROC(entryp, p);
        if (entryp->uaiocbp == uap->aiocbp) {
            error = EINPROGRESS;
            KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq) | DBG_FUNC_NONE,
                VM_KERNEL_ADDRPERM(p), uap->aiocbp, *retval, 0, 0);
            break;
        }
    }

    aio_proc_unlock(p);

ExitRoutine:
    KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_return) | DBG_FUNC_END,
        VM_KERNEL_ADDRPERM(p), uap->aiocbp, error, 0, 0);

    return error;
}

/*
 * _aio_exec - internal function used to clean up async IO requests for
 * a process that is going away due to exec(). We cancel any async IOs
 * we can and wait for those already active. We also disable signaling
 * for cancelled or active aio requests that complete.
 * This routine MAY block!
 */
__private_extern__ void
_aio_exec(proc_t p)
{
    KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_exec) | DBG_FUNC_START,
        VM_KERNEL_ADDRPERM(p), 0, 0, 0, 0);

    _aio_exit(p);

    KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_exec) | DBG_FUNC_END,
        VM_KERNEL_ADDRPERM(p), 0, 0, 0, 0);
}

/*
 * _aio_exit - internal function used to clean up async IO requests for
 * a process that is terminating (via exit() or exec()). We cancel any async IOs
 * we can and wait for those already active. We also disable signaling
 * for cancelled or active aio requests that complete. This routine MAY block!
 */
__private_extern__ void
_aio_exit(proc_t p)
{
    TAILQ_HEAD(, aio_workq_entry) tofree = TAILQ_HEAD_INITIALIZER(tofree);
    aio_workq_entry *entryp, *tmp;
    int error;

    /* quick check to see if there are any async IO requests queued up */
    if (!aio_has_any_work()) {
        return;
    }

    KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_exit) | DBG_FUNC_START,
        VM_KERNEL_ADDRPERM(p), 0, 0, 0, 0);

    aio_proc_lock(p);

    /*
     * cancel async IO requests on the todo work queue and wait for those
     * already active to complete.
     */
    error = do_aio_cancel_locked(p, -1, USER_ADDR_NULL, AIO_EXIT_WAIT);
    ASSERT_AIO_PROC_LOCK_OWNED(p);
    if (error == AIO_NOTCANCELED) {
        /*
         * AIO_NOTCANCELED is returned when we find an aio request for this process
         * on the active async IO queue. Active requests cannot be cancelled so we
         * must wait for them to complete. We will get a special wake up call on
         * our channel used to sleep for ALL active requests to complete. This sleep
         * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
         * active aio requests.
         */

        KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep) | DBG_FUNC_NONE,
            VM_KERNEL_ADDRPERM(p), 0, 0, 0, 0);

        while (aio_has_active_requests_for_process(p)) {
            msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_exit", 0);
        }
    }

    assert(!aio_has_active_requests_for_process(p));

    /* release all aio resources used by this process */
    TAILQ_FOREACH_SAFE(entryp, &p->p_aio_doneq, aio_proc_link, tmp) {
        ASSERT_AIO_FROM_PROC(entryp, p);

        aio_proc_remove_done_locked(p, entryp);
        TAILQ_INSERT_TAIL(&tofree, entryp, aio_proc_link);
    }

    aio_proc_unlock(p);

    /* free all the entries outside of the aio_proc_lock() */
    TAILQ_FOREACH_SAFE(entryp, &tofree, aio_proc_link, tmp) {
        entryp->aio_proc_link.tqe_prev = NULL;
        aio_entry_unref(entryp);
    }

    KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_exit) | DBG_FUNC_END,
        VM_KERNEL_ADDRPERM(p), 0, 0, 0, 0);
}

static bool
should_cancel(aio_workq_entry *entryp, int fd, user_addr_t aiocbp,
    aio_entry_flags_t reason)
{
    if (reason & AIO_EXIT_WAIT) {
        /* caller is _aio_exit() */
        return true;
    }
    if (fd != entryp->aiocb.aio_fildes) {
        /* not the file we're looking for */
        return false;
    }
    /*
     * aio_cancel() or _aio_close() cancel
     * everything for a given fd when aiocbp is NULL
     */
    return aiocbp == USER_ADDR_NULL || entryp->uaiocbp == aiocbp;
}

/*
 * do_aio_cancel_locked - cancel async IO requests (if possible). We get called by
 * aio_cancel, close, and at exit.
 * There are three modes of operation: 1) cancel all async IOs for a process -
 * fd is 0 and aiocbp is NULL 2) cancel all async IOs for file descriptor - fd
 * is > 0 and aiocbp is NULL 3) cancel one async IO associated with the given
 * aiocbp.
 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
 * target async IO requests, and AIO_ALLDONE if all target async IO requests
 * were already complete.
 * WARNING - do not dereference aiocbp in this routine, it may point to user
 * land data that has not been copied in (when called from aio_cancel())
 *
 * Called with proc locked, and returns the same way.
 */
static int
do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp,
    aio_entry_flags_t reason)
{
    bool multiple_matches = (aiocbp == USER_ADDR_NULL);
    aio_workq_entry *entryp, *tmp;
    int result;

    ASSERT_AIO_PROC_LOCK_OWNED(p);

    /* look for a match on our queue of async todo work. */
again:
    result = -1;
    TAILQ_FOREACH_SAFE(entryp, &p->p_aio_activeq, aio_proc_link, tmp) {
        ASSERT_AIO_FROM_PROC(entryp, p);

        if (!should_cancel(entryp, fd, aiocbp, reason)) {
            continue;
        }

        if (reason) {
            /* mark the entry as blocking close or exit/exec */
            entryp->flags |= reason;
            if ((entryp->flags & AIO_EXIT_WAIT) && (entryp->flags & AIO_CLOSE_WAIT)) {
                panic("Close and exit flags set at the same time\n");
            }
        }

        /* Can only be cancelled if it's still on a work queue */
        if (aio_entry_try_workq_remove(entryp)) {
            entryp->errorval = ECANCELED;
            entryp->returnval = -1;

            /* Now it's officially cancelled. Do the completion */
            KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq) | DBG_FUNC_NONE,
                VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
                fd, 0, 0);
            do_aio_completion_and_unlock(p, entryp);

            aio_proc_lock(p);

            if (multiple_matches) {
                /*
                 * Restart from the head of the proc active queue since it
                 * may have been changed while we were away doing completion
                 * processing.
                 *
                 * Note that if we found an uncancellable AIO before, we will
                 * either find it again or discover that it's been completed,
                 * so resetting the result will not cause us to return success
                 * despite outstanding AIOs.
                 */
                goto again;
            }

            return AIO_CANCELED;
        }

        /*
         * It's been taken off the active queue already, i.e. is in flight.
         * All we can do is ask for notification.
         */
        KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq) | DBG_FUNC_NONE,
            VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
            fd, 0, 0);

        result = AIO_NOTCANCELED;
        if (!multiple_matches) {
            return result;
        }
    }

    /*
     * if we didn't find any matches on the todo or active queues then look for a
     * match on our queue of async IO requests that have completed and if found
     * return AIO_ALLDONE result.
     *
     * Proc AIO lock is still held.
     */
    if (result == -1) {
        TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
            ASSERT_AIO_FROM_PROC(entryp, p);
            if (should_cancel(entryp, fd, aiocbp, reason)) {
                KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq) | DBG_FUNC_NONE,
                    VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
                    fd, 0, 0);

                result = AIO_ALLDONE;
                if (!multiple_matches) {
                    return result;
                }
            }
        }
    }

    return result;
}

/*
 * aio_suspend - suspend the calling thread until at least one of the async
 * IO operations referenced by uap->aiocblist has completed, until a signal
 * interrupts the function, or uap->timeoutp time interval (optional) has
 * passed.
 * Returns 0 if one or more async IOs have completed else -1 and errno is
 * set appropriately - EAGAIN if timeout elapses or EINTR if an interrupt
 * woke us up.
 */
int
aio_suspend(proc_t p, struct aio_suspend_args *uap, int *retval)
{
    __pthread_testcancel(1);
    return aio_suspend_nocancel(p, (struct aio_suspend_nocancel_args *)uap, retval);
}

int
aio_suspend_nocancel(proc_t p, struct aio_suspend_nocancel_args *uap, int *retval)
{
    int error;
    int i;
    uint64_t abstime;
    struct user_timespec ts;
    aio_workq_entry *entryp;
    user_addr_t *aiocbpp;
    size_t aiocbpp_size;

    KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend) | DBG_FUNC_START,
        VM_KERNEL_ADDRPERM(p), uap->nent, 0, 0, 0);

    *retval = -1;
    abstime = 0;
    aiocbpp = NULL;

    if (!aio_has_any_work()) {
        error = EINVAL;
        goto ExitThisRoutine;
    }

    if (uap->nent < 1 || uap->nent > aio_max_requests_per_process ||
        os_mul_overflow(sizeof(user_addr_t), uap->nent, &aiocbpp_size)) {
        error = EINVAL;
        goto ExitThisRoutine;
    }

    if (uap->timeoutp != USER_ADDR_NULL) {
        if (proc_is64bit(p)) {
            struct user64_timespec temp;
            error = copyin(uap->timeoutp, &temp, sizeof(temp));
            if (error == 0) {
                ts.tv_sec = (user_time_t)temp.tv_sec;
                ts.tv_nsec = (user_long_t)temp.tv_nsec;
            }
        } else {
            struct user32_timespec temp;
            error = copyin(uap->timeoutp, &temp, sizeof(temp));
            if (error == 0) {
                ts.tv_sec = temp.tv_sec;
                ts.tv_nsec = temp.tv_nsec;
            }
        }
        if (error != 0) {
            error = EAGAIN;
            goto ExitThisRoutine;
        }

        if (ts.tv_sec < 0 || ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000) {
            error = EINVAL;
            goto ExitThisRoutine;
        }

        nanoseconds_to_absolutetime((uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
            &abstime);
        clock_absolutetime_interval_to_deadline(abstime, &abstime);
    }

    aiocbpp = kheap_alloc(KHEAP_TEMP, aiocbpp_size, Z_WAITOK);
    if (aiocbpp == NULL || aio_copy_in_list(p, uap->aiocblist, aiocbpp, uap->nent)) {
        error = EAGAIN;
        goto ExitThisRoutine;
    }

    /* check list of aio requests to see if any have completed */
check_for_our_aiocbp:
    aio_proc_lock_spin(p);
    for (i = 0; i < uap->nent; i++) {
        user_addr_t aiocbp;

        /* NULL elements are legal so check for 'em */
        aiocbp = *(aiocbpp + i);
        if (aiocbp == USER_ADDR_NULL) {
            continue;
        }

        /* return immediately if any aio request in the list is done */
        TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
            ASSERT_AIO_FROM_PROC(entryp, p);
            if (entryp->uaiocbp == aiocbp) {
                aio_proc_unlock(p);
                *retval = 0;
                error = 0;
                goto ExitThisRoutine;
            }
        }
    }

    KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep) | DBG_FUNC_NONE,
        VM_KERNEL_ADDRPERM(p), uap->nent, 0, 0, 0);

    /*
     * wait for an async IO to complete or a signal fires or timeout expires.
     * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
     * interrupts us. If an async IO completes before a signal fires or our
     * timeout expires, we get a wakeup call from aio_work_thread().
     */

    error = msleep1(&p->AIO_SUSPEND_SLEEP_CHAN, aio_proc_mutex(p),
        PCATCH | PWAIT | PDROP, "aio_suspend", abstime);
    if (error == 0) {
        /*
         * got our wakeup call from aio_work_thread().
         * Since we can get a wakeup on this channel from another thread in the
         * same process we head back up to make sure this is for the correct aiocbp.
         * If it is the correct aiocbp we will return from where we do the check
         * (see entryp->uaiocbp == aiocbp after check_for_our_aiocbp label)
         * else we will fall out and just sleep again.
         */
        goto check_for_our_aiocbp;
    } else if (error == EWOULDBLOCK) {
        /* our timeout expired */
        error = EAGAIN;
    } else {
        /* we were interrupted */
        error = EINTR;
    }

ExitThisRoutine:
    if (aiocbpp != NULL) {
        kheap_free(KHEAP_TEMP, aiocbpp, aiocbpp_size);
    }

    KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend) | DBG_FUNC_END,
        VM_KERNEL_ADDRPERM(p), uap->nent, error, 0, 0);

    return error;
}

/* aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
 * file descriptor (uap->aiocbp->aio_fildes) from the buffer
 * (uap->aiocbp->aio_buf).
 */

int
aio_write(proc_t p, struct aio_write_args *uap, int *retval __unused)
{
    int error;

    KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_write) | DBG_FUNC_START,
        VM_KERNEL_ADDRPERM(p), uap->aiocbp, 0, 0, 0);

    error = aio_queue_async_request(p, uap->aiocbp, AIO_WRITE);

    KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_write) | DBG_FUNC_END,
        VM_KERNEL_ADDRPERM(p), uap->aiocbp, error, 0, 0);

    return error;
}

static int
aio_copy_in_list(proc_t procp, user_addr_t aiocblist, user_addr_t *aiocbpp,
    int nent)
{
    int result;

    /* copyin our aiocb pointers from list */
    result = copyin(aiocblist, aiocbpp,
        proc_is64bit(procp) ? (nent * sizeof(user64_addr_t))
        : (nent * sizeof(user32_addr_t)));
    if (result) {
        return result;
    }

    /*
     * We depend on a list of user_addr_t's so we need to
     * munge and expand when these pointers came from a
     * 32-bit process
     */
    if (!proc_is64bit(procp)) {
        /* copy from last to first to deal with overlap */
        user32_addr_t *my_ptrp = ((user32_addr_t *)aiocbpp) + (nent - 1);
        user_addr_t *my_addrp = aiocbpp + (nent - 1);

        for (int i = 0; i < nent; i++, my_ptrp--, my_addrp--) {
            *my_addrp = (user_addr_t) (*my_ptrp);
        }
    }

    return 0;
}

static int
aio_copy_in_sigev(proc_t procp, user_addr_t sigp, struct user_sigevent *sigev)
{
    int result = 0;

    if (sigp == USER_ADDR_NULL) {
        goto out;
    }

    /*
     * We need to munge aio_sigevent since it contains pointers.
     * Since we do not know if sigev_value is an int or a ptr we do
     * NOT cast the ptr to a user_addr_t. This means if we send
     * this info back to user space we need to remember sigev_value
     * was not expanded for the 32-bit case.
     *
     * Notes: This does NOT affect us since we don't support
     * sigev_value yet in the aio context.
     */
    if (proc_is64bit(procp)) {
#if __LP64__
        struct user64_sigevent sigevent64;

        result = copyin(sigp, &sigevent64, sizeof(sigevent64));
        if (result == 0) {
            sigev->sigev_notify = sigevent64.sigev_notify;
            sigev->sigev_signo = sigevent64.sigev_signo;
            sigev->sigev_value.size_equivalent.sival_int = sigevent64.sigev_value.size_equivalent.sival_int;
            sigev->sigev_notify_function = sigevent64.sigev_notify_function;
            sigev->sigev_notify_attributes = sigevent64.sigev_notify_attributes;
        }
#else
        panic("64bit process on 32bit kernel is not supported");
#endif
    } else {
        struct user32_sigevent sigevent32;

        result = copyin(sigp, &sigevent32, sizeof(sigevent32));
        if (result == 0) {
            sigev->sigev_notify = sigevent32.sigev_notify;
            sigev->sigev_signo = sigevent32.sigev_signo;
            sigev->sigev_value.size_equivalent.sival_int = sigevent32.sigev_value.sival_int;
            sigev->sigev_notify_function = CAST_USER_ADDR_T(sigevent32.sigev_notify_function);
            sigev->sigev_notify_attributes = CAST_USER_ADDR_T(sigevent32.sigev_notify_attributes);
        }
    }

    if (result != 0) {
        result = EAGAIN;
    }

out:
    return result;
}

/*
 * validate user_sigevent. at this point we only support
 * sigev_notify equal to SIGEV_SIGNAL or SIGEV_NONE. this means
 * sigev_value, sigev_notify_function, and sigev_notify_attributes
 * are ignored, since SIGEV_THREAD is unsupported. This is consistent
 * with no [RTS] (Real Time Signal) option group support.
 */
static int
aio_sigev_validate(const struct user_sigevent *sigev)
{
    switch (sigev->sigev_notify) {
    case SIGEV_SIGNAL:
    {
        int signum;

        /* make sure we have a valid signal number */
        signum = sigev->sigev_signo;
        if (signum <= 0 || signum >= NSIG ||
            signum == SIGKILL || signum == SIGSTOP) {
            return EINVAL;
        }
    }
    break;

    case SIGEV_NONE:
        break;

    case SIGEV_THREAD:
    /* Unsupported [RTS] */

    default:
        return EINVAL;
    }

    return 0;
}
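
/*
 * Illustrative sketch only (not part of the original sources): a user-space
 * sigevent that passes this validation would look roughly like the
 * following (SIGUSR1 is just an example signal; SIGEV_THREAD is rejected
 * with EINVAL by the switch above):
 *
 *      struct aiocb cb = { 0 };
 *      cb.aio_fildes = fd;
 *      cb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
 *      cb.aio_sigevent.sigev_signo  = SIGUSR1;
 *      aio_write(&cb);
 */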

/*
 * aio_try_enqueue_work_locked
 *
 * Queue up the entry on the aio asynchronous work queue in priority order
 * based on the relative priority of the request. We calculate the relative
 * priority using the nice value of the caller and the value
 *
 * Parameters:  procp   Process queueing the I/O
 *              entryp  The work queue entry being queued
 *              leader  The work leader if any
 *
 * Returns:     Whether the enqueue was successful
 *
 * Notes:       This function is used for both lio_listio and aio
 *
 *      XXX:    At some point, we may have to consider thread priority
 *              rather than process priority, but we don't maintain the
 *              adjusted priority for threads the POSIX way.
 *
 * Called with proc locked.
 */
static bool
aio_try_enqueue_work_locked(proc_t procp, aio_workq_entry *entryp,
    aio_workq_entry *leader)
{
    aio_workq_t queue = aio_entry_workq(entryp);

    ASSERT_AIO_PROC_LOCK_OWNED(procp);

    /* Onto proc queue */
    if (!aio_try_proc_insert_active_locked(procp, entryp)) {
        return false;
    }

    if (leader) {
        aio_entry_ref(leader); /* consumed in do_aio_completion_and_unlock */
        leader->lio_pending++;
        entryp->lio_leader = leader;
    }

    /* And work queue */
    aio_entry_ref(entryp); /* consumed in do_aio_completion_and_unlock */
    aio_workq_lock_spin(queue);
    aio_workq_add_entry_locked(queue, entryp);
    waitq_wakeup64_one(&queue->aioq_waitq, CAST_EVENT64_T(queue),
        THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
    aio_workq_unlock(queue);

    KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued) | DBG_FUNC_START,
        VM_KERNEL_ADDRPERM(procp), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
        entryp->flags, entryp->aiocb.aio_fildes, 0);
    KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued) | DBG_FUNC_END,
        entryp->aiocb.aio_offset, 0, entryp->aiocb.aio_nbytes, 0, 0);
    return true;
}

/*
 * lio_listio - initiate a list of IO requests. We process the list of
 * aiocbs either synchronously (mode == LIO_WAIT) or asynchronously
 * (mode == LIO_NOWAIT).
 *
 * The caller gets error and return status for each aiocb in the list
 * via aio_error and aio_return. We must keep completed requests until
 * released by the aio_return call.
 */
int
lio_listio(proc_t p, struct lio_listio_args *uap, int *retval __unused)
{
    aio_workq_entry *entries[AIO_LISTIO_MAX] = { };
    user_addr_t aiocbpp[AIO_LISTIO_MAX];
    struct user_sigevent aiosigev = { };
    int result = 0;
    int lio_count = 0;

    KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_listio) | DBG_FUNC_START,
        VM_KERNEL_ADDRPERM(p), uap->nent, uap->mode, 0, 0);

    if (!(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT)) {
        result = EINVAL;
        goto ExitRoutine;
    }

    if (uap->nent < 1 || uap->nent > AIO_LISTIO_MAX) {
        result = EINVAL;
        goto ExitRoutine;
    }

    /*
     * Use sigevent passed in to lio_listio for each of our calls, but
     * only do completion notification after the last request completes.
     */
    if (uap->sigp != USER_ADDR_NULL) {
        result = aio_copy_in_sigev(p, uap->sigp, &aiosigev);
        if (result) {
            goto ExitRoutine;
        }
        result = aio_sigev_validate(&aiosigev);
        if (result) {
            goto ExitRoutine;
        }
    }

    if (aio_copy_in_list(p, uap->aiocblist, aiocbpp, uap->nent)) {
        result = EAGAIN;
        goto ExitRoutine;
    }

    /*
     * allocate/parse all entries
     */
    for (int i = 0; i < uap->nent; i++) {
        aio_workq_entry *entryp;

        /* NULL elements are legal so check for 'em */
        if (aiocbpp[i] == USER_ADDR_NULL) {
            continue;
        }

        entryp = aio_create_queue_entry(p, aiocbpp[i], AIO_LIO);
        if (entryp == NULL) {
            result = EAGAIN;
            goto ExitRoutine;
        }

        /*
         * This refcount is cleaned up on exit if the entry
         * isn't submitted
         */
        entries[lio_count++] = entryp;
        if (uap->mode == LIO_NOWAIT) {
            /* Set signal handler, if any */
            entryp->aiocb.aio_sigevent = aiosigev;
        }
    }

    if (lio_count == 0) {
        /* There's nothing to submit */
        goto ExitRoutine;
    }

    /*
     * Past this point we're committed and will not bail out
     *
     * - keep a reference on the leader for LIO_WAIT
     * - perform the submissions and optionally wait
     */

    aio_workq_entry *leader = entries[0];
    if (uap->mode == LIO_WAIT) {
        aio_entry_ref(leader); /* consumed below */
    }

    aio_proc_lock_spin(p);

    for (int i = 0; i < lio_count; i++) {
        if (aio_try_enqueue_work_locked(p, entries[i], leader)) {
            entries[i] = NULL; /* the entry was submitted */
        } else {
            result = EAGAIN;
        }
    }

    if (uap->mode == LIO_WAIT && result == 0) {
        leader->flags |= AIO_LIO_WAIT;

        while (leader->lio_pending) {
            /* If we were interrupted, fail out (even if all finished) */
            if (msleep(leader, aio_proc_mutex(p),
                PCATCH | PRIBIO | PSPIN, "lio_listio", 0) != 0) {
                result = EINTR;
                break;
            }
        }

        leader->flags &= ~AIO_LIO_WAIT;
    }

    aio_proc_unlock(p);

    if (uap->mode == LIO_WAIT) {
        aio_entry_unref(leader);
    }

ExitRoutine:
    /* Consume unsubmitted entries */
    for (int i = 0; i < lio_count; i++) {
        if (entries[i]) {
            aio_entry_unref(entries[i]);
        }
    }

    KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_listio) | DBG_FUNC_END,
        VM_KERNEL_ADDRPERM(p), result, 0, 0, 0);

    return result;
}
55e303ae
A
1503
1504
1505/*
1506 * aio worker thread. this is where all the real work gets done.
0a7de745 1507 * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
55e303ae
A
1508 * after new work is queued up.
1509 */
39037602 1510__attribute__((noreturn))
55e303ae 1511static void
f427ee49 1512aio_work_thread(void *arg __unused, wait_result_t wr __unused)
55e303ae 1513{
eb6b6ca3
A
1514 aio_workq_entry *entryp;
1515 int error;
1516 vm_map_t currentmap;
1517 vm_map_t oldmap = VM_MAP_NULL;
1518 task_t oldaiotask = TASK_NULL;
0a7de745 1519 struct uthread *uthreadp = NULL;
f427ee49 1520 proc_t p = NULL;
0a7de745
A
1521
1522 for (;;) {
1523 /*
b0d623f7 1524 * returns with the entry ref'ed.
0a7de745 1525 * sleeps until work is available.
b0d623f7 1526 */
0a7de745 1527 entryp = aio_get_some_work();
f427ee49 1528 p = entryp->procp;
b0d623f7 1529
eb6b6ca3
A
1530 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread) | DBG_FUNC_START,
1531 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1532 entryp->flags, 0, 0);
b0d623f7
A
1533
1534 /*
1535 * Assume the target's address space identity for the duration
1536 * of the IO. Note: don't need to have the entryp locked,
1537 * because the proc and map don't change until it's freed.
1538 */
f427ee49 1539 currentmap = get_task_map((current_proc())->task);
0a7de745 1540 if (currentmap != entryp->aio_map) {
b0d623f7
A
1541 uthreadp = (struct uthread *) get_bsdthread_info(current_thread());
1542 oldaiotask = uthreadp->uu_aio_task;
f427ee49
A
1543 /*
1544 * workq entries at this stage cause _aio_exec() and _aio_exit() to
1545 * block until we hit `do_aio_completion_and_unlock()` below,
1546 * which means that it is safe to dereference p->task without
1547 * holding a lock or taking references.
1548 */
1549 uthreadp->uu_aio_task = p->task;
1550 oldmap = vm_map_switch(entryp->aio_map);
b0d623f7
A
1551 }
1552
0a7de745 1553 if ((entryp->flags & AIO_READ) != 0) {
f427ee49 1554 error = do_aio_read(entryp);
0a7de745 1555 } else if ((entryp->flags & AIO_WRITE) != 0) {
f427ee49 1556 error = do_aio_write(entryp);
0a7de745 1557 } else if ((entryp->flags & (AIO_FSYNC | AIO_DSYNC)) != 0) {
f427ee49 1558 error = do_aio_fsync(entryp);
0a7de745 1559 } else {
b0d623f7
A
1560 error = EINVAL;
1561 }
91447636 1562
b0d623f7 1563 /* Restore old map */
0a7de745 1564 if (currentmap != entryp->aio_map) {
f427ee49 1565 vm_map_switch(oldmap);
b0d623f7
A
1566 uthreadp->uu_aio_task = oldaiotask;
1567 }
55e303ae 1568
f427ee49
A
1569 /* liberate unused map */
1570 vm_map_deallocate(entryp->aio_map);
1571 entryp->aio_map = VM_MAP_NULL;
1572
eb6b6ca3
A
1573 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread) | DBG_FUNC_END,
1574 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1575 entryp->errorval, entryp->returnval, 0);
0a7de745 1576
b0d623f7
A
1577 /* we're done with the IO request so pop it off the active queue and */
1578 /* push it on the done queue */
f427ee49
A
1579 aio_proc_lock(p);
1580 entryp->errorval = error;
1581 do_aio_completion_and_unlock(p, entryp);
1582 }
1583}
55e303ae
A
1584
1585
1586/*
1587 * aio_get_some_work - get the next async IO request that is ready to be executed.
1588 * aio_fsync complicates matters a bit since we cannot do the fsync until all async
1589 * IO requests queued at the time the aio_fsync call came in have completed.
91447636 1590 * NOTE - the caller holds no locks; this routine takes the work queue lock itself.
55e303ae 1591 */
55e303ae 1592static aio_workq_entry *
f427ee49 1593aio_get_some_work(void)
55e303ae 1594{
eb6b6ca3
A
1595 aio_workq_entry *entryp = NULL;
1596 aio_workq_t queue = NULL;
b0d623f7
A
1597
1598 /* Just one queue for the moment. In the future there will be many. */
0a7de745 1599 queue = &aio_anchor.aio_async_workqs[0];
b0d623f7 1600 aio_workq_lock_spin(queue);
b0d623f7 1601
0a7de745 1602 /*
b0d623f7
A
1603 * Hold the queue lock.
1604 *
1605 * Pop some work off the work queue and add it to our active queue.
0a7de745 1606 * Always start with the queue lock held.
b0d623f7 1607 */
f427ee49 1608 while ((entryp = TAILQ_FIRST(&queue->aioq_entries))) {
0a7de745 1609 /*
b0d623f7
A
1610 * Pull off of the work queue. Once it's off, it can't be cancelled,
1611 * so we can take our ref once we drop the queue lock.
1612 */
b0d623f7
A
1613
1614 aio_workq_remove_entry_locked(queue, entryp);
0a7de745 1615
b0d623f7
A
1616 aio_workq_unlock(queue);
1617
0a7de745 1618 /*
b0d623f7
A
1619 * Check if it's an fsync that must be delayed. No need to lock the entry;
1620 * that flag would have been set at initialization.
1621 */
0a7de745
A
1622 if ((entryp->flags & AIO_FSYNC) != 0) {
1623 /*
b0d623f7
A
1624 * Check for unfinished operations on the same file
1625 * in this proc's queue.
1626 */
1627 aio_proc_lock_spin(entryp->procp);
f427ee49 1628 if (aio_delay_fsync_request(entryp)) {
b0d623f7 1629 /* It needs to be delayed. Put it back on the end of the work queue */
eb6b6ca3
A
1630 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay) | DBG_FUNC_NONE,
1631 VM_KERNEL_ADDRPERM(entryp->procp), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1632 0, 0, 0);
b0d623f7
A
1633
1634 aio_proc_unlock(entryp->procp);
1635
1636 aio_workq_lock_spin(queue);
1637 aio_workq_add_entry_locked(queue, entryp);
55e303ae 1638 continue;
0a7de745 1639 }
b0d623f7 1640 aio_proc_unlock(entryp->procp);
55e303ae 1641 }
0a7de745 1642
f427ee49 1643 return entryp;
55e303ae 1644 }
b0d623f7 1645
b0d623f7 1646 /* We will wake up when someone enqueues something */
3e170ce0 1647 waitq_assert_wait64(&queue->aioq_waitq, CAST_EVENT64_T(queue), THREAD_UNINT, 0);
b0d623f7 1648 aio_workq_unlock(queue);
f427ee49 1649 thread_block(aio_work_thread);
b0d623f7 1650
f427ee49 1651 __builtin_unreachable();
b0d623f7 1652}
55e303ae
A
1653
1654/*
b0d623f7
A
1655 * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed.
1656 * A big, simple hammer: only send it off if it's the oldest request still on the
1657 * process's active queue, i.e. every IO filed before it has completed.
55e303ae
A
1658 */
1659static boolean_t
f427ee49 1660aio_delay_fsync_request(aio_workq_entry *entryp)
55e303ae 1661{
eb6b6ca3
A
1662 if (proc_in_teardown(entryp->procp)) {
1663 /*
1664 * we can't delay fsyncs when in teardown as it will confuse _aio_exit;
1665 * if it was dequeued, then we must now commit to it.
1666 */
1667 return FALSE;
1668 }
1669
b0d623f7
A
1670 if (entryp == TAILQ_FIRST(&entryp->procp->p_aio_activeq)) {
1671 return FALSE;
55e303ae 1672 }
0a7de745 1673
b0d623f7 1674 return TRUE;
f427ee49 1675}
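/*
 * Illustrative ordering example (sketch, userspace view): if a process issues
 *
 *	aio_write(&cb_write);			// still on p_aio_activeq
 *	aio_fsync(O_SYNC, &cb_fsync);
 *
 * the fsync entry is requeued by aio_get_some_work() until cb_write has
 * completed (i.e. until the fsync reaches the head of the active queue),
 * except when the process is in teardown, in which case it runs immediately.
 */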
55e303ae 1676
b0d623f7 1677static aio_workq_entry *
f427ee49 1678aio_create_queue_entry(proc_t procp, user_addr_t aiocbp, aio_entry_flags_t flags)
55e303ae 1679{
0a7de745 1680 aio_workq_entry *entryp;
91447636 1681
f427ee49 1682 entryp = zalloc_flags(aio_workq_zonep, Z_WAITOK | Z_ZERO);
55e303ae
A
1683 entryp->procp = procp;
1684 entryp->uaiocbp = aiocbp;
f427ee49
A
1685 entryp->flags = flags;
1686 /* consumed in aio_return or _aio_exit */
1687 os_ref_init(&entryp->aio_refcount, &aio_refgrp);
91447636 1688
0a7de745 1689 if (proc_is64bit(procp)) {
b0d623f7 1690 struct user64_aiocb aiocb64;
0a7de745 1691
f427ee49
A
1692 if (copyin(aiocbp, &aiocb64, sizeof(aiocb64)) != 0) {
1693 goto error_exit;
0a7de745 1694 }
f427ee49 1695 do_munge_aiocb_user64_to_user(&aiocb64, &entryp->aiocb);
b0d623f7
A
1696 } else {
1697 struct user32_aiocb aiocb32;
0a7de745 1698
f427ee49
A
1699 if (copyin(aiocbp, &aiocb32, sizeof(aiocb32)) != 0) {
1700 goto error_exit;
0a7de745 1701 }
f427ee49 1702 do_munge_aiocb_user32_to_user(&aiocb32, &entryp->aiocb);
55e303ae
A
1703 }
1704
f427ee49
A
1705 /* do some more validation on the aiocb and embedded file descriptor */
1706 if (aio_validate(procp, entryp) != 0) {
55e303ae 1707 goto error_exit;
b0d623f7 1708 }
55e303ae
A
1709
1710 /* get a reference to the user land map in order to keep it around */
f427ee49
A
1711 entryp->aio_map = get_task_map(procp->task);
1712 vm_map_reference(entryp->aio_map);
39236c6e
A
1713
1714 /* get a reference on the current_thread, which is passed in vfs_context. */
1715 entryp->thread = current_thread();
f427ee49 1716 thread_reference(entryp->thread);
0a7de745 1717 return entryp;
b0d623f7 1718
55e303ae 1719error_exit:
f427ee49
A
1720 zfree(aio_workq_zonep, entryp);
1721 return NULL;
b0d623f7 1722}
55e303ae
A
1723
1724
1725/*
b0d623f7
A
1726 * aio_queue_async_request - queue up an async IO request on our work queue then
1727 * wake up one of our worker threads to do the actual work. We get a reference
1728 * to our caller's user land map in order to keep it around while we are
0a7de745 1729 * processing the request.
55e303ae 1730 */
b0d623f7 1731static int
f427ee49
A
1732aio_queue_async_request(proc_t procp, user_addr_t aiocbp,
1733 aio_entry_flags_t flags)
55e303ae 1734{
0a7de745
A
1735 aio_workq_entry *entryp;
1736 int result;
b0d623f7 1737
f427ee49 1738 entryp = aio_create_queue_entry(procp, aiocbp, flags);
0a7de745 1739 if (entryp == NULL) {
b0d623f7
A
1740 result = EAGAIN;
1741 goto error_noalloc;
55e303ae 1742 }
55e303ae 1743
b0d623f7 1744 aio_proc_lock_spin(procp);
f427ee49 1745 if (!aio_try_enqueue_work_locked(procp, entryp, NULL)) {
0a7de745 1746 result = EAGAIN;
55e303ae
A
1747 goto error_exit;
1748 }
b0d623f7 1749 aio_proc_unlock(procp);
0a7de745
A
1750 return 0;
1751
b0d623f7
A
1752error_exit:
1753 /*
1754 * This entry has not been queued up so no worries about
1755 * unlocked state and aio_map
1756 */
1757 aio_proc_unlock(procp);
1758 aio_free_request(entryp);
b0d623f7 1759error_noalloc:
0a7de745 1760 return result;
f427ee49 1761}
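/*
 * Illustrative sketch (userspace, not part of this file): a single async read
 * that reaches this queueing path via aio_read(2). The file path, buffer size,
 * and polling style are hypothetical examples.
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	static char buf[4096];
 *
 *	int fd = open("datafile", O_RDONLY);	// hypothetical path
 *	struct aiocb cb = {
 *		.aio_fildes = fd,
 *		.aio_buf = buf,
 *		.aio_nbytes = sizeof(buf),
 *		.aio_offset = 0,
 *	};
 *
 *	if (aio_read(&cb) == -1) {
 *		// EAGAIN here corresponds to the entry/queue limits above
 *	}
 *	while (aio_error(&cb) == EINPROGRESS) {
 *		usleep(1000);
 *	}
 *	ssize_t nread = aio_return(&cb);	// reaps the completed entry
 */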
55e303ae
A
1762
1763
1764/*
1765 * aio_free_request - remove our reference on the user land map and
b0d623f7
A
1766 * free the work queue entry resources. The entry is off all lists
1767 * and has zero refcount, so no one can have a pointer to it.
55e303ae 1768 */
f427ee49 1769static void
b0d623f7 1770aio_free_request(aio_workq_entry *entryp)
55e303ae 1771{
f427ee49
A
1772 if (entryp->aio_proc_link.tqe_prev || entryp->aio_workq_link.tqe_prev) {
1773 panic("aio_workq_entry %p being freed while still enqueued", entryp);
1774 }
1775
55e303ae 1776 /* remove our reference to the user land map. */
0a7de745 1777 if (VM_MAP_NULL != entryp->aio_map) {
b0d623f7 1778 vm_map_deallocate(entryp->aio_map);
55e303ae 1779 }
b0d623f7 1780
39236c6e 1781 /* remove our reference to thread which enqueued the request */
0a7de745 1782 if (NULL != entryp->thread) {
f427ee49 1783 thread_deallocate(entryp->thread);
39236c6e
A
1784 }
1785
f427ee49
A
1786 zfree(aio_workq_zonep, entryp);
1787}
55e303ae
A
1788
1789
b0d623f7
A
1790/*
1791 * aio_validate
1792 *
1793 * validate the aiocb passed in by one of the aio syscalls.
55e303ae 1794 */
55e303ae 1795static int
f427ee49 1796aio_validate(proc_t p, aio_workq_entry *entryp)
55e303ae 1797{
eb6b6ca3
A
1798 struct fileproc *fp;
1799 int flag;
1800 int result;
0a7de745 1801
55e303ae
A
1802 result = 0;
1803
0a7de745
A
1804 if ((entryp->flags & AIO_LIO) != 0) {
1805 if (entryp->aiocb.aio_lio_opcode == LIO_READ) {
55e303ae 1806 entryp->flags |= AIO_READ;
0a7de745 1807 } else if (entryp->aiocb.aio_lio_opcode == LIO_WRITE) {
55e303ae 1808 entryp->flags |= AIO_WRITE;
0a7de745
A
1809 } else if (entryp->aiocb.aio_lio_opcode == LIO_NOP) {
1810 return 0;
1811 } else {
1812 return EINVAL;
1813 }
55e303ae
A
1814 }
1815
1816 flag = FREAD;
0a7de745 1817 if ((entryp->flags & (AIO_WRITE | AIO_FSYNC | AIO_DSYNC)) != 0) {
55e303ae
A
1818 flag = FWRITE;
1819 }
1820
0a7de745
A
1821 if ((entryp->flags & (AIO_READ | AIO_WRITE)) != 0) {
1822 if (entryp->aiocb.aio_nbytes > INT_MAX ||
1823 entryp->aiocb.aio_buf == USER_ADDR_NULL ||
1824 entryp->aiocb.aio_offset < 0) {
1825 return EINVAL;
1826 }
55e303ae
A
1827 }
1828
cb323159
A
1829 result = aio_sigev_validate(&entryp->aiocb.aio_sigevent);
1830 if (result) {
1831 return result;
b0d623f7 1832 }
0a7de745 1833
55e303ae 1834 /* validate the file descriptor and that the file was opened
91447636 1835 * for the appropriate read / write access.
55e303ae 1836 */
f427ee49 1837 proc_fdlock(p);
91447636 1838
f427ee49
A
1839 fp = fp_get_noref_locked(p, entryp->aiocb.aio_fildes);
1840 if (fp == NULL) {
55e303ae 1841 result = EBADF;
f427ee49
A
1842 } else if ((fp->fp_glob->fg_flag & flag) == 0) {
1843 /* we don't have read or write access */
1844 result = EBADF;
1845 } else if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_VNODE) {
1846 /* this is not a file */
1847 result = ESPIPE;
1848 } else {
1849 fp->fp_flags |= FP_AIOISSUED;
55e303ae 1850 }
55e303ae 1851
f427ee49 1852 proc_fdunlock(p);
55e303ae 1853
0a7de745 1854 return result;
b0d623f7 1855}
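/*
 * Examples of how the checks above surface to userspace (sketch): an aiocb
 * whose aio_nbytes exceeds INT_MAX or whose aio_offset is negative fails with
 * EINVAL; a descriptor opened O_RDONLY used with aio_write() fails with EBADF;
 * a socket or pipe descriptor fails with ESPIPE, since only vnodes are allowed.
 */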
55e303ae 1856
55e303ae 1857/*
f427ee49 1858 * do_aio_completion_and_unlock. Handle async IO completion.
55e303ae 1859 */
55e303ae 1860static void
f427ee49 1861do_aio_completion_and_unlock(proc_t p, aio_workq_entry *entryp)
55e303ae 1862{
f427ee49
A
1863 aio_workq_entry *leader = entryp->lio_leader;
1864 int lio_pending = 0;
1865 bool do_signal = false;
0a7de745 1866
f427ee49 1867 ASSERT_AIO_PROC_LOCK_OWNED(p);
b0d623f7 1868
f427ee49 1869 aio_proc_move_done_locked(p, entryp);
0a7de745 1870
f427ee49
A
1871 if (leader) {
1872 lio_pending = --leader->lio_pending;
1873 if (lio_pending < 0) {
1874 panic("lio_pending accounting mistake");
0a7de745 1875 }
f427ee49
A
1876 if (lio_pending == 0 && (leader->flags & AIO_LIO_WAIT)) {
1877 wakeup(leader);
b0d623f7 1878 }
f427ee49 1879 entryp->lio_leader = NULL; /* no dangling pointers please */
b0d623f7 1880 }
0a7de745 1881
55e303ae 1882 /*
b0d623f7
A
1883 * need to handle case where a process is trying to exit, exec, or
1884 * close and is currently waiting for active aio requests to complete.
0a7de745
A
1885 * If AIO_EXIT_WAIT is set then we need to look to see if there are any
1886 * other requests in the active queue for this process. If there are
b0d623f7
A
1887 * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel.
1888 * If there are some still active then do nothing - we only want to
0a7de745 1889 * wakeup when all active aio requests for the process are complete.
55e303ae 1890 */
f427ee49 1891 if (__improbable(entryp->flags & AIO_EXIT_WAIT)) {
eb6b6ca3
A
1892 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait) | DBG_FUNC_NONE,
1893 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1894 0, 0, 0);
55e303ae 1895
f427ee49 1896 if (!aio_has_active_requests_for_process(p)) {
0a7de745 1897 /*
b0d623f7
A
1898 * no active aio requests for this process, continue exiting. In this
1899 * case, there should be no one else waiting on the proc in AIO...
1900 */
f427ee49 1901 wakeup_one((caddr_t)&p->AIO_CLEANUP_SLEEP_CHAN);
55e303ae 1902
eb6b6ca3
A
1903 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake) | DBG_FUNC_NONE,
1904 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1905 0, 0, 0);
55e303ae 1906 }
f427ee49
A
1907 } else if (entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
1908 /*
1909 * If this was the last request in the group, or not part of
1910 * a group, and that a signal is desired, send one.
1911 */
1912 do_signal = (lio_pending == 0);
55e303ae
A
1913 }
1914
f427ee49 1915 if (__improbable(entryp->flags & AIO_CLOSE_WAIT)) {
eb6b6ca3
A
1916 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait) | DBG_FUNC_NONE,
1917 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1918 0, 0, 0);
0a7de745 1919
f427ee49 1920 if (!aio_proc_has_active_requests_for_file(p, entryp->aiocb.aio_fildes)) {
b0d623f7 1921 /* Can't wakeup_one(); multiple closes might be in progress. */
f427ee49 1922 wakeup(&p->AIO_CLEANUP_SLEEP_CHAN);
b0d623f7 1923
eb6b6ca3
A
1924 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake) | DBG_FUNC_NONE,
1925 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1926 0, 0, 0);
b0d623f7
A
1927 }
1928 }
f427ee49
A
1929
1930 aio_proc_unlock(p);
1931
1932 if (do_signal) {
1933 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig) | DBG_FUNC_NONE,
1934 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1935 entryp->aiocb.aio_sigevent.sigev_signo, 0, 0);
1936
1937 psignal(p, entryp->aiocb.aio_sigevent.sigev_signo);
1938 }
1939
0a7de745 1940 /*
b0d623f7
A
1941 * A thread in aio_suspend() wants to know about completed IOs. If it checked
1942 * the done list before we moved our AIO there, then it already asserted its wait,
1943 * and we can wake it up without holding the lock. If it checked the list after
1944 * we did our move, then it already has seen the AIO that we moved. Hence, we
1945 * can do our wakeup without holding the lock.
55e303ae 1946 */
f427ee49 1947 wakeup(&p->AIO_SUSPEND_SLEEP_CHAN);
eb6b6ca3
A
1948 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake) | DBG_FUNC_NONE,
1949 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp), 0, 0, 0);
55e303ae 1950
f427ee49
A
1951 aio_entry_unref(entryp); /* see aio_try_enqueue_work_locked */
1952 if (leader) {
1953 aio_entry_unref(leader); /* see lio_listio */
0a7de745 1954 }
f427ee49 1955}
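/*
 * Illustrative sketch (userspace, not part of this file): a waiter whose
 * aio_suspend() sleep is ended by the AIO_SUSPEND_SLEEP_CHAN wakeup above,
 * followed by the reap that removes the entry from the done queue. Here cb
 * is an aiocb previously submitted with aio_read() or aio_write().
 *
 *	const struct aiocb *waitlist[1] = { &cb };
 *
 *	if (aio_suspend(waitlist, 1, NULL) == 0) {	// NULL: no timeout
 *		if (aio_error(&cb) != EINPROGRESS) {
 *			ssize_t result = aio_return(&cb);	// consumes the entry
 *		}
 *	}
 */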
55e303ae
A
1956
1957
1958/*
1959 * do_aio_read
1960 */
1961static int
f427ee49 1962do_aio_read(aio_workq_entry *entryp)
55e303ae 1963{
f427ee49
A
1964 struct proc *p = entryp->procp;
1965 struct fileproc *fp;
1966 int error;
55e303ae 1967
f427ee49 1968 if ((error = fp_lookup(p, entryp->aiocb.aio_fildes, &fp, 0))) {
0a7de745
A
1969 return error;
1970 }
2d21ac55 1971
f427ee49
A
1972 if (fp->fp_glob->fg_flag & FREAD) {
1973 struct vfs_context context = {
1974 .vc_thread = entryp->thread, /* XXX */
1975 .vc_ucred = fp->fp_glob->fg_cred,
1976 };
2d21ac55 1977
f427ee49
A
1978 error = dofileread(&context, fp,
1979 entryp->aiocb.aio_buf,
1980 entryp->aiocb.aio_nbytes,
1981 entryp->aiocb.aio_offset, FOF_OFFSET,
1982 &entryp->returnval);
1983 } else {
1984 error = EBADF;
1985 }
0a7de745 1986
f427ee49 1987 fp_drop(p, entryp->aiocb.aio_fildes, fp, 0);
0a7de745 1988 return error;
f427ee49 1989}
55e303ae
A
1990
1991
1992/*
1993 * do_aio_write
1994 */
1995static int
f427ee49 1996do_aio_write(aio_workq_entry *entryp)
55e303ae 1997{
f427ee49
A
1998 struct proc *p = entryp->procp;
1999 struct fileproc *fp;
2000 int error;
55e303ae 2001
f427ee49 2002 if ((error = fp_lookup(p, entryp->aiocb.aio_fildes, &fp, 0))) {
0a7de745
A
2003 return error;
2004 }
b0d623f7 2005
f427ee49
A
2006 if (fp->fp_glob->fg_flag & FWRITE) {
2007 struct vfs_context context = {
2008 .vc_thread = entryp->thread, /* XXX */
2009 .vc_ucred = fp->fp_glob->fg_cred,
2010 };
2011 int flags = FOF_PCRED;
2d21ac55 2012
f427ee49
A
2013 if ((fp->fp_glob->fg_flag & O_APPEND) == 0) {
2014 flags |= FOF_OFFSET;
2015 }
0a7de745 2016
f427ee49
A
2017 /* NB: tell dofilewrite the offset, and to use the proc cred */
2018 error = dofilewrite(&context,
2019 fp,
2020 entryp->aiocb.aio_buf,
2021 entryp->aiocb.aio_nbytes,
2022 entryp->aiocb.aio_offset,
2023 flags,
2024 &entryp->returnval);
0a7de745 2025 } else {
f427ee49 2026 error = EBADF;
0a7de745 2027 }
55e303ae 2028
f427ee49 2029 fp_drop(p, entryp->aiocb.aio_fildes, fp, 0);
0a7de745 2030 return error;
f427ee49 2031}
55e303ae
A
2032
2033
2034/*
f427ee49
A
2035 * aio_has_active_requests_for_process - return whether the process has active
2036 * requests pending.
55e303ae 2037 */
f427ee49
A
2038static bool
2039aio_has_active_requests_for_process(proc_t procp)
55e303ae 2040{
f427ee49
A
2041 return !TAILQ_EMPTY(&procp->p_aio_activeq);
2042}
b0d623f7
A
2043
2044/*
2045 * Called with the proc locked.
2046 */
f427ee49
A
2047static bool
2048aio_proc_has_active_requests_for_file(proc_t procp, int fd)
b0d623f7 2049{
b0d623f7 2050 aio_workq_entry *entryp;
f427ee49 2051
b0d623f7
A
2052 TAILQ_FOREACH(entryp, &procp->p_aio_activeq, aio_proc_link) {
2053 if (entryp->aiocb.aio_fildes == fd) {
f427ee49 2054 return true;
b0d623f7
A
2055 }
2056 }
55e303ae 2057
f427ee49
A
2058 return false;
2059}
55e303ae 2060
b0d623f7 2061
55e303ae
A
2062/*
2063 * do_aio_fsync
2064 */
2065static int
f427ee49 2066do_aio_fsync(aio_workq_entry *entryp)
55e303ae 2067{
f427ee49
A
2068 struct proc *p = entryp->procp;
2069 struct vnode *vp;
2070 struct fileproc *fp;
0a7de745
A
2071 int sync_flag;
2072 int error;
91447636 2073
b0d623f7
A
2074 /*
2075 * We are never called unless either AIO_FSYNC or AIO_DSYNC are set.
2076 *
2077 * If AIO_DSYNC is set, we can tell the lower layers that it is OK
2078 * to mark for update the metadata not strictly necessary for data
2079 * retrieval, rather than forcing it to disk.
2080 *
2081 * If AIO_FSYNC is set, we also have to wait until metadata not strictly
2082 * necessary for data retrieval has been committed to stable storage (e.g.
2083 * atime, mtime, ctime, etc.).
2084 *
2085 * Metadata necessary for data retrieval must be committed to stable
2086 * storage in either case (file length, etc.).
2087 */
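	/*
	 * Illustrative mapping (sketch): a userspace aio_fsync(O_SYNC, &cb) request
	 * arrives here with AIO_FSYNC set and uses MNT_WAIT; aio_fsync(O_DSYNC, &cb)
	 * arrives with AIO_DSYNC set and uses MNT_DWAIT.
	 */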
0a7de745 2088 if (entryp->flags & AIO_FSYNC) {
b0d623f7 2089 sync_flag = MNT_WAIT;
0a7de745 2090 } else {
b0d623f7 2091 sync_flag = MNT_DWAIT;
0a7de745
A
2092 }
2093
f427ee49
A
2094 error = fp_get_ftype(p, entryp->aiocb.aio_fildes, DTYPE_VNODE, ENOTSUP, &fp);
2095 if (error != 0) {
2096 entryp->returnval = -1;
2097 return error;
2098 }
2099 vp = fp->fp_glob->fg_data;
91447636 2100
f427ee49
A
2101 if ((error = vnode_getwithref(vp)) == 0) {
2102 struct vfs_context context = {
2103 .vc_thread = entryp->thread, /* XXX */
2104 .vc_ucred = fp->fp_glob->fg_cred,
2105 };
91447636 2106
f427ee49 2107 error = VNOP_FSYNC(vp, sync_flag, &context);
91447636 2108
f427ee49
A
2109 (void)vnode_put(vp);
2110 } else {
55e303ae 2111 entryp->returnval = -1;
0a7de745 2112 }
55e303ae 2113
f427ee49 2114 fp_drop(p, entryp->aiocb.aio_fildes, fp, 0);
0a7de745 2115 return error;
f427ee49 2116}
55e303ae
A
2117
2118
2119/*
0a7de745 2120 * is_already_queued - runs through our queues to see if the given
55e303ae
A
2121 * aiocbp / process is there. Returns TRUE if there is a match
2122 * on any of our aio queues.
b0d623f7
A
2123 *
2124 * Called with proc aio lock held (can be held spin)
55e303ae 2125 */
55e303ae 2126static boolean_t
eb6b6ca3 2127is_already_queued(proc_t procp, user_addr_t aiocbp)
55e303ae 2128{
eb6b6ca3
A
2129 aio_workq_entry *entryp;
2130 boolean_t result;
0a7de745 2131
55e303ae 2132 result = FALSE;
0a7de745 2133
55e303ae 2134 /* look for matches on our queue of async IO requests that have completed */
f427ee49 2135 TAILQ_FOREACH(entryp, &procp->p_aio_doneq, aio_proc_link) {
0a7de745 2136 if (aiocbp == entryp->uaiocbp) {
55e303ae
A
2137 result = TRUE;
2138 goto ExitThisRoutine;
2139 }
2140 }
0a7de745 2141
55e303ae 2142 /* look for matches on our queue of active async IO requests */
f427ee49 2143 TAILQ_FOREACH(entryp, &procp->p_aio_activeq, aio_proc_link) {
0a7de745 2144 if (aiocbp == entryp->uaiocbp) {
55e303ae
A
2145 result = TRUE;
2146 goto ExitThisRoutine;
2147 }
2148 }
0a7de745 2149
55e303ae 2150ExitThisRoutine:
0a7de745 2151 return result;
f427ee49 2152}
b0d623f7
A
2153
2154
55e303ae
A
2155/*
2156 * aio initialization
2157 */
2158__private_extern__ void
f427ee49 2159aio_init(void)
55e303ae 2160{
f427ee49 2161 for (int i = 0; i < AIO_NUM_WORK_QUEUES; i++) {
b0d623f7
A
2162 aio_workq_init(&aio_anchor.aio_async_workqs[i]);
2163 }
2164
f427ee49
A
2165 _aio_create_worker_threads(aio_worker_threads);
2166}
55e303ae
A
2167
2168
2169/*
2170 * aio worker threads created here.
2171 */
2172__private_extern__ void
f427ee49 2173_aio_create_worker_threads(int num)
55e303ae 2174{
eb6b6ca3 2175 int i;
0a7de745 2176
55e303ae 2177 /* create some worker threads to handle the async IO requests */
0a7de745
A
2178 for (i = 0; i < num; i++) {
2179 thread_t myThread;
2180
f427ee49
A
2181 if (KERN_SUCCESS != kernel_thread_start(aio_work_thread, NULL, &myThread)) {
2182 printf("%s - failed to create a work thread \n", __FUNCTION__);
0a7de745 2183 } else {
b0d623f7 2184 thread_deallocate(myThread);
0a7de745 2185 }
55e303ae 2186 }
f427ee49 2187}
55e303ae
A
2188
2189/*
2190 * Return the current activation utask
2191 */
2192task_t
2193get_aiotask(void)
2194{
0a7de745 2195 return ((struct uthread *)get_bsdthread_info(current_thread()))->uu_aio_task;
91447636
A
2196}
2197
2198
2199/*
2200 * In the case of an aiocb from a
2201 * 32-bit process we need to expand some longs and pointers to the correct
2202 * sizes in order to let downstream code always work on the same type of
2203 * aiocb (in our case that is a user_aiocb)
2204 */
0a7de745 2205static void
f427ee49 2206do_munge_aiocb_user32_to_user(struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp)
91447636
A
2207{
2208 the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2209 the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2210 the_user_aiocbp->aio_buf = CAST_USER_ADDR_T(my_aiocbp->aio_buf);
2211 the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2212 the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2213 the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
2214
2215 /* special case here. since we do not know if sigev_value is an */
2216 /* int or a ptr we do NOT cast the ptr to a user_addr_t. This */
2217 /* means if we send this info back to user space we need to remember */
2218 /* sigev_value was not expanded for the 32-bit case. */
2219 /* NOTE - this does NOT affect us since we don't support sigev_value */
2220 /* yet in the aio context. */
2221 //LP64
2222 the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2223 the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
0a7de745
A
2224 the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2225 my_aiocbp->aio_sigevent.sigev_value.sival_int;
2226 the_user_aiocbp->aio_sigevent.sigev_notify_function =
2227 CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function);
2228 the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2229 CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_attributes);
55e303ae 2230}
b0d623f7
A
2231
2232/* Similar for 64-bit user process, so that we don't need to satisfy
2233 * the alignment constraints of the original user64_aiocb
2234 */
f427ee49
A
2235#if !__LP64__
2236__dead2
2237#endif
0a7de745 2238static void
f427ee49 2239do_munge_aiocb_user64_to_user(struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp)
b0d623f7 2240{
f427ee49 2241#if __LP64__
b0d623f7
A
2242 the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2243 the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2244 the_user_aiocbp->aio_buf = my_aiocbp->aio_buf;
2245 the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2246 the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2247 the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
0a7de745 2248
b0d623f7
A
2249 the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2250 the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
0a7de745
A
2251 the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2252 my_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int;
2253 the_user_aiocbp->aio_sigevent.sigev_notify_function =
2254 my_aiocbp->aio_sigevent.sigev_notify_function;
2255 the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2256 my_aiocbp->aio_sigevent.sigev_notify_attributes;
f427ee49
A
2257#else
2258#pragma unused(my_aiocbp, the_user_aiocbp)
2259 panic("64bit process on 32bit kernel is not supported");
2260#endif
b0d623f7 2261}