[apple/xnu.git] / bsd / kern / kern_aio.c (xnu-7195.101.1)
1/*
2 * Copyright (c) 2003-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29
30/*
31 * todo:
32 * 1) ramesh is looking into how to replace taking a reference on
33 * the user's map (vm_map_reference()) since it is believed that
34 * would not hold the process for us.
35 * 2) david is looking into a way for us to set the priority of the
36 * worker threads to match that of the user's thread when the
37 * async IO was queued.
38 */
39
40
41/*
42 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
43 */
44
45#include <sys/systm.h>
46#include <sys/fcntl.h>
47#include <sys/file_internal.h>
48#include <sys/filedesc.h>
49#include <sys/kernel.h>
50#include <sys/vnode_internal.h>
51#include <sys/malloc.h>
52#include <sys/mount_internal.h>
53#include <sys/param.h>
54#include <sys/proc_internal.h>
55#include <sys/sysctl.h>
56#include <sys/unistd.h>
57#include <sys/user.h>
58
59#include <sys/aio_kern.h>
60#include <sys/sysproto.h>
61
62#include <machine/limits.h>
63
64#include <mach/mach_types.h>
65#include <kern/kern_types.h>
66#include <kern/waitq.h>
67#include <kern/zalloc.h>
68#include <kern/task.h>
69#include <kern/sched_prim.h>
70
71#include <vm/vm_map.h>
72
73#include <os/refcnt.h>
74
75#include <sys/kdebug.h>
76#define AIO_work_queued 1
77#define AIO_worker_wake 2
78#define AIO_completion_sig 3
79#define AIO_completion_cleanup_wait 4
80#define AIO_completion_cleanup_wake 5
81#define AIO_completion_suspend_wake 6
82#define AIO_fsync_delay 7
83#define AIO_cancel 10
84#define AIO_cancel_async_workq 11
85#define AIO_cancel_sync_workq 12
86#define AIO_cancel_activeq 13
87#define AIO_cancel_doneq 14
88#define AIO_fsync 20
89#define AIO_read 30
90#define AIO_write 40
91#define AIO_listio 50
92#define AIO_error 60
93#define AIO_error_val 61
94#define AIO_error_activeq 62
95#define AIO_error_workq 63
96#define AIO_return 70
97#define AIO_return_val 71
98#define AIO_return_activeq 72
99#define AIO_return_workq 73
100#define AIO_exec 80
101#define AIO_exit 90
102#define AIO_exit_sleep 91
103#define AIO_close 100
104#define AIO_close_sleep 101
105#define AIO_suspend 110
106#define AIO_suspend_sleep 111
107#define AIO_worker_thread 120
108
109__options_decl(aio_entry_flags_t, uint32_t, {
110 AIO_READ = 0x00000001, /* a read */
111 AIO_WRITE = 0x00000002, /* a write */
112 AIO_FSYNC = 0x00000004, /* aio_fsync with op = O_SYNC */
113 AIO_DSYNC = 0x00000008, /* aio_fsync with op = O_DSYNC (not supported yet) */
114 AIO_LIO = 0x00000010, /* lio_listio generated IO */
115 AIO_LIO_WAIT = 0x00000020, /* lio_listio is waiting on the leader */
116
117 /*
118 * These flags mean that this entry is blocking either:
119 * - close (AIO_CLOSE_WAIT)
120 * - exit or exec (AIO_EXIT_WAIT)
121 *
122 * These flags are mutually exclusive, and the AIO_EXIT_WAIT variant
123 * will also neuter notifications in do_aio_completion_and_unlock().
124 */
125 AIO_CLOSE_WAIT = 0x00004000,
126 AIO_EXIT_WAIT = 0x00008000,
127});
128
129/*! @struct aio_workq_entry
130 *
131 * @discussion
132 * This represents a piece of aio/lio work.
133 *
134 * The ownership rules go as follows:
135 *
136 * - the "proc" owns one refcount on the entry (from creation), while it is
137 * enqueued on the aio_activeq and then the aio_doneq.
138 *
139 * either aio_return() (user read the status) or _aio_exit() (the process
140 * died) will dequeue the entry and consume this ref.
141 *
142 * - the async workqueue owns one refcount once the work is submitted,
143 * which is consumed in do_aio_completion_and_unlock().
144 *
 145 * This ref protects the entry until the end of
146 * do_aio_completion_and_unlock() (when signal delivery happens).
147 *
148 * - lio_listio() for batches picks one of the entries to be the "leader"
149 * of the batch. Each work item will have a refcount on its leader
150 * so that the accounting of the batch completion can be done on the leader
151 * (to be able to decrement lio_pending).
152 *
153 * This ref is consumed in do_aio_completion_and_unlock() as well.
154 *
155 * - lastly, in lio_listio() when the LIO_WAIT behavior is requested,
156 * an extra ref is taken in this syscall as it needs to keep accessing
157 * the leader "lio_pending" field until it hits 0.
158 */
159struct aio_workq_entry {
160 /* queue lock */
161 TAILQ_ENTRY(aio_workq_entry) aio_workq_link;
162
163 /* Proc lock */
164 TAILQ_ENTRY(aio_workq_entry) aio_proc_link; /* p_aio_activeq or p_aio_doneq */
165 user_ssize_t returnval; /* return value from read / write request */
166 errno_t errorval; /* error value from read / write request */
167 os_refcnt_t aio_refcount;
168 aio_entry_flags_t flags;
169
170 int lio_pending; /* pending I/Os in lio group, only on leader */
171 struct aio_workq_entry *lio_leader; /* pointer to the lio leader, can be self */
172
173 /* Initialized and never changed, safe to access */
174 struct proc *procp; /* user proc that queued this request */
175 user_addr_t uaiocbp; /* pointer passed in from user land */
176 struct user_aiocb aiocb; /* copy of aiocb from user land */
177 thread_t thread; /* thread that queued this request */
178
179 /* Initialized, and possibly freed by aio_work_thread() or at free if cancelled */
180 vm_map_t aio_map; /* user land map we have a reference to */
181};
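/*
 * Illustrative walk-through (editorial sketch derived from the ownership
 * rules above; not normative): the references held for a plain aio_read()
 * entry over its lifetime.
 *
 *	aio_create_queue_entry()         refcount == 1  (the "proc" ref)
 *	aio_try_enqueue_work_locked()    refcount == 2  (+1 for the async workq)
 *	do_aio_completion_and_unlock()   refcount == 1  (workq ref consumed)
 *	aio_return() or _aio_exit()      refcount == 0  (entry freed)
 *
 * For an lio_listio() batch, each enqueued entry additionally holds a ref on
 * its leader (possibly itself), and lio_listio(LIO_WAIT) takes one more ref
 * on the leader while it sleeps on lio_pending.
 */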
182
183/*
184 * aio requests queue up on the aio_async_workq or lio_sync_workq (for
185 * lio_listio LIO_WAIT). Requests then move to the per process aio_activeq
 186 * (proc.aio_activeq) when one of our worker threads starts the IO.
 187 * And finally, requests move to the per process aio_doneq (proc.aio_doneq)
 188 * when the IO request completes. The request remains on aio_doneq until the
 189 * user process calls aio_return() or the process exits; either way, that is
 190 * our trigger to release aio resources.
191 */
192typedef struct aio_workq {
193 TAILQ_HEAD(, aio_workq_entry) aioq_entries;
194 lck_spin_t aioq_lock;
195 struct waitq aioq_waitq;
196} *aio_workq_t;
197
198#define AIO_NUM_WORK_QUEUES 1
199struct aio_anchor_cb {
200 os_atomic(int) aio_total_count; /* total extant entries */
201
202 /* Hash table of queues here */
203 int aio_num_workqs;
204 struct aio_workq aio_async_workqs[AIO_NUM_WORK_QUEUES];
205};
206typedef struct aio_anchor_cb aio_anchor_cb;
207
208/*
209 * Notes on aio sleep / wake channels.
 210 * We currently pick a couple of fields within the proc structure that give us
 211 * sleep channels that do not collide with any other kernel routines.
212 * At this time, for binary compatibility reasons, we cannot create new proc fields.
213 */
214#define AIO_SUSPEND_SLEEP_CHAN p_aio_activeq
215#define AIO_CLEANUP_SLEEP_CHAN p_aio_total_count
216
217#define ASSERT_AIO_FROM_PROC(aiop, theproc) \
218 if ((aiop)->procp != (theproc)) { \
219 panic("AIO on a proc list that does not belong to that proc.\n"); \
220 }
221
222/*
223 * LOCAL PROTOTYPES
224 */
225static void aio_proc_lock(proc_t procp);
226static void aio_proc_lock_spin(proc_t procp);
227static void aio_proc_unlock(proc_t procp);
228static lck_mtx_t *aio_proc_mutex(proc_t procp);
229static bool aio_has_active_requests_for_process(proc_t procp);
230static bool aio_proc_has_active_requests_for_file(proc_t procp, int fd);
231static boolean_t is_already_queued(proc_t procp, user_addr_t aiocbp);
232
233static aio_workq_t aio_entry_workq(aio_workq_entry *entryp);
234static void aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
235static void aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
236static void aio_entry_ref(aio_workq_entry *entryp);
237static void aio_entry_unref(aio_workq_entry *entryp);
238static bool aio_entry_try_workq_remove(aio_workq_entry *entryp);
239static boolean_t aio_delay_fsync_request(aio_workq_entry *entryp);
240static void aio_free_request(aio_workq_entry *entryp);
241
242static void aio_workq_init(aio_workq_t wq);
243static void aio_workq_lock_spin(aio_workq_t wq);
244static void aio_workq_unlock(aio_workq_t wq);
245static lck_spin_t *aio_workq_lock(aio_workq_t wq);
246
247static void aio_work_thread(void *arg, wait_result_t wr);
248static aio_workq_entry *aio_get_some_work(void);
249
250static int aio_queue_async_request(proc_t procp, user_addr_t aiocbp, aio_entry_flags_t);
251static int aio_validate(proc_t, aio_workq_entry *entryp);
252
253static int do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp, aio_entry_flags_t);
254static void do_aio_completion_and_unlock(proc_t p, aio_workq_entry *entryp);
255static int do_aio_fsync(aio_workq_entry *entryp);
256static int do_aio_read(aio_workq_entry *entryp);
257static int do_aio_write(aio_workq_entry *entryp);
258static void do_munge_aiocb_user32_to_user(struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp);
259static void do_munge_aiocb_user64_to_user(struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp);
260static aio_workq_entry *aio_create_queue_entry(proc_t procp, user_addr_t aiocbp, aio_entry_flags_t);
261static int aio_copy_in_list(proc_t, user_addr_t, user_addr_t *, int);
262
263#define ASSERT_AIO_PROC_LOCK_OWNED(p) LCK_MTX_ASSERT(aio_proc_mutex(p), LCK_MTX_ASSERT_OWNED)
264#define ASSERT_AIO_WORKQ_LOCK_OWNED(q) LCK_SPIN_ASSERT(aio_workq_lock(q), LCK_ASSERT_OWNED)
265
266/*
267 * EXTERNAL PROTOTYPES
268 */
269
270/* in ...bsd/kern/sys_generic.c */
271extern int dofileread(vfs_context_t ctx, struct fileproc *fp,
272 user_addr_t bufp, user_size_t nbyte,
273 off_t offset, int flags, user_ssize_t *retval);
274extern int dofilewrite(vfs_context_t ctx, struct fileproc *fp,
275 user_addr_t bufp, user_size_t nbyte, off_t offset,
276 int flags, user_ssize_t *retval);
277
278/*
279 * aio external global variables.
280 */
281extern int aio_max_requests; /* AIO_MAX - configurable */
282extern int aio_max_requests_per_process; /* AIO_PROCESS_MAX - configurable */
283extern int aio_worker_threads; /* AIO_THREAD_COUNT - configurable */
284
285
286/*
287 * aio static variables.
288 */
289static aio_anchor_cb aio_anchor = {
290 .aio_num_workqs = AIO_NUM_WORK_QUEUES,
291};
292os_refgrp_decl(static, aio_refgrp, "aio", NULL);
293static LCK_GRP_DECLARE(aio_proc_lock_grp, "aio_proc");
294static LCK_GRP_DECLARE(aio_queue_lock_grp, "aio_queue");
295static LCK_MTX_DECLARE(aio_proc_mtx, &aio_proc_lock_grp);
296
297static ZONE_DECLARE(aio_workq_zonep, "aiowq", sizeof(aio_workq_entry),
298 ZC_ZFREE_CLEARMEM);
299
300/* Hash */
301static aio_workq_t
302aio_entry_workq(__unused aio_workq_entry *entryp)
303{
304 return &aio_anchor.aio_async_workqs[0];
305}
306
307static void
308aio_workq_init(aio_workq_t wq)
309{
310 TAILQ_INIT(&wq->aioq_entries);
311 lck_spin_init(&wq->aioq_lock, &aio_queue_lock_grp, LCK_ATTR_NULL);
312 waitq_init(&wq->aioq_waitq, SYNC_POLICY_FIFO);
313}
314
315
316/*
317 * Can be passed a queue which is locked spin.
318 */
319static void
320aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
321{
322 ASSERT_AIO_WORKQ_LOCK_OWNED(queue);
323
324 if (entryp->aio_workq_link.tqe_prev == NULL) {
325 panic("Trying to remove an entry from a work queue, but it is not on a queue\n");
326 }
327
328 TAILQ_REMOVE(&queue->aioq_entries, entryp, aio_workq_link);
329 entryp->aio_workq_link.tqe_prev = NULL; /* Not on a workq */
330}
331
332static void
333aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
334{
335 ASSERT_AIO_WORKQ_LOCK_OWNED(queue);
336
337 TAILQ_INSERT_TAIL(&queue->aioq_entries, entryp, aio_workq_link);
338}
339
340static void
341aio_proc_lock(proc_t procp)
342{
343 lck_mtx_lock(aio_proc_mutex(procp));
344}
345
346static void
347aio_proc_lock_spin(proc_t procp)
348{
349 lck_mtx_lock_spin(aio_proc_mutex(procp));
350}
351
352static bool
353aio_has_any_work(void)
354{
355 return os_atomic_load(&aio_anchor.aio_total_count, relaxed) != 0;
356}
357
358static bool
359aio_try_proc_insert_active_locked(proc_t procp, aio_workq_entry *entryp)
360{
361 int old, new;
362
363 ASSERT_AIO_PROC_LOCK_OWNED(procp);
364
365 if (procp->p_aio_total_count >= aio_max_requests_per_process) {
366 return false;
367 }
368
369 if (is_already_queued(procp, entryp->uaiocbp)) {
370 return false;
371 }
372
373 os_atomic_rmw_loop(&aio_anchor.aio_total_count, old, new, relaxed, {
374 if (old >= aio_max_requests) {
375 os_atomic_rmw_loop_give_up(return false);
376 }
377 new = old + 1;
378 });
379
380 TAILQ_INSERT_TAIL(&procp->p_aio_activeq, entryp, aio_proc_link);
381 procp->p_aio_total_count++;
382 return true;
383}
384
385static void
386aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp)
387{
388 TAILQ_REMOVE(&procp->p_aio_activeq, entryp, aio_proc_link);
389 TAILQ_INSERT_TAIL(&procp->p_aio_doneq, entryp, aio_proc_link);
390}
391
392static void
393aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp)
394{
395 TAILQ_REMOVE(&procp->p_aio_doneq, entryp, aio_proc_link);
396 entryp->aio_proc_link.tqe_prev = NULL;
397 if (os_atomic_dec_orig(&aio_anchor.aio_total_count, relaxed) <= 0) {
398 panic("Negative total AIO count!\n");
399 }
400 if (procp->p_aio_total_count-- <= 0) {
401 panic("proc %p: p_aio_total_count accounting mismatch", procp);
402 }
403}
404
405static void
406aio_proc_unlock(proc_t procp)
407{
408 lck_mtx_unlock(aio_proc_mutex(procp));
409}
410
411static lck_mtx_t*
412aio_proc_mutex(proc_t procp)
413{
414 return &procp->p_mlock;
415}
416
417static void
418aio_entry_ref(aio_workq_entry *entryp)
419{
420 os_ref_retain(&entryp->aio_refcount);
421}
422
423static void
424aio_entry_unref(aio_workq_entry *entryp)
425{
426 if (os_ref_release(&entryp->aio_refcount) == 0) {
427 aio_free_request(entryp);
428 }
429}
430
431static bool
432aio_entry_try_workq_remove(aio_workq_entry *entryp)
433{
434 /* Can only be cancelled if it's still on a work queue */
435 if (entryp->aio_workq_link.tqe_prev != NULL) {
436 aio_workq_t queue;
437
438 /* Will have to check again under the lock */
439 queue = aio_entry_workq(entryp);
440 aio_workq_lock_spin(queue);
441 if (entryp->aio_workq_link.tqe_prev != NULL) {
442 aio_workq_remove_entry_locked(queue, entryp);
443 aio_workq_unlock(queue);
444 return true;
445 } else {
446 aio_workq_unlock(queue);
447 }
448 }
449
450 return false;
451}
452
453static void
454aio_workq_lock_spin(aio_workq_t wq)
455{
456 lck_spin_lock(aio_workq_lock(wq));
457}
458
459static void
460aio_workq_unlock(aio_workq_t wq)
461{
462 lck_spin_unlock(aio_workq_lock(wq));
463}
464
465static lck_spin_t*
466aio_workq_lock(aio_workq_t wq)
467{
468 return &wq->aioq_lock;
469}
470
471/*
472 * aio_cancel - attempt to cancel one or more async IO requests currently
473 * outstanding against file descriptor uap->fd. If uap->aiocbp is not
474 * NULL then only one specific IO is cancelled (if possible). If uap->aiocbp
475 * is NULL then all outstanding async IO request for the given file
476 * descriptor are cancelled (if possible).
477 */
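/*
 * Illustrative user-space usage (editorial sketch, not part of this file;
 * the helper name drain_fd_aio and the aiocb "cb" are hypothetical): cancel
 * everything outstanding on a descriptor, then drain a request that could
 * not be cancelled.
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *
 *	int
 *	drain_fd_aio(int fd, struct aiocb *cb)
 *	{
 *		switch (aio_cancel(fd, NULL)) {
 *		case AIO_CANCELED:      // every queued request was cancelled
 *		case AIO_ALLDONE:       // nothing was left to cancel
 *			break;
 *		case AIO_NOTCANCELED:   // some requests are still in flight
 *			while (aio_error(cb) == EINPROGRESS) {
 *				;       // poll (or use aio_suspend)
 *			}
 *			break;
 *		default:
 *			return -1;      // EBADF and friends land here
 *		}
 *		return (int)aio_return(cb);     // release the kernel entry
 *	}
 */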
478int
479aio_cancel(proc_t p, struct aio_cancel_args *uap, int *retval)
480{
481 struct user_aiocb my_aiocb;
482 int result;
483
484 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel) | DBG_FUNC_START,
485 VM_KERNEL_ADDRPERM(p), uap->aiocbp, 0, 0, 0);
486
487 /* quick check to see if there are any async IO requests queued up */
488 if (!aio_has_any_work()) {
489 result = 0;
490 *retval = AIO_ALLDONE;
491 goto ExitRoutine;
492 }
493
494 *retval = -1;
495 if (uap->aiocbp != USER_ADDR_NULL) {
496 if (proc_is64bit(p)) {
497 struct user64_aiocb aiocb64;
498
499 result = copyin(uap->aiocbp, &aiocb64, sizeof(aiocb64));
500 if (result == 0) {
501 do_munge_aiocb_user64_to_user(&aiocb64, &my_aiocb);
502 }
503 } else {
504 struct user32_aiocb aiocb32;
505
506 result = copyin(uap->aiocbp, &aiocb32, sizeof(aiocb32));
507 if (result == 0) {
508 do_munge_aiocb_user32_to_user(&aiocb32, &my_aiocb);
509 }
510 }
511
512 if (result != 0) {
513 result = EAGAIN;
514 goto ExitRoutine;
515 }
516
517 /* NOTE - POSIX standard says a mismatch between the file */
518 /* descriptor passed in and the file descriptor embedded in */
519 /* the aiocb causes unspecified results. We return EBADF in */
520 /* that situation. */
521 if (uap->fd != my_aiocb.aio_fildes) {
522 result = EBADF;
523 goto ExitRoutine;
524 }
525 }
526
527 aio_proc_lock(p);
528 result = do_aio_cancel_locked(p, uap->fd, uap->aiocbp, 0);
529 ASSERT_AIO_PROC_LOCK_OWNED(p);
530 aio_proc_unlock(p);
531
532 if (result != -1) {
533 *retval = result;
534 result = 0;
535 goto ExitRoutine;
536 }
537
538 result = EBADF;
539
540ExitRoutine:
541 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel) | DBG_FUNC_END,
542 VM_KERNEL_ADDRPERM(p), uap->aiocbp, result, 0, 0);
543
544 return result;
545}
546
547
548/*
549 * _aio_close - internal function used to clean up async IO requests for
550 * a file descriptor that is closing.
551 * THIS MAY BLOCK.
552 */
553__private_extern__ void
554_aio_close(proc_t p, int fd)
555{
556 int error;
557
558 /* quick check to see if there are any async IO requests queued up */
559 if (!aio_has_any_work()) {
560 return;
561 }
562
563 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_close) | DBG_FUNC_START,
564 VM_KERNEL_ADDRPERM(p), fd, 0, 0, 0);
565
566 /* cancel all async IO requests on our todo queues for this file descriptor */
567 aio_proc_lock(p);
568 error = do_aio_cancel_locked(p, fd, USER_ADDR_NULL, AIO_CLOSE_WAIT);
569 ASSERT_AIO_PROC_LOCK_OWNED(p);
570 if (error == AIO_NOTCANCELED) {
571 /*
572 * AIO_NOTCANCELED is returned when we find an aio request for this process
573 * and file descriptor on the active async IO queue. Active requests cannot
574 * be cancelled so we must wait for them to complete. We will get a special
575 * wake up call on our channel used to sleep for ALL active requests to
576 * complete. This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
577 * when we must wait for all active aio requests.
578 */
579
580 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep) | DBG_FUNC_NONE,
581 VM_KERNEL_ADDRPERM(p), fd, 0, 0, 0);
582
583 while (aio_proc_has_active_requests_for_file(p, fd)) {
584 msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_close", 0);
585 }
586 }
587
588 aio_proc_unlock(p);
589
590 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_close) | DBG_FUNC_END,
591 VM_KERNEL_ADDRPERM(p), fd, 0, 0, 0);
592}
593
594
595/*
596 * aio_error - return the error status associated with the async IO
597 * request referred to by uap->aiocbp. The error status is the errno
 598 * value that would be set by the corresponding IO request (read, write,
599 * fdatasync, or sync).
600 */
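/*
 * Illustrative user-space usage (editorial sketch, not part of this file;
 * the helper name aio_wait_done is hypothetical): the poll-then-reap pattern
 * built on aio_error() and aio_return(). A real program would normally sleep,
 * use aio_suspend(), or request a completion signal instead of spinning.
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *
 *	ssize_t
 *	aio_wait_done(struct aiocb *cb)
 *	{
 *		int err;
 *
 *		while ((err = aio_error(cb)) == EINPROGRESS) {
 *			;                       // still on the active queue
 *		}
 *		if (err != 0) {
 *			errno = err;            // the async request failed
 *			return -1;
 *		}
 *		return aio_return(cb);          // frees the kernel-side entry
 *	}
 */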
601int
602aio_error(proc_t p, struct aio_error_args *uap, int *retval)
603{
604 aio_workq_entry *entryp;
605 int error;
606
607 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_error) | DBG_FUNC_START,
608 VM_KERNEL_ADDRPERM(p), uap->aiocbp, 0, 0, 0);
609
610 /* see if there are any aios to check */
611 if (!aio_has_any_work()) {
612 return EINVAL;
613 }
614
615 aio_proc_lock(p);
616
617 /* look for a match on our queue of async IO requests that have completed */
618 TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
619 if (entryp->uaiocbp == uap->aiocbp) {
620 ASSERT_AIO_FROM_PROC(entryp, p);
621
622 *retval = entryp->errorval;
623 error = 0;
624
625 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val) | DBG_FUNC_NONE,
626 VM_KERNEL_ADDRPERM(p), uap->aiocbp, *retval, 0, 0);
627 goto ExitRoutine;
628 }
629 }
630
631 /* look for a match on our queue of active async IO requests */
632 TAILQ_FOREACH(entryp, &p->p_aio_activeq, aio_proc_link) {
633 if (entryp->uaiocbp == uap->aiocbp) {
634 ASSERT_AIO_FROM_PROC(entryp, p);
635 *retval = EINPROGRESS;
636 error = 0;
637 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq) | DBG_FUNC_NONE,
638 VM_KERNEL_ADDRPERM(p), uap->aiocbp, *retval, 0, 0);
639 goto ExitRoutine;
640 }
641 }
642
643 error = EINVAL;
644
645ExitRoutine:
646 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_error) | DBG_FUNC_END,
647 VM_KERNEL_ADDRPERM(p), uap->aiocbp, error, 0, 0);
648 aio_proc_unlock(p);
649
650 return error;
651}
652
653
654/*
655 * aio_fsync - asynchronously force all IO operations associated
656 * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
657 * queued at the time of the call to the synchronized completion state.
658 * NOTE - we do not support op O_DSYNC at this point since we do not support the
659 * fdatasync() call.
660 */
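/*
 * Illustrative user-space usage (editorial sketch, not part of this file;
 * the helper name queue_fsync is hypothetical): queue an asynchronous fsync
 * of all IO previously submitted on a descriptor. The aiocb must stay valid
 * until the request is reaped with aio_return().
 *
 *	#include <aio.h>
 *	#include <fcntl.h>
 *	#include <string.h>
 *
 *	int
 *	queue_fsync(int fd, struct aiocb *cb)
 *	{
 *		memset(cb, 0, sizeof(*cb));
 *		cb->aio_fildes = fd;
 *		return aio_fsync(O_SYNC, cb);   // 0 if queued, -1 + errno otherwise
 *	}
 */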
661int
662aio_fsync(proc_t p, struct aio_fsync_args *uap, int *retval)
663{
664 aio_entry_flags_t fsync_kind;
665 int error;
666
667 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync) | DBG_FUNC_START,
668 VM_KERNEL_ADDRPERM(p), uap->aiocbp, uap->op, 0, 0);
669
670 *retval = 0;
671 /* 0 := O_SYNC for binary backward compatibility with Panther */
672 if (uap->op == O_SYNC || uap->op == 0) {
673 fsync_kind = AIO_FSYNC;
674 } else if (uap->op == O_DSYNC) {
675 fsync_kind = AIO_DSYNC;
676 } else {
677 *retval = -1;
678 error = EINVAL;
679 goto ExitRoutine;
680 }
681
682 error = aio_queue_async_request(p, uap->aiocbp, fsync_kind);
683 if (error != 0) {
684 *retval = -1;
685 }
686
687ExitRoutine:
688 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync) | DBG_FUNC_END,
689 VM_KERNEL_ADDRPERM(p), uap->aiocbp, error, 0, 0);
690
691 return error;
692}
693
694
695/* aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
696 * file descriptor (uap->aiocbp->aio_fildes) into the buffer
697 * (uap->aiocbp->aio_buf).
698 */
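/*
 * Illustrative user-space usage (editorial sketch, not part of this file;
 * the helper name submit_read and the static buffer are hypothetical):
 * submit one asynchronous read. Both the aiocb and the buffer must remain
 * valid until the request has been reaped with aio_return().
 *
 *	#include <aio.h>
 *	#include <string.h>
 *
 *	static char read_buf[4096];
 *
 *	int
 *	submit_read(int fd, struct aiocb *cb)
 *	{
 *		memset(cb, 0, sizeof(*cb));
 *		cb->aio_fildes = fd;
 *		cb->aio_buf    = read_buf;
 *		cb->aio_nbytes = sizeof(read_buf);
 *		cb->aio_offset = 0;
 *		return aio_read(cb);    // 0 if queued, -1 + errno otherwise
 *	}
 */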
699int
700aio_read(proc_t p, struct aio_read_args *uap, int *retval)
701{
702 int error;
703
704 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_read) | DBG_FUNC_START,
705 VM_KERNEL_ADDRPERM(p), uap->aiocbp, 0, 0, 0);
706
707 *retval = 0;
708
709 error = aio_queue_async_request(p, uap->aiocbp, AIO_READ);
710 if (error != 0) {
711 *retval = -1;
712 }
713
714 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_read) | DBG_FUNC_END,
715 VM_KERNEL_ADDRPERM(p), uap->aiocbp, error, 0, 0);
716
717 return error;
718}
719
720
721/*
722 * aio_return - return the return status associated with the async IO
723 * request referred to by uap->aiocbp. The return status is the value
724 * that would be returned by corresponding IO request (read, write,
725 * fdatasync, or sync). This is where we release kernel resources
726 * held for async IO call associated with the given aiocb pointer.
727 */
728int
729aio_return(proc_t p, struct aio_return_args *uap, user_ssize_t *retval)
730{
731 aio_workq_entry *entryp;
732 int error = EINVAL;
733
734 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_return) | DBG_FUNC_START,
735 VM_KERNEL_ADDRPERM(p), uap->aiocbp, 0, 0, 0);
736
737 /* See if there are any entries to check */
738 if (!aio_has_any_work()) {
739 goto ExitRoutine;
740 }
741
742 aio_proc_lock(p);
743 *retval = 0;
744
745 /* look for a match on our queue of async IO requests that have completed */
746 TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
747 ASSERT_AIO_FROM_PROC(entryp, p);
748 if (entryp->uaiocbp == uap->aiocbp) {
749 /* Done and valid for aio_return(), pull it off the list */
750 aio_proc_remove_done_locked(p, entryp);
751
752 *retval = entryp->returnval;
753 error = 0;
754 aio_proc_unlock(p);
755
756 aio_entry_unref(entryp);
757
758 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val) | DBG_FUNC_NONE,
759 VM_KERNEL_ADDRPERM(p), uap->aiocbp, *retval, 0, 0);
760 goto ExitRoutine;
761 }
762 }
763
764 /* look for a match on our queue of active async IO requests */
765 TAILQ_FOREACH(entryp, &p->p_aio_activeq, aio_proc_link) {
766 ASSERT_AIO_FROM_PROC(entryp, p);
767 if (entryp->uaiocbp == uap->aiocbp) {
768 error = EINPROGRESS;
769 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq) | DBG_FUNC_NONE,
770 VM_KERNEL_ADDRPERM(p), uap->aiocbp, *retval, 0, 0);
771 break;
772 }
773 }
774
775 aio_proc_unlock(p);
776
777ExitRoutine:
778 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_return) | DBG_FUNC_END,
779 VM_KERNEL_ADDRPERM(p), uap->aiocbp, error, 0, 0);
780
781 return error;
782}
783
784
785/*
786 * _aio_exec - internal function used to clean up async IO requests for
787 * a process that is going away due to exec(). We cancel any async IOs
788 * we can and wait for those already active. We also disable signaling
789 * for cancelled or active aio requests that complete.
790 * This routine MAY block!
791 */
792__private_extern__ void
793_aio_exec(proc_t p)
794{
795 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_exec) | DBG_FUNC_START,
796 VM_KERNEL_ADDRPERM(p), 0, 0, 0, 0);
797
798 _aio_exit(p);
799
800 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_exec) | DBG_FUNC_END,
801 VM_KERNEL_ADDRPERM(p), 0, 0, 0, 0);
802}
803
804
805/*
806 * _aio_exit - internal function used to clean up async IO requests for
807 * a process that is terminating (via exit() or exec()). We cancel any async IOs
808 * we can and wait for those already active. We also disable signaling
809 * for cancelled or active aio requests that complete. This routine MAY block!
810 */
811__private_extern__ void
812_aio_exit(proc_t p)
813{
814 TAILQ_HEAD(, aio_workq_entry) tofree = TAILQ_HEAD_INITIALIZER(tofree);
815 aio_workq_entry *entryp, *tmp;
816 int error;
817
818 /* quick check to see if there are any async IO requests queued up */
819 if (!aio_has_any_work()) {
820 return;
821 }
822
823 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_exit) | DBG_FUNC_START,
824 VM_KERNEL_ADDRPERM(p), 0, 0, 0, 0);
825
826 aio_proc_lock(p);
827
828 /*
829 * cancel async IO requests on the todo work queue and wait for those
830 * already active to complete.
831 */
832 error = do_aio_cancel_locked(p, -1, USER_ADDR_NULL, AIO_EXIT_WAIT);
833 ASSERT_AIO_PROC_LOCK_OWNED(p);
834 if (error == AIO_NOTCANCELED) {
835 /*
836 * AIO_NOTCANCELED is returned when we find an aio request for this process
837 * on the active async IO queue. Active requests cannot be cancelled so we
838 * must wait for them to complete. We will get a special wake up call on
839 * our channel used to sleep for ALL active requests to complete. This sleep
840 * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
841 * active aio requests.
842 */
843
844 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep) | DBG_FUNC_NONE,
845 VM_KERNEL_ADDRPERM(p), 0, 0, 0, 0);
846
847 while (aio_has_active_requests_for_process(p)) {
848 msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_exit", 0);
849 }
850 }
851
852 assert(!aio_has_active_requests_for_process(p));
853
854 /* release all aio resources used by this process */
855 TAILQ_FOREACH_SAFE(entryp, &p->p_aio_doneq, aio_proc_link, tmp) {
856 ASSERT_AIO_FROM_PROC(entryp, p);
857
858 aio_proc_remove_done_locked(p, entryp);
859 TAILQ_INSERT_TAIL(&tofree, entryp, aio_proc_link);
860 }
861
862 aio_proc_unlock(p);
863
864 /* free all the entries outside of the aio_proc_lock() */
865 TAILQ_FOREACH_SAFE(entryp, &tofree, aio_proc_link, tmp) {
866 entryp->aio_proc_link.tqe_prev = NULL;
867 aio_entry_unref(entryp);
868 }
869
870 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_exit) | DBG_FUNC_END,
871 VM_KERNEL_ADDRPERM(p), 0, 0, 0, 0);
872}
873
874
875static bool
876should_cancel(aio_workq_entry *entryp, int fd, user_addr_t aiocbp,
877 aio_entry_flags_t reason)
878{
879 if (reason & AIO_EXIT_WAIT) {
880 /* caller is _aio_exit() */
881 return true;
882 }
883 if (fd != entryp->aiocb.aio_fildes) {
884 /* not the file we're looking for */
885 return false;
886 }
887 /*
888 * aio_cancel() or _aio_close() cancel
889 * everything for a given fd when aiocbp is NULL
890 */
891 return aiocbp == USER_ADDR_NULL || entryp->uaiocbp == aiocbp;
892}
893
894/*
895 * do_aio_cancel_locked - cancel async IO requests (if possible). We get called by
 896 * aio_cancel(), close, and exit/exec.
 897 * There are three modes of operation: 1) cancel all async IOs for a process -
 898 * fd is 0 and aiocbp is NULL; 2) cancel all async IOs for a file descriptor -
 899 * fd is > 0 and aiocbp is NULL; 3) cancel one async IO associated with the
 900 * given aiocbp.
901 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
902 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
903 * target async IO requests, and AIO_ALLDONE if all target async IO requests
904 * were already complete.
 905 * WARNING - do not dereference aiocbp in this routine; it may point to user
906 * land data that has not been copied in (when called from aio_cancel())
907 *
908 * Called with proc locked, and returns the same way.
909 */
910static int
911do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp,
912 aio_entry_flags_t reason)
913{
914 bool multiple_matches = (aiocbp == USER_ADDR_NULL);
915 aio_workq_entry *entryp, *tmp;
916 int result;
917
918 ASSERT_AIO_PROC_LOCK_OWNED(p);
919
920 /* look for a match on our queue of async todo work. */
921again:
922 result = -1;
923 TAILQ_FOREACH_SAFE(entryp, &p->p_aio_activeq, aio_proc_link, tmp) {
924 ASSERT_AIO_FROM_PROC(entryp, p);
925
926 if (!should_cancel(entryp, fd, aiocbp, reason)) {
927 continue;
928 }
929
930 if (reason) {
931 /* mark the entry as blocking close or exit/exec */
932 entryp->flags |= reason;
933 if ((entryp->flags & AIO_EXIT_WAIT) && (entryp->flags & AIO_CLOSE_WAIT)) {
934 panic("Close and exit flags set at the same time\n");
935 }
936 }
937
938 /* Can only be cancelled if it's still on a work queue */
939 if (aio_entry_try_workq_remove(entryp)) {
940 entryp->errorval = ECANCELED;
941 entryp->returnval = -1;
942
943 /* Now it's officially cancelled. Do the completion */
944 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq) | DBG_FUNC_NONE,
945 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
946 fd, 0, 0);
947 do_aio_completion_and_unlock(p, entryp);
948
949 aio_proc_lock(p);
950
951 if (multiple_matches) {
952 /*
953 * Restart from the head of the proc active queue since it
954 * may have been changed while we were away doing completion
955 * processing.
956 *
957 * Note that if we found an uncancellable AIO before, we will
958 * either find it again or discover that it's been completed,
959 * so resetting the result will not cause us to return success
960 * despite outstanding AIOs.
961 */
962 goto again;
963 }
964
965 return AIO_CANCELED;
966 }
967
968 /*
969 * It's been taken off the active queue already, i.e. is in flight.
970 * All we can do is ask for notification.
971 */
972 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq) | DBG_FUNC_NONE,
973 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
974 fd, 0, 0);
975
976 result = AIO_NOTCANCELED;
977 if (!multiple_matches) {
978 return result;
979 }
980 }
981
982 /*
983 * if we didn't find any matches on the todo or active queues then look for a
984 * match on our queue of async IO requests that have completed and if found
985 * return AIO_ALLDONE result.
986 *
987 * Proc AIO lock is still held.
988 */
989 if (result == -1) {
990 TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
991 ASSERT_AIO_FROM_PROC(entryp, p);
992 if (should_cancel(entryp, fd, aiocbp, reason)) {
993 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq) | DBG_FUNC_NONE,
994 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
995 fd, 0, 0);
996
997 result = AIO_ALLDONE;
998 if (!multiple_matches) {
999 return result;
1000 }
1001 }
1002 }
1003 }
1004
1005 return result;
1006}
1007
1008
1009/*
1010 * aio_suspend - suspend the calling thread until at least one of the async
1011 * IO operations referenced by uap->aiocblist has completed, until a signal
1012 * interrupts the function, or uap->timeoutp time interval (optional) has
1013 * passed.
1014 * Returns 0 if one or more async IOs have completed else -1 and errno is
1015 * set appropriately - EAGAIN if timeout elapses or EINTR if an interrupt
1016 * woke us up.
1017 */
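/*
 * Illustrative user-space usage (editorial sketch, not part of this file;
 * the helper name wait_for_any is hypothetical): wait up to one second for
 * any request in a list to complete.
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *	#include <time.h>
 *
 *	int
 *	wait_for_any(const struct aiocb *const list[], int nent)
 *	{
 *		struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 *
 *		if (aio_suspend(list, nent, &ts) == -1) {
 *			// errno is EAGAIN on timeout, EINTR on signal
 *			return -1;
 *		}
 *		return 0;       // at least one listed request has completed
 *	}
 */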
1018int
1019aio_suspend(proc_t p, struct aio_suspend_args *uap, int *retval)
1020{
1021 __pthread_testcancel(1);
1022 return aio_suspend_nocancel(p, (struct aio_suspend_nocancel_args *)uap, retval);
1023}
1024
1025
1026int
1027aio_suspend_nocancel(proc_t p, struct aio_suspend_nocancel_args *uap, int *retval)
1028{
1029 int error;
1030 int i;
1031 uint64_t abstime;
1032 struct user_timespec ts;
1033 aio_workq_entry *entryp;
1034 user_addr_t *aiocbpp;
1035 size_t aiocbpp_size;
1036
1037 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend) | DBG_FUNC_START,
1038 VM_KERNEL_ADDRPERM(p), uap->nent, 0, 0, 0);
1039
1040 *retval = -1;
1041 abstime = 0;
1042 aiocbpp = NULL;
1043
1044 if (!aio_has_any_work()) {
1045 error = EINVAL;
1046 goto ExitThisRoutine;
1047 }
1048
1049 if (uap->nent < 1 || uap->nent > aio_max_requests_per_process ||
1050 os_mul_overflow(sizeof(user_addr_t), uap->nent, &aiocbpp_size)) {
1051 error = EINVAL;
1052 goto ExitThisRoutine;
1053 }
1054
1055 if (uap->timeoutp != USER_ADDR_NULL) {
1056 if (proc_is64bit(p)) {
1057 struct user64_timespec temp;
1058 error = copyin(uap->timeoutp, &temp, sizeof(temp));
1059 if (error == 0) {
1060 ts.tv_sec = (user_time_t)temp.tv_sec;
1061 ts.tv_nsec = (user_long_t)temp.tv_nsec;
1062 }
1063 } else {
1064 struct user32_timespec temp;
1065 error = copyin(uap->timeoutp, &temp, sizeof(temp));
1066 if (error == 0) {
1067 ts.tv_sec = temp.tv_sec;
1068 ts.tv_nsec = temp.tv_nsec;
1069 }
1070 }
1071 if (error != 0) {
1072 error = EAGAIN;
1073 goto ExitThisRoutine;
1074 }
1075
1076 if (ts.tv_sec < 0 || ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000) {
1077 error = EINVAL;
1078 goto ExitThisRoutine;
1079 }
1080
1081 nanoseconds_to_absolutetime((uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
1082 &abstime);
1083 clock_absolutetime_interval_to_deadline(abstime, &abstime);
1084 }
1085
1086 aiocbpp = kheap_alloc(KHEAP_TEMP, aiocbpp_size, Z_WAITOK);
1087 if (aiocbpp == NULL || aio_copy_in_list(p, uap->aiocblist, aiocbpp, uap->nent)) {
1088 error = EAGAIN;
1089 goto ExitThisRoutine;
1090 }
1091
1092 /* check list of aio requests to see if any have completed */
1093check_for_our_aiocbp:
1094 aio_proc_lock_spin(p);
1095 for (i = 0; i < uap->nent; i++) {
1096 user_addr_t aiocbp;
1097
1098 /* NULL elements are legal so check for 'em */
1099 aiocbp = *(aiocbpp + i);
1100 if (aiocbp == USER_ADDR_NULL) {
1101 continue;
1102 }
1103
1104 /* return immediately if any aio request in the list is done */
1105 TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
1106 ASSERT_AIO_FROM_PROC(entryp, p);
1107 if (entryp->uaiocbp == aiocbp) {
1108 aio_proc_unlock(p);
1109 *retval = 0;
1110 error = 0;
1111 goto ExitThisRoutine;
1112 }
1113 }
1114 }
1115
1116 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep) | DBG_FUNC_NONE,
1117 VM_KERNEL_ADDRPERM(p), uap->nent, 0, 0, 0);
1118
1119 /*
 1120 * wait for an async IO to complete, a signal to fire, or the timeout to expire.
 1121 * We return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
1122 * interrupts us. If an async IO completes before a signal fires or our
1123 * timeout expires, we get a wakeup call from aio_work_thread().
1124 */
1125
1126 error = msleep1(&p->AIO_SUSPEND_SLEEP_CHAN, aio_proc_mutex(p),
1127 PCATCH | PWAIT | PDROP, "aio_suspend", abstime);
1128 if (error == 0) {
1129 /*
1130 * got our wakeup call from aio_work_thread().
1131 * Since we can get a wakeup on this channel from another thread in the
1132 * same process we head back up to make sure this is for the correct aiocbp.
1133 * If it is the correct aiocbp we will return from where we do the check
1134 * (see entryp->uaiocbp == aiocbp after check_for_our_aiocbp label)
1135 * else we will fall out and just sleep again.
1136 */
1137 goto check_for_our_aiocbp;
1138 } else if (error == EWOULDBLOCK) {
1139 /* our timeout expired */
1140 error = EAGAIN;
1141 } else {
1142 /* we were interrupted */
1143 error = EINTR;
1144 }
1145
1146ExitThisRoutine:
1147 if (aiocbpp != NULL) {
1148 kheap_free(KHEAP_TEMP, aiocbpp, aiocbpp_size);
1149 }
1150
1151 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend) | DBG_FUNC_END,
1152 VM_KERNEL_ADDRPERM(p), uap->nent, error, 0, 0);
1153
1154 return error;
1155}
1156
1157
1158/* aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
1159 * file descriptor (uap->aiocbp->aio_fildes) from the buffer
1160 * (uap->aiocbp->aio_buf).
1161 */
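/*
 * Illustrative user-space usage (editorial sketch, not part of this file;
 * submit_write and write_buf are hypothetical): the write side mirrors the
 * aio_read() example above, with aio_buf/aio_nbytes describing the data to
 * write.
 *
 *	#include <aio.h>
 *	#include <string.h>
 *
 *	static char write_buf[4096];
 *
 *	int
 *	submit_write(int fd, struct aiocb *cb)
 *	{
 *		memset(cb, 0, sizeof(*cb));
 *		cb->aio_fildes = fd;
 *		cb->aio_buf    = write_buf;
 *		cb->aio_nbytes = sizeof(write_buf);
 *		cb->aio_offset = 0;
 *		return aio_write(cb);   // 0 if queued, -1 + errno otherwise
 *	}
 */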
1162
1163int
1164aio_write(proc_t p, struct aio_write_args *uap, int *retval __unused)
1165{
1166 int error;
1167
1168 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_write) | DBG_FUNC_START,
1169 VM_KERNEL_ADDRPERM(p), uap->aiocbp, 0, 0, 0);
1170
1171 error = aio_queue_async_request(p, uap->aiocbp, AIO_WRITE);
1172
1173 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_write) | DBG_FUNC_END,
1174 VM_KERNEL_ADDRPERM(p), uap->aiocbp, error, 0, 0);
1175
1176 return error;
1177}
1178
1179
1180static int
1181aio_copy_in_list(proc_t procp, user_addr_t aiocblist, user_addr_t *aiocbpp,
1182 int nent)
1183{
1184 int result;
1185
1186 /* copyin our aiocb pointers from list */
1187 result = copyin(aiocblist, aiocbpp,
1188 proc_is64bit(procp) ? (nent * sizeof(user64_addr_t))
1189 : (nent * sizeof(user32_addr_t)));
1190 if (result) {
1191 return result;
1192 }
1193
1194 /*
1195 * We depend on a list of user_addr_t's so we need to
1196 * munge and expand when these pointers came from a
1197 * 32-bit process
1198 */
1199 if (!proc_is64bit(procp)) {
1200 /* copy from last to first to deal with overlap */
1201 user32_addr_t *my_ptrp = ((user32_addr_t *)aiocbpp) + (nent - 1);
1202 user_addr_t *my_addrp = aiocbpp + (nent - 1);
1203
1204 for (int i = 0; i < nent; i++, my_ptrp--, my_addrp--) {
1205 *my_addrp = (user_addr_t) (*my_ptrp);
1206 }
1207 }
1208
1209 return 0;
1210}
1211
1212
1213static int
1214aio_copy_in_sigev(proc_t procp, user_addr_t sigp, struct user_sigevent *sigev)
1215{
1216 int result = 0;
1217
1218 if (sigp == USER_ADDR_NULL) {
1219 goto out;
1220 }
1221
1222 /*
1223 * We need to munge aio_sigevent since it contains pointers.
1224 * Since we do not know if sigev_value is an int or a ptr we do
1225 * NOT cast the ptr to a user_addr_t. This means if we send
1226 * this info back to user space we need to remember sigev_value
1227 * was not expanded for the 32-bit case.
1228 *
1229 * Notes: This does NOT affect us since we don't support
1230 * sigev_value yet in the aio context.
1231 */
1232 if (proc_is64bit(procp)) {
1233#if __LP64__
1234 struct user64_sigevent sigevent64;
1235
1236 result = copyin(sigp, &sigevent64, sizeof(sigevent64));
1237 if (result == 0) {
1238 sigev->sigev_notify = sigevent64.sigev_notify;
1239 sigev->sigev_signo = sigevent64.sigev_signo;
1240 sigev->sigev_value.size_equivalent.sival_int = sigevent64.sigev_value.size_equivalent.sival_int;
1241 sigev->sigev_notify_function = sigevent64.sigev_notify_function;
1242 sigev->sigev_notify_attributes = sigevent64.sigev_notify_attributes;
1243 }
1244#else
1245 panic("64bit process on 32bit kernel is not supported");
1246#endif
1247 } else {
1248 struct user32_sigevent sigevent32;
1249
1250 result = copyin(sigp, &sigevent32, sizeof(sigevent32));
1251 if (result == 0) {
1252 sigev->sigev_notify = sigevent32.sigev_notify;
1253 sigev->sigev_signo = sigevent32.sigev_signo;
1254 sigev->sigev_value.size_equivalent.sival_int = sigevent32.sigev_value.sival_int;
1255 sigev->sigev_notify_function = CAST_USER_ADDR_T(sigevent32.sigev_notify_function);
1256 sigev->sigev_notify_attributes = CAST_USER_ADDR_T(sigevent32.sigev_notify_attributes);
1257 }
1258 }
1259
1260 if (result != 0) {
1261 result = EAGAIN;
1262 }
1263
1264out:
1265 return result;
1266}
1267
1268/*
 1269 * validate user_sigevent. At this point we only support
 1270 * sigev_notify equal to SIGEV_SIGNAL or SIGEV_NONE. This means
 1271 * sigev_value, sigev_notify_function, and sigev_notify_attributes
 1272 * are ignored, since SIGEV_THREAD is unsupported. This is consistent
 1273 * with no [RTS] (Realtime Signal) option group support.
1274 */
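/*
 * Illustrative example (editorial sketch, not part of this file; "fd" and
 * "buf" are hypothetical): an aiocb whose sigevent passes the validation
 * below, i.e. completion notification via a plain signal. SIGEV_THREAD is
 * rejected with EINVAL.
 *
 *	struct aiocb cb = {
 *		.aio_fildes = fd,
 *		.aio_buf    = buf,
 *		.aio_nbytes = sizeof(buf),
 *		.aio_sigevent = {
 *			.sigev_notify = SIGEV_SIGNAL,
 *			.sigev_signo  = SIGUSR1,
 *		},
 *	};
 */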
1275static int
1276aio_sigev_validate(const struct user_sigevent *sigev)
1277{
1278 switch (sigev->sigev_notify) {
1279 case SIGEV_SIGNAL:
1280 {
1281 int signum;
1282
1283 /* make sure we have a valid signal number */
1284 signum = sigev->sigev_signo;
1285 if (signum <= 0 || signum >= NSIG ||
1286 signum == SIGKILL || signum == SIGSTOP) {
1287 return EINVAL;
1288 }
1289 }
1290 break;
1291
1292 case SIGEV_NONE:
1293 break;
1294
1295 case SIGEV_THREAD:
1296 /* Unsupported [RTS] */
1297
1298 default:
1299 return EINVAL;
1300 }
1301
1302 return 0;
1303}
1304
1305
1306/*
1307 * aio_try_enqueue_work_locked
1308 *
 1309 * Queue up the entry on the aio asynchronous work queue in priority order
 1310 * based on the relative priority of the request. We calculate the relative
 1311 * priority using the nice value of the caller.
1312 *
1313 * Parameters: procp Process queueing the I/O
1314 * entryp The work queue entry being queued
1315 * leader The work leader if any
1316 *
 1317 * Returns: Whether the enqueue was successful
1318 *
1319 * Notes: This function is used for both lio_listio and aio
1320 *
1321 * XXX: At some point, we may have to consider thread priority
1322 * rather than process priority, but we don't maintain the
1323 * adjusted priority for threads the POSIX way.
1324 *
1325 * Called with proc locked.
1326 */
1327static bool
1328aio_try_enqueue_work_locked(proc_t procp, aio_workq_entry *entryp,
1329 aio_workq_entry *leader)
1330{
1331 aio_workq_t queue = aio_entry_workq(entryp);
1332
1333 ASSERT_AIO_PROC_LOCK_OWNED(procp);
1334
1335 /* Onto proc queue */
1336 if (!aio_try_proc_insert_active_locked(procp, entryp)) {
1337 return false;
1338 }
1339
1340 if (leader) {
1341 aio_entry_ref(leader); /* consumed in do_aio_completion_and_unlock */
1342 leader->lio_pending++;
1343 entryp->lio_leader = leader;
1344 }
1345
1346 /* And work queue */
1347 aio_entry_ref(entryp); /* consumed in do_aio_completion_and_unlock */
1348 aio_workq_lock_spin(queue);
1349 aio_workq_add_entry_locked(queue, entryp);
1350 waitq_wakeup64_one(&queue->aioq_waitq, CAST_EVENT64_T(queue),
1351 THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
1352 aio_workq_unlock(queue);
1353
1354 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued) | DBG_FUNC_START,
1355 VM_KERNEL_ADDRPERM(procp), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1356 entryp->flags, entryp->aiocb.aio_fildes, 0);
1357 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued) | DBG_FUNC_END,
1358 entryp->aiocb.aio_offset, 0, entryp->aiocb.aio_nbytes, 0, 0);
1359 return true;
1360}
1361
1362
1363/*
1364 * lio_listio - initiate a list of IO requests. We process the list of
1365 * aiocbs either synchronously (mode == LIO_WAIT) or asynchronously
1366 * (mode == LIO_NOWAIT).
1367 *
1368 * The caller gets error and return status for each aiocb in the list
1369 * via aio_error and aio_return. We must keep completed requests until
1370 * released by the aio_return call.
1371 */
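/*
 * Illustrative user-space usage (editorial sketch, not part of this file;
 * the helper name read_pair is hypothetical): submit two reads as one batch
 * and wait for both with LIO_WAIT. Each aiocb must still be reaped
 * individually with aio_return().
 *
 *	#include <aio.h>
 *
 *	int
 *	read_pair(struct aiocb *a, struct aiocb *b)
 *	{
 *		struct aiocb *list[2] = { a, b };
 *
 *		a->aio_lio_opcode = LIO_READ;
 *		b->aio_lio_opcode = LIO_READ;
 *		// 0 when both complete, -1 + errno otherwise (EINTR, EAGAIN, ...)
 *		return lio_listio(LIO_WAIT, list, 2, NULL);
 *	}
 */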
1372int
1373lio_listio(proc_t p, struct lio_listio_args *uap, int *retval __unused)
1374{
1375 aio_workq_entry *entries[AIO_LISTIO_MAX] = { };
1376 user_addr_t aiocbpp[AIO_LISTIO_MAX];
1377 struct user_sigevent aiosigev = { };
1378 int result = 0;
1379 int lio_count = 0;
1380
1381 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_listio) | DBG_FUNC_START,
1382 VM_KERNEL_ADDRPERM(p), uap->nent, uap->mode, 0, 0);
1383
1384 if (!(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT)) {
1385 result = EINVAL;
1386 goto ExitRoutine;
1387 }
1388
1389 if (uap->nent < 1 || uap->nent > AIO_LISTIO_MAX) {
1390 result = EINVAL;
1391 goto ExitRoutine;
1392 }
1393
1394 /*
1395 * Use sigevent passed in to lio_listio for each of our calls, but
1396 * only do completion notification after the last request completes.
1397 */
1398 if (uap->sigp != USER_ADDR_NULL) {
1399 result = aio_copy_in_sigev(p, uap->sigp, &aiosigev);
1400 if (result) {
1401 goto ExitRoutine;
1402 }
1403 result = aio_sigev_validate(&aiosigev);
1404 if (result) {
1405 goto ExitRoutine;
1406 }
1407 }
1408
1409 if (aio_copy_in_list(p, uap->aiocblist, aiocbpp, uap->nent)) {
1410 result = EAGAIN;
1411 goto ExitRoutine;
1412 }
1413
1414 /*
1415 * allocate/parse all entries
1416 */
1417 for (int i = 0; i < uap->nent; i++) {
1418 aio_workq_entry *entryp;
1419
1420 /* NULL elements are legal so check for 'em */
1421 if (aiocbpp[i] == USER_ADDR_NULL) {
1422 continue;
1423 }
1424
1425 entryp = aio_create_queue_entry(p, aiocbpp[i], AIO_LIO);
1426 if (entryp == NULL) {
1427 result = EAGAIN;
1428 goto ExitRoutine;
1429 }
1430
1431 /*
1432 * This refcount is cleaned up on exit if the entry
1433 * isn't submitted
1434 */
1435 entries[lio_count++] = entryp;
1436 if (uap->mode == LIO_NOWAIT) {
 1437 /* Set signal handler, if any */
1438 entryp->aiocb.aio_sigevent = aiosigev;
1439 }
1440 }
1441
1442 if (lio_count == 0) {
1443 /* There's nothing to submit */
1444 goto ExitRoutine;
1445 }
1446
1447 /*
 1448 * Past this point we're committed and will not bail out
1449 *
1450 * - keep a reference on the leader for LIO_WAIT
1451 * - perform the submissions and optionally wait
1452 */
1453
1454 aio_workq_entry *leader = entries[0];
1455 if (uap->mode == LIO_WAIT) {
1456 aio_entry_ref(leader); /* consumed below */
1457 }
1458
1459 aio_proc_lock_spin(p);
1460
1461 for (int i = 0; i < lio_count; i++) {
1462 if (aio_try_enqueue_work_locked(p, entries[i], leader)) {
1463 entries[i] = NULL; /* the entry was submitted */
1464 } else {
1465 result = EAGAIN;
1466 }
1467 }
1468
1469 if (uap->mode == LIO_WAIT && result == 0) {
1470 leader->flags |= AIO_LIO_WAIT;
1471
1472 while (leader->lio_pending) {
1473 /* If we were interrupted, fail out (even if all finished) */
1474 if (msleep(leader, aio_proc_mutex(p),
1475 PCATCH | PRIBIO | PSPIN, "lio_listio", 0) != 0) {
1476 result = EINTR;
1477 break;
1478 }
1479 }
1480
1481 leader->flags &= ~AIO_LIO_WAIT;
1482 }
1483
1484 aio_proc_unlock(p);
1485
1486 if (uap->mode == LIO_WAIT) {
1487 aio_entry_unref(leader);
1488 }
1489
1490ExitRoutine:
1491 /* Consume unsubmitted entries */
1492 for (int i = 0; i < lio_count; i++) {
1493 if (entries[i]) {
1494 aio_entry_unref(entries[i]);
1495 }
1496 }
1497
1498 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_listio) | DBG_FUNC_END,
1499 VM_KERNEL_ADDRPERM(p), result, 0, 0, 0);
1500
1501 return result;
1502}
1503
1504
1505/*
1506 * aio worker thread. this is where all the real work gets done.
1507 * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
1508 * after new work is queued up.
1509 */
1510__attribute__((noreturn))
1511static void
1512aio_work_thread(void *arg __unused, wait_result_t wr __unused)
1513{
1514 aio_workq_entry *entryp;
1515 int error;
1516 vm_map_t currentmap;
1517 vm_map_t oldmap = VM_MAP_NULL;
1518 task_t oldaiotask = TASK_NULL;
1519 struct uthread *uthreadp = NULL;
1520 proc_t p = NULL;
1521
1522 for (;;) {
1523 /*
1524 * returns with the entry ref'ed.
1525 * sleeps until work is available.
1526 */
1527 entryp = aio_get_some_work();
1528 p = entryp->procp;
1529
1530 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread) | DBG_FUNC_START,
1531 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1532 entryp->flags, 0, 0);
1533
1534 /*
1535 * Assume the target's address space identity for the duration
1536 * of the IO. Note: don't need to have the entryp locked,
1537 * because the proc and map don't change until it's freed.
1538 */
1539 currentmap = get_task_map((current_proc())->task);
1540 if (currentmap != entryp->aio_map) {
1541 uthreadp = (struct uthread *) get_bsdthread_info(current_thread());
1542 oldaiotask = uthreadp->uu_aio_task;
1543 /*
1544 * workq entries at this stage cause _aio_exec() and _aio_exit() to
1545 * block until we hit `do_aio_completion_and_unlock()` below,
1546 * which means that it is safe to dereference p->task without
1547 * holding a lock or taking references.
1548 */
1549 uthreadp->uu_aio_task = p->task;
1550 oldmap = vm_map_switch(entryp->aio_map);
1551 }
1552
1553 if ((entryp->flags & AIO_READ) != 0) {
1554 error = do_aio_read(entryp);
1555 } else if ((entryp->flags & AIO_WRITE) != 0) {
1556 error = do_aio_write(entryp);
1557 } else if ((entryp->flags & (AIO_FSYNC | AIO_DSYNC)) != 0) {
1558 error = do_aio_fsync(entryp);
1559 } else {
1560 error = EINVAL;
1561 }
1562
1563 /* Restore old map */
1564 if (currentmap != entryp->aio_map) {
1565 vm_map_switch(oldmap);
1566 uthreadp->uu_aio_task = oldaiotask;
1567 }
1568
1569 /* liberate unused map */
1570 vm_map_deallocate(entryp->aio_map);
1571 entryp->aio_map = VM_MAP_NULL;
1572
 1573 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread) | DBG_FUNC_END,
1574 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1575 entryp->errorval, entryp->returnval, 0);
1576
1577 /* we're done with the IO request so pop it off the active queue and */
1578 /* push it on the done queue */
1579 aio_proc_lock(p);
1580 entryp->errorval = error;
1581 do_aio_completion_and_unlock(p, entryp);
1582 }
1583}
1584
1585
1586/*
1587 * aio_get_some_work - get the next async IO request that is ready to be executed.
1588 * aio_fsync complicates matters a bit since we cannot do the fsync until all async
1589 * IO requests at the time the aio_fsync call came in have completed.
1590 * NOTE - AIO_LOCK must be held by caller
1591 */
1592static aio_workq_entry *
1593aio_get_some_work(void)
1594{
1595 aio_workq_entry *entryp = NULL;
1596 aio_workq_t queue = NULL;
1597
1598 /* Just one queue for the moment. In the future there will be many. */
1599 queue = &aio_anchor.aio_async_workqs[0];
1600 aio_workq_lock_spin(queue);
1601
1602 /*
1603 * Hold the queue lock.
1604 *
1605 * pop some work off the work queue and add to our active queue
1606 * Always start with the queue lock held.
1607 */
1608 while ((entryp = TAILQ_FIRST(&queue->aioq_entries))) {
1609 /*
 1610 * Pull off of the work queue. Once it's off, it can't be cancelled,
1611 * so we can take our ref once we drop the queue lock.
1612 */
1613
1614 aio_workq_remove_entry_locked(queue, entryp);
1615
1616 aio_workq_unlock(queue);
1617
1618 /*
1619 * Check if it's an fsync that must be delayed. No need to lock the entry;
1620 * that flag would have been set at initialization.
1621 */
1622 if ((entryp->flags & AIO_FSYNC) != 0) {
1623 /*
1624 * Check for unfinished operations on the same file
1625 * in this proc's queue.
1626 */
1627 aio_proc_lock_spin(entryp->procp);
1628 if (aio_delay_fsync_request(entryp)) {
1629 /* It needs to be delayed. Put it back on the end of the work queue */
1630 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay) | DBG_FUNC_NONE,
 1631 VM_KERNEL_ADDRPERM(entryp->procp), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1632 0, 0, 0);
1633
1634 aio_proc_unlock(entryp->procp);
1635
1636 aio_workq_lock_spin(queue);
1637 aio_workq_add_entry_locked(queue, entryp);
1638 continue;
1639 }
1640 aio_proc_unlock(entryp->procp);
1641 }
1642
1643 return entryp;
1644 }
1645
1646 /* We will wake up when someone enqueues something */
1647 waitq_assert_wait64(&queue->aioq_waitq, CAST_EVENT64_T(queue), THREAD_UNINT, 0);
1648 aio_workq_unlock(queue);
1649 thread_block(aio_work_thread);
1650
1651 __builtin_unreachable();
1652}
1653
1654/*
1655 * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed.
 1656 * A big, simple hammer: only send it off if it is the oldest IO filed by this
 1657 * process that has not yet completed (i.e. everything queued before it is done).
1658 */
1659static boolean_t
1660aio_delay_fsync_request(aio_workq_entry *entryp)
1661{
1662 if (proc_in_teardown(entryp->procp)) {
1663 /*
1664 * we can't delay FSYNCS when in teardown as it will confuse _aio_exit,
1665 * if it was dequeued, then we must now commit to it
1666 */
1667 return FALSE;
1668 }
1669
1670 if (entryp == TAILQ_FIRST(&entryp->procp->p_aio_activeq)) {
1671 return FALSE;
1672 }
1673
1674 return TRUE;
1675}
1676
1677static aio_workq_entry *
1678aio_create_queue_entry(proc_t procp, user_addr_t aiocbp, aio_entry_flags_t flags)
1679{
1680 aio_workq_entry *entryp;
1681
1682 entryp = zalloc_flags(aio_workq_zonep, Z_WAITOK | Z_ZERO);
1683 entryp->procp = procp;
1684 entryp->uaiocbp = aiocbp;
1685 entryp->flags = flags;
1686 /* consumed in aio_return or _aio_exit */
1687 os_ref_init(&entryp->aio_refcount, &aio_refgrp);
1688
1689 if (proc_is64bit(procp)) {
1690 struct user64_aiocb aiocb64;
1691
1692 if (copyin(aiocbp, &aiocb64, sizeof(aiocb64)) != 0) {
1693 goto error_exit;
1694 }
1695 do_munge_aiocb_user64_to_user(&aiocb64, &entryp->aiocb);
1696 } else {
1697 struct user32_aiocb aiocb32;
1698
1699 if (copyin(aiocbp, &aiocb32, sizeof(aiocb32)) != 0) {
1700 goto error_exit;
1701 }
1702 do_munge_aiocb_user32_to_user(&aiocb32, &entryp->aiocb);
1703 }
1704
1705 /* do some more validation on the aiocb and embedded file descriptor */
1706 if (aio_validate(procp, entryp) != 0) {
1707 goto error_exit;
1708 }
1709
1710 /* get a reference to the user land map in order to keep it around */
1711 entryp->aio_map = get_task_map(procp->task);
1712 vm_map_reference(entryp->aio_map);
1713
1714 /* get a reference on the current_thread, which is passed in vfs_context. */
1715 entryp->thread = current_thread();
1716 thread_reference(entryp->thread);
1717 return entryp;
1718
1719error_exit:
1720 zfree(aio_workq_zonep, entryp);
1721 return NULL;
1722}
1723
1724
1725/*
1726 * aio_queue_async_request - queue up an async IO request on our work queue then
1727 * wake up one of our worker threads to do the actual work. We get a reference
1728 * to our caller's user land map in order to keep it around while we are
1729 * processing the request.
1730 */
1731static int
1732aio_queue_async_request(proc_t procp, user_addr_t aiocbp,
1733 aio_entry_flags_t flags)
1734{
1735 aio_workq_entry *entryp;
1736 int result;
1737
1738 entryp = aio_create_queue_entry(procp, aiocbp, flags);
1739 if (entryp == NULL) {
1740 result = EAGAIN;
1741 goto error_noalloc;
1742 }
1743
1744 aio_proc_lock_spin(procp);
1745 if (!aio_try_enqueue_work_locked(procp, entryp, NULL)) {
1746 result = EAGAIN;
1747 goto error_exit;
1748 }
1749 aio_proc_unlock(procp);
1750 return 0;
1751
1752error_exit:
1753 /*
1754 * This entry has not been queued up so no worries about
1755 * unlocked state and aio_map
1756 */
1757 aio_proc_unlock(procp);
1758 aio_free_request(entryp);
1759error_noalloc:
1760 return result;
1761}
1762
1763
1764/*
1765 * aio_free_request - remove our reference on the user land map and
1766 * free the work queue entry resources. The entry is off all lists
1767 * and has zero refcount, so no one can have a pointer to it.
1768 */
1769static void
1770aio_free_request(aio_workq_entry *entryp)
1771{
1772 if (entryp->aio_proc_link.tqe_prev || entryp->aio_workq_link.tqe_prev) {
1773 panic("aio_workq_entry %p being freed while still enqueued", entryp);
1774 }
1775
1776 /* remove our reference to the user land map. */
1777 if (VM_MAP_NULL != entryp->aio_map) {
1778 vm_map_deallocate(entryp->aio_map);
1779 }
1780
1781 /* remove our reference to thread which enqueued the request */
1782 if (NULL != entryp->thread) {
1783 thread_deallocate(entryp->thread);
1784 }
1785
1786 zfree(aio_workq_zonep, entryp);
1787}
1788
1789
1790/*
1791 * aio_validate
1792 *
1793 * validate the aiocb passed in by one of the aio syscalls.
1794 */
1795static int
1796aio_validate(proc_t p, aio_workq_entry *entryp)
1797{
1798 struct fileproc *fp;
1799 int flag;
1800 int result;
1801
1802 result = 0;
1803
1804 if ((entryp->flags & AIO_LIO) != 0) {
1805 if (entryp->aiocb.aio_lio_opcode == LIO_READ) {
1806 entryp->flags |= AIO_READ;
1807 } else if (entryp->aiocb.aio_lio_opcode == LIO_WRITE) {
1808 entryp->flags |= AIO_WRITE;
1809 } else if (entryp->aiocb.aio_lio_opcode == LIO_NOP) {
1810 return 0;
1811 } else {
1812 return EINVAL;
1813 }
1814 }
1815
1816 flag = FREAD;
1817 if ((entryp->flags & (AIO_WRITE | AIO_FSYNC | AIO_DSYNC)) != 0) {
1818 flag = FWRITE;
1819 }
1820
1821 if ((entryp->flags & (AIO_READ | AIO_WRITE)) != 0) {
1822 if (entryp->aiocb.aio_nbytes > INT_MAX ||
1823 entryp->aiocb.aio_buf == USER_ADDR_NULL ||
1824 entryp->aiocb.aio_offset < 0) {
1825 return EINVAL;
1826 }
1827 }
1828
1829 result = aio_sigev_validate(&entryp->aiocb.aio_sigevent);
1830 if (result) {
1831 return result;
1832 }
1833
1834 /* validate the file descriptor and that the file was opened
1835 * for the appropriate read / write access.
1836 */
1837 proc_fdlock(p);
1838
1839 fp = fp_get_noref_locked(p, entryp->aiocb.aio_fildes);
1840 if (fp == NULL) {
1841 result = EBADF;
1842 } else if ((fp->fp_glob->fg_flag & flag) == 0) {
1843 /* we don't have read or write access */
1844 result = EBADF;
1845 } else if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_VNODE) {
1846 /* this is not a file */
1847 result = ESPIPE;
1848 } else {
1849 fp->fp_flags |= FP_AIOISSUED;
1850 }
1851
1852 proc_fdunlock(p);
1853
1854 return result;
1855}
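
/*
 * Illustrative user-space sketch (under #if 0, never compiled): a two-entry
 * lio_listio() submission. The aio_lio_opcode values LIO_READ / LIO_WRITE /
 * LIO_NOP set here are what the AIO_LIO branch of aio_validate() above maps
 * onto the AIO_READ / AIO_WRITE entry flags. The descriptors rfd and wfd are
 * assumed to already be open with the matching access modes.
 */
#if 0
#include <aio.h>
#include <string.h>

int
example_lio(int rfd, int wfd, char *rbuf, char *wbuf, size_t len)
{
	struct aiocb rcb, wcb;
	struct aiocb *list[2] = { &rcb, &wcb };

	memset(&rcb, 0, sizeof(rcb));
	rcb.aio_fildes = rfd;
	rcb.aio_buf = rbuf;
	rcb.aio_nbytes = len;
	rcb.aio_lio_opcode = LIO_READ;

	memset(&wcb, 0, sizeof(wcb));
	wcb.aio_fildes = wfd;
	wcb.aio_buf = wbuf;
	wcb.aio_nbytes = len;
	wcb.aio_lio_opcode = LIO_WRITE;

	/* LIO_WAIT blocks until every entry in the list has completed */
	return lio_listio(LIO_WAIT, list, 2, NULL);
}
#endif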
1856
1857/*
1858 * do_aio_completion_and_unlock - handle async IO completion.
1859 */
1860static void
1861do_aio_completion_and_unlock(proc_t p, aio_workq_entry *entryp)
1862{
1863 aio_workq_entry *leader = entryp->lio_leader;
1864 int lio_pending = 0;
1865 bool do_signal = false;
1866
1867 ASSERT_AIO_PROC_LOCK_OWNED(p);
1868
1869 aio_proc_move_done_locked(p, entryp);
1870
1871 if (leader) {
1872 lio_pending = --leader->lio_pending;
1873 if (lio_pending < 0) {
1874 panic("lio_pending accounting mistake");
1875 }
1876 if (lio_pending == 0 && (leader->flags & AIO_LIO_WAIT)) {
1877 wakeup(leader);
1878 }
1879 entryp->lio_leader = NULL; /* no dangling pointers please */
1880 }
1881
1882 /*
1883 * We need to handle the case where a process is trying to exit, exec, or
1884 * close and is currently waiting for active aio requests to complete.
1885 * If AIO_CLEANUP_WAIT is set then we need to look to see if there are any
1886 * other requests in the active queue for this process. If there are
1887 * none, then wake up using the AIO_CLEANUP_SLEEP_CHAN tsleep channel.
1888 * If there are some still active then do nothing - we only want to
1889 * wake up when all active aio requests for the process are complete.
1890 */
1891 if (__improbable(entryp->flags & AIO_EXIT_WAIT)) {
1892 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait) | DBG_FUNC_NONE,
1893 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1894 0, 0, 0);
1895
1896 if (!aio_has_active_requests_for_process(p)) {
1897 /*
1898 * no active aio requests for this process, continue exiting. In this
1899 * case, there should be no one else waiting on the proc in AIO...
1900 */
1901 wakeup_one((caddr_t)&p->AIO_CLEANUP_SLEEP_CHAN);
1902
1903 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake) | DBG_FUNC_NONE,
1904 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1905 0, 0, 0);
1906 }
1907 } else if (entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
1908 /*
1909 * If this was the last request in the group, or was not part of
1910 * a group, and a signal is desired, send one.
1911 */
1912 do_signal = (lio_pending == 0);
1913 }
1914
1915 if (__improbable(entryp->flags & AIO_CLOSE_WAIT)) {
1916 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait) | DBG_FUNC_NONE,
1917 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1918 0, 0, 0);
1919
1920 if (!aio_proc_has_active_requests_for_file(p, entryp->aiocb.aio_fildes)) {
1921 /* Can't wakeup_one(); multiple closes might be in progress. */
1922 wakeup(&p->AIO_CLEANUP_SLEEP_CHAN);
1923
1924 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake) | DBG_FUNC_NONE,
1925 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1926 0, 0, 0);
1927 }
1928 }
1929
1930 aio_proc_unlock(p);
1931
1932 if (do_signal) {
1933 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig) | DBG_FUNC_NONE,
1934 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1935 entryp->aiocb.aio_sigevent.sigev_signo, 0, 0);
1936
1937 psignal(p, entryp->aiocb.aio_sigevent.sigev_signo);
1938 }
1939
1940 /*
1941 * A thread in aio_suspend() wants to know about completed IOs. If it checked
1942 * the done list before we moved our AIO there, then it already asserted its wait,
1943 * and we can wake it up without holding the lock. If it checked the list after
1944 * we did our move, then it has already seen the AIO that we moved. Either way,
1945 * we can do our wakeup without holding the lock.
1946 */
1947 wakeup(&p->AIO_SUSPEND_SLEEP_CHAN);
1948 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake) | DBG_FUNC_NONE,
1949 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp), 0, 0, 0);
1950
1951 aio_entry_unref(entryp); /* see aio_try_enqueue_work_locked */
1952 if (leader) {
1953 aio_entry_unref(leader); /* see lio_listio */
1954 }
1955}
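
/*
 * Illustrative user-space sketch (under #if 0, never compiled): the two ways
 * a caller observes the completion that do_aio_completion_and_unlock()
 * reports - a queued signal (SIGEV_SIGNAL, delivered by the psignal() above)
 * and a blocking aio_suspend() (woken via AIO_SUSPEND_SLEEP_CHAN). The
 * choice of SIGUSR1 and the empty handler are arbitrary.
 */
#if 0
#include <aio.h>
#include <signal.h>
#include <string.h>

static void
on_aio_done(int signo)
{
	(void)signo;    /* real code would do only async-signal-safe work here */
}

int
example_completion_wait(int fd, char *buf, size_t len)
{
	struct aiocb cb;
	const struct aiocb *list[1] = { &cb };

	signal(SIGUSR1, on_aio_done);

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_buf = buf;
	cb.aio_nbytes = len;
	cb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
	cb.aio_sigevent.sigev_signo = SIGUSR1;

	if (aio_read(&cb) != 0) {
		return -1;
	}

	/* block until the request reaches the done queue */
	if (aio_suspend(list, 1, NULL) != 0) {
		return -1;
	}
	return (int)aio_return(&cb);
}
#endif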
1956
1957
1958/*
1959 * do_aio_read
1960 */
1961static int
1962do_aio_read(aio_workq_entry *entryp)
1963{
1964 struct proc *p = entryp->procp;
1965 struct fileproc *fp;
1966 int error;
1967
1968 if ((error = fp_lookup(p, entryp->aiocb.aio_fildes, &fp, 0))) {
1969 return error;
1970 }
1971
1972 if (fp->fp_glob->fg_flag & FREAD) {
1973 struct vfs_context context = {
1974 .vc_thread = entryp->thread, /* XXX */
1975 .vc_ucred = fp->fp_glob->fg_cred,
1976 };
1977
1978 error = dofileread(&context, fp,
1979 entryp->aiocb.aio_buf,
1980 entryp->aiocb.aio_nbytes,
1981 entryp->aiocb.aio_offset, FOF_OFFSET,
1982 &entryp->returnval);
1983 } else {
1984 error = EBADF;
1985 }
1986
1987 fp_drop(p, entryp->aiocb.aio_fildes, fp, 0);
1988 return error;
1989}
1990
1991
1992/*
1993 * do_aio_write
1994 */
1995static int
1996do_aio_write(aio_workq_entry *entryp)
1997{
1998 struct proc *p = entryp->procp;
1999 struct fileproc *fp;
2000 int error;
2001
2002 if ((error = fp_lookup(p, entryp->aiocb.aio_fildes, &fp, 0))) {
2003 return error;
2004 }
2005
2006 if (fp->fp_glob->fg_flag & FWRITE) {
2007 struct vfs_context context = {
2008 .vc_thread = entryp->thread, /* XXX */
2009 .vc_ucred = fp->fp_glob->fg_cred,
2010 };
2011 int flags = FOF_PCRED;
2012
2013 if ((fp->fp_glob->fg_flag & O_APPEND) == 0) {
2014 flags |= FOF_OFFSET;
2015 }
2016
2017 /* NB: tell dofilewrite the offset, and to use the proc cred */
2018 error = dofilewrite(&context,
2019 fp,
2020 entryp->aiocb.aio_buf,
2021 entryp->aiocb.aio_nbytes,
2022 entryp->aiocb.aio_offset,
2023 flags,
2024 &entryp->returnval);
2025 } else {
2026 error = EBADF;
2027 }
2028
2029 fp_drop(p, entryp->aiocb.aio_fildes, fp, 0);
2030 return error;
2031}
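
/*
 * Illustrative user-space sketch (under #if 0, never compiled): an aio_write()
 * against a descriptor opened with O_APPEND. Because do_aio_write() above
 * omits FOF_OFFSET in that case, the supplied aio_offset is not used and the
 * data is appended at end-of-file. The log path is hypothetical.
 */
#if 0
#include <aio.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int
example_append_write(const char *msg)
{
	struct aiocb cb;
	const struct aiocb *list[1] = { &cb };
	int fd = open("/tmp/example.log", O_WRONLY | O_APPEND | O_CREAT, 0644);

	if (fd < 0) {
		return -1;
	}

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_buf = (volatile void *)msg;      /* aio_buf is volatile void *, cast drops const */
	cb.aio_nbytes = strlen(msg);
	cb.aio_offset = 0;                      /* ignored because of O_APPEND */

	if (aio_write(&cb) != 0) {
		close(fd);
		return -1;
	}

	(void)aio_suspend(list, 1, NULL);       /* wait so cb stays valid until completion */
	ssize_t n = aio_return(&cb);
	close(fd);
	return n < 0 ? -1 : 0;
}
#endif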
2032
2033
2034/*
2035 * aio_has_active_requests_for_process - return whether the process has active
2036 * requests pending.
2037 */
2038static bool
2039aio_has_active_requests_for_process(proc_t procp)
2040{
2041 return !TAILQ_EMPTY(&procp->p_aio_activeq);
2042}
2043
2044/*
2045 * Called with the proc locked.
2046 */
2047static bool
2048aio_proc_has_active_requests_for_file(proc_t procp, int fd)
2049{
2050 aio_workq_entry *entryp;
2051
2052 TAILQ_FOREACH(entryp, &procp->p_aio_activeq, aio_proc_link) {
2053 if (entryp->aiocb.aio_fildes == fd) {
2054 return true;
2055 }
2056 }
2057
2058 return false;
2059}
2060
2061
2062/*
2063 * do_aio_fsync
2064 */
2065static int
2066do_aio_fsync(aio_workq_entry *entryp)
2067{
2068 struct proc *p = entryp->procp;
2069 struct vnode *vp;
2070 struct fileproc *fp;
2071 int sync_flag;
2072 int error;
2073
2074 /*
2075 * We are never called unless either AIO_FSYNC or AIO_DSYNC is set.
2076 *
2077 * If AIO_DSYNC is set, we can tell the lower layers that it is OK
2078 * to mark for update the metadata not strictly necessary for data
2079 * retrieval, rather than forcing it to disk.
2080 *
2081 * If AIO_FSYNC is set, we also have to wait until metadata not strictly
2082 * necessary for data retrieval has been committed to stable storage (e.g.
2083 * atime, mtime, ctime, etc.).
2084 *
2085 * Metadata necessary for data retrieval must be committed to stable
2086 * storage in either case (file length, etc.).
2087 */
2088 if (entryp->flags & AIO_FSYNC) {
2089 sync_flag = MNT_WAIT;
2090 } else {
2091 sync_flag = MNT_DWAIT;
2092 }
2093
2094 error = fp_get_ftype(p, entryp->aiocb.aio_fildes, DTYPE_VNODE, ENOTSUP, &fp);
2095 if (error != 0) {
2096 entryp->returnval = -1;
2097 return error;
2098 }
2099 vp = fp->fp_glob->fg_data;
2100
2101 if ((error = vnode_getwithref(vp)) == 0) {
2102 struct vfs_context context = {
2103 .vc_thread = entryp->thread, /* XXX */
2104 .vc_ucred = fp->fp_glob->fg_cred,
2105 };
2106
2107 error = VNOP_FSYNC(vp, sync_flag, &context);
2108
2109 (void)vnode_put(vp);
2110 } else {
2111 entryp->returnval = -1;
2112 }
2113
2114 fp_drop(p, entryp->aiocb.aio_fildes, fp, 0);
2115 return error;
2116}
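
/*
 * Illustrative user-space sketch (under #if 0, never compiled): the op
 * argument of aio_fsync() chooses between the two cases handled in
 * do_aio_fsync() above - O_SYNC for a full fsync (AIO_FSYNC / MNT_WAIT) and,
 * where available, O_DSYNC for data plus only the metadata needed to
 * retrieve it (AIO_DSYNC / MNT_DWAIT). The mapping from op to those flags is
 * done by the syscall entry point, which is not part of this excerpt.
 */
#if 0
#include <aio.h>
#include <fcntl.h>
#include <string.h>

int
example_aio_fsync(int fd, int data_only)
{
	struct aiocb cb;

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;

	/*
	 * Returns 0 once the sync has been queued, not when it has completed;
	 * completion is observed with aio_error() / aio_return() as usual.
	 */
	return aio_fsync(data_only ? O_DSYNC : O_SYNC, &cb);
}
#endif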
2117
2118
2119/*
2120 * is_already_queued - runs through our queues to see if the given
2121 * aiocbp / process is there. Returns TRUE if there is a match
2122 * on any of our aio queues.
2123 *
2124 * Called with proc aio lock held (can be held spin)
2125 */
2126static boolean_t
2127is_already_queued(proc_t procp, user_addr_t aiocbp)
2128{
2129 aio_workq_entry *entryp;
2130 boolean_t result;
2131
2132 result = FALSE;
2133
2134 /* look for matches on our queue of async IO requests that have completed */
2135 TAILQ_FOREACH(entryp, &procp->p_aio_doneq, aio_proc_link) {
2136 if (aiocbp == entryp->uaiocbp) {
2137 result = TRUE;
2138 goto ExitThisRoutine;
2139 }
2140 }
2141
2142 /* look for matches on our queue of active async IO requests */
2143 TAILQ_FOREACH(entryp, &procp->p_aio_activeq, aio_proc_link) {
2144 if (aiocbp == entryp->uaiocbp) {
2145 result = TRUE;
2146 goto ExitThisRoutine;
2147 }
2148 }
2149
2150ExitThisRoutine:
2151 return result;
2152}
2153
2154
2155/*
2156 * aio initialization
2157 */
2158__private_extern__ void
2159aio_init(void)
2160{
2161 for (int i = 0; i < AIO_NUM_WORK_QUEUES; i++) {
2162 aio_workq_init(&aio_anchor.aio_async_workqs[i]);
2163 }
2164
2165 _aio_create_worker_threads(aio_worker_threads);
2166}
2167
2168
2169/*
2170 * aio worker threads created here.
2171 */
2172__private_extern__ void
2173_aio_create_worker_threads(int num)
2174{
2175 int i;
2176
2177 /* create some worker threads to handle the async IO requests */
2178 for (i = 0; i < num; i++) {
2179 thread_t myThread;
2180
2181 if (KERN_SUCCESS != kernel_thread_start(aio_work_thread, NULL, &myThread)) {
2182 printf("%s - failed to create a work thread \n", __FUNCTION__);
2183 } else {
2184 thread_deallocate(myThread);
2185 }
2186 }
2187}
2188
2189/*
2190 * Return the current activation utask
2191 */
2192task_t
2193get_aiotask(void)
2194{
2195 return ((struct uthread *)get_bsdthread_info(current_thread()))->uu_aio_task;
2196}
2197
2198
2199/*
2200 * In the case of an aiocb from a
2201 * 32-bit process we need to expand some longs and pointers to the correct
2202 * sizes in order to let downstream code always work on the same type of
2203 * aiocb (in our case that is a user_aiocb)
2204 */
2205static void
2206do_munge_aiocb_user32_to_user(struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp)
2207{
2208 the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2209 the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2210 the_user_aiocbp->aio_buf = CAST_USER_ADDR_T(my_aiocbp->aio_buf);
2211 the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2212 the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2213 the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
2214
2215 /* special case here. since we do not know if sigev_value is an */
2216 /* int or a ptr we do NOT cast the ptr to a user_addr_t. This */
2217 /* means if we send this info back to user space we need to remember */
2218 /* sigev_value was not expanded for the 32-bit case. */
2219 /* NOTE - this does NOT affect us since we don't support sigev_value */
2220 /* yet in the aio context. */
2221 //LP64
2222 the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2223 the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
2224 the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2225 my_aiocbp->aio_sigevent.sigev_value.sival_int;
2226 the_user_aiocbp->aio_sigevent.sigev_notify_function =
2227 CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function);
2228 the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2229 CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_attributes);
2230}
2231
2232/* Similar for 64-bit user process, so that we don't need to satisfy
2233 * the alignment constraints of the original user64_aiocb
2234 */
2235#if !__LP64__
2236__dead2
2237#endif
2238static void
2239do_munge_aiocb_user64_to_user(struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp)
2240{
2241#if __LP64__
2242 the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2243 the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2244 the_user_aiocbp->aio_buf = my_aiocbp->aio_buf;
2245 the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2246 the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2247 the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
2248
2249 the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2250 the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
2251 the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2252 my_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int;
2253 the_user_aiocbp->aio_sigevent.sigev_notify_function =
2254 my_aiocbp->aio_sigevent.sigev_notify_function;
2255 the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2256 my_aiocbp->aio_sigevent.sigev_notify_attributes;
2257#else
2258#pragma unused(my_aiocbp, the_user_aiocbp)
2259 panic("64bit process on 32bit kernel is not supported");
2260#endif
2261}