2 * Copyright (c) 2003-2020 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
32 * 1) ramesh is looking into how to replace taking a reference on
33 * the user's map (vm_map_reference()) since it is believed that
34 * would not hold the process for us.
35 * 2) david is looking into a way for us to set the priority of the
36 * worker threads to match that of the user's thread when the
37 * async IO was queued.
42 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
45 #include <sys/systm.h>
46 #include <sys/fcntl.h>
47 #include <sys/file_internal.h>
48 #include <sys/filedesc.h>
49 #include <sys/kernel.h>
50 #include <sys/vnode_internal.h>
51 #include <sys/malloc.h>
52 #include <sys/mount_internal.h>
53 #include <sys/param.h>
54 #include <sys/proc_internal.h>
55 #include <sys/sysctl.h>
56 #include <sys/unistd.h>
59 #include <sys/aio_kern.h>
60 #include <sys/sysproto.h>
62 #include <machine/limits.h>
64 #include <mach/mach_types.h>
65 #include <kern/kern_types.h>
66 #include <kern/waitq.h>
67 #include <kern/zalloc.h>
68 #include <kern/task.h>
69 #include <kern/sched_prim.h>
71 #include <vm/vm_map.h>
73 #include <os/refcnt.h>
75 #include <sys/kdebug.h>
76 #define AIO_work_queued 1
77 #define AIO_worker_wake 2
78 #define AIO_completion_sig 3
79 #define AIO_completion_cleanup_wait 4
80 #define AIO_completion_cleanup_wake 5
81 #define AIO_completion_suspend_wake 6
82 #define AIO_fsync_delay 7
84 #define AIO_cancel_async_workq 11
85 #define AIO_cancel_sync_workq 12
86 #define AIO_cancel_activeq 13
87 #define AIO_cancel_doneq 14
93 #define AIO_error_val 61
94 #define AIO_error_activeq 62
95 #define AIO_error_workq 63
97 #define AIO_return_val 71
98 #define AIO_return_activeq 72
99 #define AIO_return_workq 73
102 #define AIO_exit_sleep 91
103 #define AIO_close 100
104 #define AIO_close_sleep 101
105 #define AIO_suspend 110
106 #define AIO_suspend_sleep 111
107 #define AIO_worker_thread 120
109 __options_decl(aio_entry_flags_t, uint32_t, {
110	AIO_READ        = 0x00000001, /* a read */
111	AIO_WRITE       = 0x00000002, /* a write */
112	AIO_FSYNC       = 0x00000004, /* aio_fsync with op = O_SYNC */
113	AIO_DSYNC       = 0x00000008, /* aio_fsync with op = O_DSYNC (not supported yet) */
114	AIO_LIO         = 0x00000010, /* lio_listio generated IO */
115	AIO_LIO_WAIT    = 0x00000020, /* lio_listio is waiting on the leader */
118	 * These flags mean that this entry is blocking either:
119	 * - close (AIO_CLOSE_WAIT)
120	 * - exit or exec (AIO_EXIT_WAIT)
122	 * These flags are mutually exclusive, and the AIO_EXIT_WAIT variant
123	 * will also neuter notifications in do_aio_completion_and_unlock().
125	AIO_CLOSE_WAIT  = 0x00004000,
126	AIO_EXIT_WAIT   = 0x00008000,
129 /*! @struct aio_workq_entry
132 * This represents a piece of aio/lio work.
134 * The ownership rules go as follows:
136 * - the "proc" owns one refcount on the entry (from creation), while it is
137 * enqueued on the aio_activeq and then the aio_doneq.
139 * either aio_return() (user read the status) or _aio_exit() (the process
140 * died) will dequeue the entry and consume this ref.
142 * - the async workqueue owns one refcount once the work is submitted,
143 * which is consumed in do_aio_completion_and_unlock().
145 * This ref protects the entry through the end of
146 * do_aio_completion_and_unlock() (when signal delivery happens).
148 * - lio_listio() for batches picks one of the entries to be the "leader"
149 * of the batch. Each work item will have a refcount on its leader
150 * so that the accounting of the batch completion can be done on the leader
151 * (to be able to decrement lio_pending).
153 * This ref is consumed in do_aio_completion_and_unlock() as well.
155 * - lastly, in lio_listio() when the LIO_WAIT behavior is requested,
156 * an extra ref is taken in this syscall as it needs to keep accessing
157 * the leader "lio_pending" field until it hits 0.
159 struct aio_workq_entry {
161	TAILQ_ENTRY(aio_workq_entry) aio_workq_link;
164	TAILQ_ENTRY(aio_workq_entry) aio_proc_link;     /* p_aio_activeq or p_aio_doneq */
165	user_ssize_t            returnval;              /* return value from read / write request */
166	errno_t                 errorval;               /* error value from read / write request */
167	os_refcnt_t             aio_refcount;
168	aio_entry_flags_t       flags;
170	int                     lio_pending;            /* pending I/Os in lio group, only on leader */
171	struct aio_workq_entry *lio_leader;              /* pointer to the lio leader, can be self */
173	/* Initialized and never changed, safe to access */
174	struct proc            *procp;                   /* user proc that queued this request */
175	user_addr_t             uaiocbp;                /* pointer passed in from user land */
176	struct user_aiocb       aiocb;                  /* copy of aiocb from user land */
177	thread_t                thread;                 /* thread that queued this request */
179	/* Initialized, and possibly freed by aio_work_thread() or at free if cancelled */
180	vm_map_t                aio_map;                /* user land map we have a reference to */
184 * aio requests queue up on the aio_async_workq or lio_sync_workq (for
185 * lio_listio LIO_WAIT). Requests then move to the per process aio_activeq
186 * (proc.aio_activeq) when one of our worker threads start the IO.
187 * And finally, requests move to the per process aio_doneq (proc.aio_doneq)
188 * when the IO request completes. The request remains on aio_doneq until
189 * user process calls aio_return or the process exits, either way that is our
190 * trigger to release aio resources.
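/*
 * Illustrative userspace sketch (an assumed example, not part of this file):
 * the observable side of the flow described above. aio_read() corresponds to
 * the entry landing on an aio_async_workq, the EINPROGRESS phase to the
 * per-process aio_activeq, and aio_return() to reaping the entry from the
 * per-process aio_doneq.
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		static char buf[4096];
 *		struct aiocb cb;
 *		int fd = open("/etc/hosts", O_RDONLY);
 *
 *		memset(&cb, 0, sizeof(cb));
 *		cb.aio_fildes = fd;
 *		cb.aio_buf = buf;
 *		cb.aio_nbytes = sizeof(buf);
 *		cb.aio_offset = 0;
 *
 *		if (fd < 0 || aio_read(&cb) != 0) {
 *			return 1;
 *		}
 *		while (aio_error(&cb) == EINPROGRESS) {
 *			usleep(1000);
 *		}
 *		int err = aio_error(&cb);
 *		ssize_t nread = aio_return(&cb);
 *
 *		printf("error %d, returned %zd\n", err, nread);
 *		close(fd);
 *		return 0;
 *	}
 */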
192 typedef struct aio_workq {
193	TAILQ_HEAD(, aio_workq_entry) aioq_entries;
194	lck_spin_t aioq_lock;
195	struct waitq aioq_waitq;

198 #define AIO_NUM_WORK_QUEUES 1
199 struct aio_anchor_cb {
200	os_atomic(int) aio_total_count; /* total extant entries */

202	/* Hash table of queues here */
204	struct aio_workq aio_async_workqs[AIO_NUM_WORK_QUEUES];

206 typedef struct aio_anchor_cb aio_anchor_cb;
209 * Notes on aio sleep / wake channels.
210 * We currently pick a couple of fields within the proc structure that give us
211 * sleep channels which currently do not collide with any other kernel routines.
212 * At this time, for binary compatibility reasons, we cannot create new proc fields.
214 #define AIO_SUSPEND_SLEEP_CHAN p_aio_activeq
215 #define AIO_CLEANUP_SLEEP_CHAN p_aio_total_count
217 #define ASSERT_AIO_FROM_PROC(aiop, theproc) \
218 if ((aiop)->procp != (theproc)) { \
219 panic("AIO on a proc list that does not belong to that proc.\n"); \
225 static void aio_proc_lock(proc_t procp
);
226 static void aio_proc_lock_spin(proc_t procp
);
227 static void aio_proc_unlock(proc_t procp
);
228 static lck_mtx_t
*aio_proc_mutex(proc_t procp
);
229 static bool aio_has_active_requests_for_process(proc_t procp
);
230 static bool aio_proc_has_active_requests_for_file(proc_t procp
, int fd
);
231 static boolean_t
is_already_queued(proc_t procp
, user_addr_t aiocbp
);
233 static aio_workq_t
aio_entry_workq(aio_workq_entry
*entryp
);
234 static void aio_workq_remove_entry_locked(aio_workq_t queue
, aio_workq_entry
*entryp
);
235 static void aio_workq_add_entry_locked(aio_workq_t queue
, aio_workq_entry
*entryp
);
236 static void aio_entry_ref(aio_workq_entry
*entryp
);
237 static void aio_entry_unref(aio_workq_entry
*entryp
);
238 static bool aio_entry_try_workq_remove(aio_workq_entry
*entryp
);
239 static boolean_t
aio_delay_fsync_request(aio_workq_entry
*entryp
);
240 static void aio_free_request(aio_workq_entry
*entryp
);
242 static void aio_workq_init(aio_workq_t wq
);
243 static void aio_workq_lock_spin(aio_workq_t wq
);
244 static void aio_workq_unlock(aio_workq_t wq
);
245 static lck_spin_t
*aio_workq_lock(aio_workq_t wq
);
247 static void aio_work_thread(void *arg
, wait_result_t wr
);
248 static aio_workq_entry
*aio_get_some_work(void);
250 static int aio_queue_async_request(proc_t procp
, user_addr_t aiocbp
, aio_entry_flags_t
);
251 static int aio_validate(proc_t
, aio_workq_entry
*entryp
);
253 static int do_aio_cancel_locked(proc_t p
, int fd
, user_addr_t aiocbp
, aio_entry_flags_t
);
254 static void do_aio_completion_and_unlock(proc_t p
, aio_workq_entry
*entryp
);
255 static int do_aio_fsync(aio_workq_entry
*entryp
);
256 static int do_aio_read(aio_workq_entry
*entryp
);
257 static int do_aio_write(aio_workq_entry
*entryp
);
258 static void do_munge_aiocb_user32_to_user(struct user32_aiocb
*my_aiocbp
, struct user_aiocb
*the_user_aiocbp
);
259 static void do_munge_aiocb_user64_to_user(struct user64_aiocb
*my_aiocbp
, struct user_aiocb
*the_user_aiocbp
);
260 static aio_workq_entry
*aio_create_queue_entry(proc_t procp
, user_addr_t aiocbp
, aio_entry_flags_t
);
261 static int aio_copy_in_list(proc_t
, user_addr_t
, user_addr_t
*, int);
263 #define ASSERT_AIO_PROC_LOCK_OWNED(p) LCK_MTX_ASSERT(aio_proc_mutex(p), LCK_MTX_ASSERT_OWNED)
264 #define ASSERT_AIO_WORKQ_LOCK_OWNED(q) LCK_SPIN_ASSERT(aio_workq_lock(q), LCK_ASSERT_OWNED)
267 * EXTERNAL PROTOTYPES
270 /* in ...bsd/kern/sys_generic.c */
271 extern int dofileread(vfs_context_t ctx
, struct fileproc
*fp
,
272 user_addr_t bufp
, user_size_t nbyte
,
273 off_t offset
, int flags
, user_ssize_t
*retval
);
274 extern int dofilewrite(vfs_context_t ctx
, struct fileproc
*fp
,
275 user_addr_t bufp
, user_size_t nbyte
, off_t offset
,
276 int flags
, user_ssize_t
*retval
);
279 * aio external global variables.
281 extern int aio_max_requests
; /* AIO_MAX - configurable */
282 extern int aio_max_requests_per_process
; /* AIO_PROCESS_MAX - configurable */
283 extern int aio_worker_threads
; /* AIO_THREAD_COUNT - configurable */
287 * aio static variables.
289 static aio_anchor_cb aio_anchor
= {
290 .aio_num_workqs
= AIO_NUM_WORK_QUEUES
,
292 os_refgrp_decl(static, aio_refgrp
, "aio", NULL
);
293 static LCK_GRP_DECLARE(aio_proc_lock_grp
, "aio_proc");
294 static LCK_GRP_DECLARE(aio_queue_lock_grp
, "aio_queue");
295 static LCK_MTX_DECLARE(aio_proc_mtx
, &aio_proc_lock_grp
);
297 static ZONE_DECLARE(aio_workq_zonep
, "aiowq", sizeof(aio_workq_entry
),
302 aio_entry_workq(__unused aio_workq_entry
*entryp
)
304 return &aio_anchor
.aio_async_workqs
[0];
308 aio_workq_init(aio_workq_t wq
)
310 TAILQ_INIT(&wq
->aioq_entries
);
311 lck_spin_init(&wq
->aioq_lock
, &aio_queue_lock_grp
, LCK_ATTR_NULL
);
312 waitq_init(&wq
->aioq_waitq
, SYNC_POLICY_FIFO
);
317 * Can be passed a queue which is locked spin.
320 aio_workq_remove_entry_locked(aio_workq_t queue
, aio_workq_entry
*entryp
)
322 ASSERT_AIO_WORKQ_LOCK_OWNED(queue
);
324 if (entryp
->aio_workq_link
.tqe_prev
== NULL
) {
325 panic("Trying to remove an entry from a work queue, but it is not on a queue\n");
328 TAILQ_REMOVE(&queue
->aioq_entries
, entryp
, aio_workq_link
);
329 entryp
->aio_workq_link
.tqe_prev
= NULL
; /* Not on a workq */
333 aio_workq_add_entry_locked(aio_workq_t queue
, aio_workq_entry
*entryp
)
335 ASSERT_AIO_WORKQ_LOCK_OWNED(queue
);
337 TAILQ_INSERT_TAIL(&queue
->aioq_entries
, entryp
, aio_workq_link
);
341 aio_proc_lock(proc_t procp
)
343 lck_mtx_lock(aio_proc_mutex(procp
));
347 aio_proc_lock_spin(proc_t procp
)
349 lck_mtx_lock_spin(aio_proc_mutex(procp
));
353 aio_has_any_work(void)
355 return os_atomic_load(&aio_anchor
.aio_total_count
, relaxed
) != 0;
359 aio_try_proc_insert_active_locked(proc_t procp
, aio_workq_entry
*entryp
)
363 ASSERT_AIO_PROC_LOCK_OWNED(procp
);
365 if (procp
->p_aio_total_count
>= aio_max_requests_per_process
) {
369 if (is_already_queued(procp
, entryp
->uaiocbp
)) {
373 os_atomic_rmw_loop(&aio_anchor
.aio_total_count
, old
, new, relaxed
, {
374 if (old
>= aio_max_requests
) {
375 os_atomic_rmw_loop_give_up(return false);
380 TAILQ_INSERT_TAIL(&procp
->p_aio_activeq
, entryp
, aio_proc_link
);
381 procp
->p_aio_total_count
++;
386 aio_proc_move_done_locked(proc_t procp
, aio_workq_entry
*entryp
)
388 TAILQ_REMOVE(&procp
->p_aio_activeq
, entryp
, aio_proc_link
);
389 TAILQ_INSERT_TAIL(&procp
->p_aio_doneq
, entryp
, aio_proc_link
);
393 aio_proc_remove_done_locked(proc_t procp
, aio_workq_entry
*entryp
)
395 TAILQ_REMOVE(&procp
->p_aio_doneq
, entryp
, aio_proc_link
);
396 entryp
->aio_proc_link
.tqe_prev
= NULL
;
397 if (os_atomic_dec_orig(&aio_anchor
.aio_total_count
, relaxed
) <= 0) {
398 panic("Negative total AIO count!\n");
400 if (procp
->p_aio_total_count
-- <= 0) {
401 panic("proc %p: p_aio_total_count accounting mismatch", procp
);
406 aio_proc_unlock(proc_t procp
)
408 lck_mtx_unlock(aio_proc_mutex(procp
));
412 aio_proc_mutex(proc_t procp
)
414 return &procp
->p_mlock
;
418 aio_entry_ref(aio_workq_entry
*entryp
)
420 os_ref_retain(&entryp
->aio_refcount
);
424 aio_entry_unref(aio_workq_entry
*entryp
)
426 if (os_ref_release(&entryp
->aio_refcount
) == 0) {
427 aio_free_request(entryp
);
432 aio_entry_try_workq_remove(aio_workq_entry
*entryp
)
434 /* Can only be cancelled if it's still on a work queue */
435 if (entryp
->aio_workq_link
.tqe_prev
!= NULL
) {
438 /* Will have to check again under the lock */
439 queue
= aio_entry_workq(entryp
);
440 aio_workq_lock_spin(queue
);
441 if (entryp
->aio_workq_link
.tqe_prev
!= NULL
) {
442 aio_workq_remove_entry_locked(queue
, entryp
);
443 aio_workq_unlock(queue
);
446 aio_workq_unlock(queue
);
454 aio_workq_lock_spin(aio_workq_t wq
)
456 lck_spin_lock(aio_workq_lock(wq
));
460 aio_workq_unlock(aio_workq_t wq
)
462 lck_spin_unlock(aio_workq_lock(wq
));
466 aio_workq_lock(aio_workq_t wq
)
468 return &wq
->aioq_lock
;
472 * aio_cancel - attempt to cancel one or more async IO requests currently
473 * outstanding against file descriptor uap->fd. If uap->aiocbp is not
474 * NULL then only one specific IO is cancelled (if possible). If uap->aiocbp
475 * is NULL then all outstanding async IO request for the given file
476 * descriptor are cancelled (if possible).
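/*
 * Illustrative userspace sketch (an assumed example, not part of this file;
 * the helper name is hypothetical, and <aio.h>, <errno.h> and <unistd.h> are
 * assumed): callers must handle all three results, and even a cancelled
 * request is still reaped with aio_return() so the kernel can release its
 * entry (a cancelled entry reports an error of ECANCELED and a return value
 * of -1, as set in do_aio_cancel_locked() below).
 *
 *	static ssize_t
 *	cancel_one(int fd, struct aiocb *cb)
 *	{
 *		int how = aio_cancel(fd, cb);
 *
 *		if (how == -1) {
 *			return -1;
 *		}
 *		if (how == AIO_NOTCANCELED) {
 *			while (aio_error(cb) == EINPROGRESS) {
 *				usleep(1000);
 *			}
 *		}
 *		return aio_return(cb);
 *	}
 */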
479 aio_cancel(proc_t p
, struct aio_cancel_args
*uap
, int *retval
)
481 struct user_aiocb my_aiocb
;
484 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_cancel
) | DBG_FUNC_START
,
485 VM_KERNEL_ADDRPERM(p
), uap
->aiocbp
, 0, 0, 0);
487 /* quick check to see if there are any async IO requests queued up */
488 if (!aio_has_any_work()) {
490 *retval
= AIO_ALLDONE
;
495 if (uap
->aiocbp
!= USER_ADDR_NULL
) {
496 if (proc_is64bit(p
)) {
497 struct user64_aiocb aiocb64
;
499 result
= copyin(uap
->aiocbp
, &aiocb64
, sizeof(aiocb64
));
501 do_munge_aiocb_user64_to_user(&aiocb64
, &my_aiocb
);
504 struct user32_aiocb aiocb32
;
506 result
= copyin(uap
->aiocbp
, &aiocb32
, sizeof(aiocb32
));
508 do_munge_aiocb_user32_to_user(&aiocb32
, &my_aiocb
);
517 /* NOTE - POSIX standard says a mismatch between the file */
518 /* descriptor passed in and the file descriptor embedded in */
519 /* the aiocb causes unspecified results. We return EBADF in */
520 /* that situation. */
521 if (uap
->fd
!= my_aiocb
.aio_fildes
) {
528 result
= do_aio_cancel_locked(p
, uap
->fd
, uap
->aiocbp
, 0);
529 ASSERT_AIO_PROC_LOCK_OWNED(p
);
541 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_cancel
) | DBG_FUNC_END
,
542 VM_KERNEL_ADDRPERM(p
), uap
->aiocbp
, result
, 0, 0);
549 * _aio_close - internal function used to clean up async IO requests for
550 * a file descriptor that is closing.
553 __private_extern__
void
554 _aio_close(proc_t p
, int fd
)
558 /* quick check to see if there are any async IO requests queued up */
559 if (!aio_has_any_work()) {
563 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_close
) | DBG_FUNC_START
,
564 VM_KERNEL_ADDRPERM(p
), fd
, 0, 0, 0);
566 /* cancel all async IO requests on our todo queues for this file descriptor */
568 error
= do_aio_cancel_locked(p
, fd
, USER_ADDR_NULL
, AIO_CLOSE_WAIT
);
569 ASSERT_AIO_PROC_LOCK_OWNED(p
);
570 if (error
== AIO_NOTCANCELED
) {
572 * AIO_NOTCANCELED is returned when we find an aio request for this process
573 * and file descriptor on the active async IO queue. Active requests cannot
574 * be cancelled so we must wait for them to complete. We will get a special
575 * wake up call on our channel used to sleep for ALL active requests to
576 * complete. This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
577 * when we must wait for all active aio requests.
580 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_close_sleep
) | DBG_FUNC_NONE
,
581 VM_KERNEL_ADDRPERM(p
), fd
, 0, 0, 0);
583 while (aio_proc_has_active_requests_for_file(p
, fd
)) {
584 msleep(&p
->AIO_CLEANUP_SLEEP_CHAN
, aio_proc_mutex(p
), PRIBIO
, "aio_close", 0);
590 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_close
) | DBG_FUNC_END
,
591 VM_KERNEL_ADDRPERM(p
), fd
, 0, 0, 0);
596 * aio_error - return the error status associated with the async IO
597 * request referred to by uap->aiocbp. The error status is the errno
598 * value that would be set by the corresponding IO request (read, write,
599 * fdatasync, or sync).
602 aio_error(proc_t p
, struct aio_error_args
*uap
, int *retval
)
604 aio_workq_entry
*entryp
;
607 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_error
) | DBG_FUNC_START
,
608 VM_KERNEL_ADDRPERM(p
), uap
->aiocbp
, 0, 0, 0);
610 /* see if there are any aios to check */
611 if (!aio_has_any_work()) {
617 /* look for a match on our queue of async IO requests that have completed */
618 TAILQ_FOREACH(entryp
, &p
->p_aio_doneq
, aio_proc_link
) {
619 if (entryp
->uaiocbp
== uap
->aiocbp
) {
620 ASSERT_AIO_FROM_PROC(entryp
, p
);
622 *retval
= entryp
->errorval
;
625 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_error_val
) | DBG_FUNC_NONE
,
626 VM_KERNEL_ADDRPERM(p
), uap
->aiocbp
, *retval
, 0, 0);
631 /* look for a match on our queue of active async IO requests */
632 TAILQ_FOREACH(entryp
, &p
->p_aio_activeq
, aio_proc_link
) {
633 if (entryp
->uaiocbp
== uap
->aiocbp
) {
634 ASSERT_AIO_FROM_PROC(entryp
, p
);
635 *retval
= EINPROGRESS
;
637 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_error_activeq
) | DBG_FUNC_NONE
,
638 VM_KERNEL_ADDRPERM(p
), uap
->aiocbp
, *retval
, 0, 0);
646 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_error
) | DBG_FUNC_END
,
647 VM_KERNEL_ADDRPERM(p
), uap
->aiocbp
, error
, 0, 0);
655 * aio_fsync - asynchronously force all IO operations associated
656 * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
657 * queued at the time of the call to the synchronized completion state.
658 * NOTE - we do not support op O_DSYNC at this point since we do not support the fdatasync() call.
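/*
 * Illustrative userspace sketch (an assumed example, not part of this file;
 * the helper name is hypothetical, and <aio.h>, <errno.h>, <fcntl.h>,
 * <string.h> and <unistd.h> are assumed): request an asynchronous flush and
 * wait for it. Only op == O_SYNC (or 0, for backward compatibility) is
 * accepted here; the worker threads hold the fsync back until IO queued
 * ahead of it has completed (see aio_delay_fsync_request() below).
 *
 *	static int
 *	flush_async(int fd, struct aiocb *sync_cb)
 *	{
 *		memset(sync_cb, 0, sizeof(*sync_cb));
 *		sync_cb->aio_fildes = fd;
 *
 *		if (aio_fsync(O_SYNC, sync_cb) != 0) {
 *			return -1;
 *		}
 *		while (aio_error(sync_cb) == EINPROGRESS) {
 *			usleep(1000);
 *		}
 *		return (int)aio_return(sync_cb);
 *	}
 */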
662 aio_fsync(proc_t p
, struct aio_fsync_args
*uap
, int *retval
)
664 aio_entry_flags_t fsync_kind
;
667 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_fsync
) | DBG_FUNC_START
,
668 VM_KERNEL_ADDRPERM(p
), uap
->aiocbp
, uap
->op
, 0, 0);
671 /* 0 := O_SYNC for binary backward compatibility with Panther */
672 if (uap
->op
== O_SYNC
|| uap
->op
== 0) {
673 fsync_kind
= AIO_FSYNC
;
674 } else if (uap
->op
== O_DSYNC
) {
675 fsync_kind
= AIO_DSYNC
;
682 error
= aio_queue_async_request(p
, uap
->aiocbp
, fsync_kind
);
688 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_fsync
) | DBG_FUNC_END
,
689 VM_KERNEL_ADDRPERM(p
), uap
->aiocbp
, error
, 0, 0);
695 /* aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
696 * file descriptor (uap->aiocbp->aio_fildes) into the buffer
697 * (uap->aiocbp->aio_buf).
700 aio_read(proc_t p
, struct aio_read_args
*uap
, int *retval
)
704 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_read
) | DBG_FUNC_START
,
705 VM_KERNEL_ADDRPERM(p
), uap
->aiocbp
, 0, 0, 0);
709 error
= aio_queue_async_request(p
, uap
->aiocbp
, AIO_READ
);
714 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_read
) | DBG_FUNC_END
,
715 VM_KERNEL_ADDRPERM(p
), uap
->aiocbp
, error
, 0, 0);
722 * aio_return - return the return status associated with the async IO
723 * request referred to by uap->aiocbp. The return status is the value
724 * that would be returned by the corresponding IO request (read, write,
725 * fdatasync, or sync). This is where we release kernel resources
726 * held for the async IO call associated with the given aiocb pointer.
729 aio_return(proc_t p
, struct aio_return_args
*uap
, user_ssize_t
*retval
)
731 aio_workq_entry
*entryp
;
734 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_return
) | DBG_FUNC_START
,
735 VM_KERNEL_ADDRPERM(p
), uap
->aiocbp
, 0, 0, 0);
737 /* See if there are any entries to check */
738 if (!aio_has_any_work()) {
745 /* look for a match on our queue of async IO requests that have completed */
746 TAILQ_FOREACH(entryp
, &p
->p_aio_doneq
, aio_proc_link
) {
747 ASSERT_AIO_FROM_PROC(entryp
, p
);
748 if (entryp
->uaiocbp
== uap
->aiocbp
) {
749 /* Done and valid for aio_return(), pull it off the list */
750 aio_proc_remove_done_locked(p
, entryp
);
752 *retval
= entryp
->returnval
;
756 aio_entry_unref(entryp
);
758 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_return_val
) | DBG_FUNC_NONE
,
759 VM_KERNEL_ADDRPERM(p
), uap
->aiocbp
, *retval
, 0, 0);
764 /* look for a match on our queue of active async IO requests */
765 TAILQ_FOREACH(entryp
, &p
->p_aio_activeq
, aio_proc_link
) {
766 ASSERT_AIO_FROM_PROC(entryp
, p
);
767 if (entryp
->uaiocbp
== uap
->aiocbp
) {
769 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_return_activeq
) | DBG_FUNC_NONE
,
770 VM_KERNEL_ADDRPERM(p
), uap
->aiocbp
, *retval
, 0, 0);
778 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_return
) | DBG_FUNC_END
,
779 VM_KERNEL_ADDRPERM(p
), uap
->aiocbp
, error
, 0, 0);
786 * _aio_exec - internal function used to clean up async IO requests for
787 * a process that is going away due to exec(). We cancel any async IOs
788 * we can and wait for those already active. We also disable signaling
789 * for cancelled or active aio requests that complete.
790 * This routine MAY block!
792 __private_extern__
void
795 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_exec
) | DBG_FUNC_START
,
796 VM_KERNEL_ADDRPERM(p
), 0, 0, 0, 0);
800 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_exec
) | DBG_FUNC_END
,
801 VM_KERNEL_ADDRPERM(p
), 0, 0, 0, 0);
806 * _aio_exit - internal function used to clean up async IO requests for
807 * a process that is terminating (via exit() or exec()). We cancel any async IOs
808 * we can and wait for those already active. We also disable signaling
809 * for cancelled or active aio requests that complete. This routine MAY block!
811 __private_extern__
void
814 TAILQ_HEAD(, aio_workq_entry
) tofree
= TAILQ_HEAD_INITIALIZER(tofree
);
815 aio_workq_entry
*entryp
, *tmp
;
818 /* quick check to see if there are any async IO requests queued up */
819 if (!aio_has_any_work()) {
823 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_exit
) | DBG_FUNC_START
,
824 VM_KERNEL_ADDRPERM(p
), 0, 0, 0, 0);
829 * cancel async IO requests on the todo work queue and wait for those
830 * already active to complete.
832 error
= do_aio_cancel_locked(p
, -1, USER_ADDR_NULL
, AIO_EXIT_WAIT
);
833 ASSERT_AIO_PROC_LOCK_OWNED(p
);
834 if (error
== AIO_NOTCANCELED
) {
836 * AIO_NOTCANCELED is returned when we find an aio request for this process
837 * on the active async IO queue. Active requests cannot be cancelled so we
838 * must wait for them to complete. We will get a special wake up call on
839 * our channel used to sleep for ALL active requests to complete. This sleep
840 * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
841 * active aio requests.
844 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_exit_sleep
) | DBG_FUNC_NONE
,
845 VM_KERNEL_ADDRPERM(p
), 0, 0, 0, 0);
847 while (aio_has_active_requests_for_process(p
)) {
848 msleep(&p
->AIO_CLEANUP_SLEEP_CHAN
, aio_proc_mutex(p
), PRIBIO
, "aio_exit", 0);
852 assert(!aio_has_active_requests_for_process(p
));
854 /* release all aio resources used by this process */
855 TAILQ_FOREACH_SAFE(entryp
, &p
->p_aio_doneq
, aio_proc_link
, tmp
) {
856 ASSERT_AIO_FROM_PROC(entryp
, p
);
858 aio_proc_remove_done_locked(p
, entryp
);
859 TAILQ_INSERT_TAIL(&tofree
, entryp
, aio_proc_link
);
864 /* free all the entries outside of the aio_proc_lock() */
865 TAILQ_FOREACH_SAFE(entryp
, &tofree
, aio_proc_link
, tmp
) {
866 entryp
->aio_proc_link
.tqe_prev
= NULL
;
867 aio_entry_unref(entryp
);
870 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_exit
) | DBG_FUNC_END
,
871 VM_KERNEL_ADDRPERM(p
), 0, 0, 0, 0);
876 should_cancel(aio_workq_entry
*entryp
, int fd
, user_addr_t aiocbp
,
877 aio_entry_flags_t reason
)
879 if (reason
& AIO_EXIT_WAIT
) {
880 /* caller is _aio_exit() */
883 if (fd
!= entryp
->aiocb
.aio_fildes
) {
884 /* not the file we're looking for */
888 * aio_cancel() or _aio_close() cancel
889 * everything for a given fd when aiocbp is NULL
891 return aiocbp
== USER_ADDR_NULL
|| entryp
->uaiocbp
== aiocbp
;
895 * do_aio_cancel_locked - cancel async IO requests (if possible). We get called by
896 * aio_cancel, close, and at exit.
897 * There are three modes of operation: 1) cancel all async IOs for a process -
898 * fd is 0 and aiocbp is NULL 2) cancel all async IOs for file descriptor - fd
899 * is > 0 and aiocbp is NULL 3) cancel one async IO associated with the given aiocbp.
901 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
902 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
903 * target async IO requests, and AIO_ALLDONE if all target async IO requests
904 * were already complete.
905 * WARNING - do not dereference aiocbp in this routine, it may point to user
906 * land data that has not been copied in (when called from aio_cancel())
908 * Called with proc locked, and returns the same way.
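/*
 * For reference, the three in-file callers map onto those modes as follows
 * (argument patterns copied from the call sites in this file):
 *
 *	aio_cancel():  do_aio_cancel_locked(p, uap->fd, uap->aiocbp, 0);
 *	_aio_close():  do_aio_cancel_locked(p, fd, USER_ADDR_NULL, AIO_CLOSE_WAIT);
 *	_aio_exit():   do_aio_cancel_locked(p, -1, USER_ADDR_NULL, AIO_EXIT_WAIT);
 */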
911 do_aio_cancel_locked(proc_t p
, int fd
, user_addr_t aiocbp
,
912 aio_entry_flags_t reason
)
914 bool multiple_matches
= (aiocbp
== USER_ADDR_NULL
);
915 aio_workq_entry
*entryp
, *tmp
;
918 ASSERT_AIO_PROC_LOCK_OWNED(p
);
920 /* look for a match on our queue of async todo work. */
923 TAILQ_FOREACH_SAFE(entryp
, &p
->p_aio_activeq
, aio_proc_link
, tmp
) {
924 ASSERT_AIO_FROM_PROC(entryp
, p
);
926 if (!should_cancel(entryp
, fd
, aiocbp
, reason
)) {
931 /* mark the entry as blocking close or exit/exec */
932 entryp
->flags
|= reason
;
933 if ((entryp
->flags
& AIO_EXIT_WAIT
) && (entryp
->flags
& AIO_CLOSE_WAIT
)) {
934 panic("Close and exit flags set at the same time\n");
938 /* Can only be cancelled if it's still on a work queue */
939 if (aio_entry_try_workq_remove(entryp
)) {
940 entryp
->errorval
= ECANCELED
;
941 entryp
->returnval
= -1;
943 /* Now it's officially cancelled. Do the completion */
944 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_cancel_async_workq
) | DBG_FUNC_NONE
,
945 VM_KERNEL_ADDRPERM(p
), VM_KERNEL_ADDRPERM(entryp
->uaiocbp
),
947 do_aio_completion_and_unlock(p
, entryp
);
951 if (multiple_matches
) {
953 * Restart from the head of the proc active queue since it
954 * may have been changed while we were away doing completion
957 * Note that if we found an uncancellable AIO before, we will
958 * either find it again or discover that it's been completed,
959 * so resetting the result will not cause us to return success
960 * despite outstanding AIOs.
969 * It's been taken off the active queue already, i.e. is in flight.
970 * All we can do is ask for notification.
972 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_cancel_activeq
) | DBG_FUNC_NONE
,
973 VM_KERNEL_ADDRPERM(p
), VM_KERNEL_ADDRPERM(entryp
->uaiocbp
),
976 result
= AIO_NOTCANCELED
;
977 if (!multiple_matches
) {
983 * if we didn't find any matches on the todo or active queues then look for a
984 * match on our queue of async IO requests that have completed and if found
985 * return AIO_ALLDONE result.
987 * Proc AIO lock is still held.
990 TAILQ_FOREACH(entryp
, &p
->p_aio_doneq
, aio_proc_link
) {
991 ASSERT_AIO_FROM_PROC(entryp
, p
);
992 if (should_cancel(entryp
, fd
, aiocbp
, reason
)) {
993 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_cancel_doneq
) | DBG_FUNC_NONE
,
994 VM_KERNEL_ADDRPERM(p
), VM_KERNEL_ADDRPERM(entryp
->uaiocbp
),
997 result
= AIO_ALLDONE
;
998 if (!multiple_matches
) {
1010 * aio_suspend - suspend the calling thread until at least one of the async
1011 * IO operations referenced by uap->aiocblist has completed, until a signal
1012 * interrupts the function, or the uap->timeoutp time interval (optional) has elapsed.
1014 * Returns 0 if one or more async IOs have completed, else -1 and errno is
1015 * set appropriately - EAGAIN if the timeout elapses or EINTR if a signal interrupts the wait.
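/*
 * Illustrative userspace sketch (an assumed example, not part of this file;
 * the helper name is hypothetical, and <aio.h>, <errno.h> and <time.h> are
 * assumed): wait up to one second for any of a set of outstanding requests.
 * NULL slots in the list are permitted, matching the check below. Returns 0
 * when at least one listed request has completed, 1 on timeout or signal,
 * and -1 on any other error.
 *
 *	static int
 *	wait_for_any(const struct aiocb *const list[], int nent)
 *	{
 *		const struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 *
 *		if (aio_suspend(list, nent, &ts) == 0) {
 *			return 0;
 *		}
 *		return (errno == EAGAIN || errno == EINTR) ? 1 : -1;
 *	}
 */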
1019 aio_suspend(proc_t p
, struct aio_suspend_args
*uap
, int *retval
)
1021 __pthread_testcancel(1);
1022 return aio_suspend_nocancel(p
, (struct aio_suspend_nocancel_args
*)uap
, retval
);
1027 aio_suspend_nocancel(proc_t p
, struct aio_suspend_nocancel_args
*uap
, int *retval
)
1032 struct user_timespec ts
;
1033 aio_workq_entry
*entryp
;
1034 user_addr_t
*aiocbpp
;
1035 size_t aiocbpp_size
;
1037 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_suspend
) | DBG_FUNC_START
,
1038 VM_KERNEL_ADDRPERM(p
), uap
->nent
, 0, 0, 0);
1044 if (!aio_has_any_work()) {
1046 goto ExitThisRoutine
;
1049 if (uap
->nent
< 1 || uap
->nent
> aio_max_requests_per_process
||
1050 os_mul_overflow(sizeof(user_addr_t
), uap
->nent
, &aiocbpp_size
)) {
1052 goto ExitThisRoutine
;
1055 if (uap
->timeoutp
!= USER_ADDR_NULL
) {
1056 if (proc_is64bit(p
)) {
1057 struct user64_timespec temp
;
1058 error
= copyin(uap
->timeoutp
, &temp
, sizeof(temp
));
1060 ts
.tv_sec
= (user_time_t
)temp
.tv_sec
;
1061 ts
.tv_nsec
= (user_long_t
)temp
.tv_nsec
;
1064 struct user32_timespec temp
;
1065 error
= copyin(uap
->timeoutp
, &temp
, sizeof(temp
));
1067 ts
.tv_sec
= temp
.tv_sec
;
1068 ts
.tv_nsec
= temp
.tv_nsec
;
1073 goto ExitThisRoutine
;
1076 if (ts
.tv_sec
< 0 || ts
.tv_nsec
< 0 || ts
.tv_nsec
>= 1000000000) {
1078 goto ExitThisRoutine
;
1081 nanoseconds_to_absolutetime((uint64_t)ts
.tv_sec
* NSEC_PER_SEC
+ ts
.tv_nsec
,
1083 clock_absolutetime_interval_to_deadline(abstime
, &abstime
);
1086 aiocbpp
= kheap_alloc(KHEAP_TEMP
, aiocbpp_size
, Z_WAITOK
);
1087 if (aiocbpp
== NULL
|| aio_copy_in_list(p
, uap
->aiocblist
, aiocbpp
, uap
->nent
)) {
1089 goto ExitThisRoutine
;
1092 /* check list of aio requests to see if any have completed */
1093 check_for_our_aiocbp
:
1094 aio_proc_lock_spin(p
);
1095 for (i
= 0; i
< uap
->nent
; i
++) {
1098 /* NULL elements are legal so check for 'em */
1099 aiocbp
= *(aiocbpp
+ i
);
1100 if (aiocbp
== USER_ADDR_NULL
) {
1104 /* return immediately if any aio request in the list is done */
1105 TAILQ_FOREACH(entryp
, &p
->p_aio_doneq
, aio_proc_link
) {
1106 ASSERT_AIO_FROM_PROC(entryp
, p
);
1107 if (entryp
->uaiocbp
== aiocbp
) {
1111 goto ExitThisRoutine
;
1116 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_suspend_sleep
) | DBG_FUNC_NONE
,
1117 VM_KERNEL_ADDRPERM(p
), uap
->nent
, 0, 0, 0);
1120 * wait for an async IO to complete or a signal fires or timeout expires.
1121 * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
1122 * interrupts us. If an async IO completes before a signal fires or our
1123 * timeout expires, we get a wakeup call from aio_work_thread().
1126 error
= msleep1(&p
->AIO_SUSPEND_SLEEP_CHAN
, aio_proc_mutex(p
),
1127 PCATCH
| PWAIT
| PDROP
, "aio_suspend", abstime
);
1130 * got our wakeup call from aio_work_thread().
1131 * Since we can get a wakeup on this channel from another thread in the
1132 * same process we head back up to make sure this is for the correct aiocbp.
1133 * If it is the correct aiocbp we will return from where we do the check
1134 * (see entryp->uaiocbp == aiocbp after check_for_our_aiocbp label)
1135 * else we will fall out and just sleep again.
1137 goto check_for_our_aiocbp
;
1138 } else if (error
== EWOULDBLOCK
) {
1139 /* our timeout expired */
1142 /* we were interrupted */
1147 if (aiocbpp
!= NULL
) {
1148 kheap_free(KHEAP_TEMP
, aiocbpp
, aiocbpp_size
);
1151 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_suspend
) | DBG_FUNC_END
,
1152 VM_KERNEL_ADDRPERM(p
), uap
->nent
, error
, 0, 0);
1158 /* aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
1159 * file descriptor (uap->aiocbp->aio_fildes) from the buffer
1160 * (uap->aiocbp->aio_buf).
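/*
 * Illustrative userspace sketch (an assumed example, not part of this file;
 * the helper name is hypothetical, and <aio.h> and <string.h> are assumed):
 * queue one asynchronous write. As in do_aio_write() below, aio_offset is
 * honored only when the descriptor was not opened with O_APPEND; for
 * O_APPEND descriptors the data is appended and the offset is ignored.
 *
 *	static int
 *	write_async(int fd, void *buf, size_t len, off_t off, struct aiocb *cb)
 *	{
 *		memset(cb, 0, sizeof(*cb));
 *		cb->aio_fildes = fd;
 *		cb->aio_buf = buf;
 *		cb->aio_nbytes = len;
 *		cb->aio_offset = off;
 *		return aio_write(cb);
 *	}
 */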
1164 aio_write(proc_t p
, struct aio_write_args
*uap
, int *retval __unused
)
1168 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_write
) | DBG_FUNC_START
,
1169 VM_KERNEL_ADDRPERM(p
), uap
->aiocbp
, 0, 0, 0);
1171 error
= aio_queue_async_request(p
, uap
->aiocbp
, AIO_WRITE
);
1173 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_write
) | DBG_FUNC_END
,
1174 VM_KERNEL_ADDRPERM(p
), uap
->aiocbp
, error
, 0, 0);
1181 aio_copy_in_list(proc_t procp
, user_addr_t aiocblist
, user_addr_t
*aiocbpp
,
1186 /* copyin our aiocb pointers from list */
1187 result
= copyin(aiocblist
, aiocbpp
,
1188 proc_is64bit(procp
) ? (nent
* sizeof(user64_addr_t
))
1189 : (nent
* sizeof(user32_addr_t
)));
1195 * We depend on a list of user_addr_t's so we need to
1196 * munge and expand when these pointers came from a 32-bit process.
1199 if (!proc_is64bit(procp
)) {
1200 /* copy from last to first to deal with overlap */
1201 user32_addr_t
*my_ptrp
= ((user32_addr_t
*)aiocbpp
) + (nent
- 1);
1202 user_addr_t
*my_addrp
= aiocbpp
+ (nent
- 1);
1204 for (int i
= 0; i
< nent
; i
++, my_ptrp
--, my_addrp
--) {
1205 *my_addrp
= (user_addr_t
) (*my_ptrp
);
1214 aio_copy_in_sigev(proc_t procp
, user_addr_t sigp
, struct user_sigevent
*sigev
)
1218 if (sigp
== USER_ADDR_NULL
) {
1223 * We need to munge aio_sigevent since it contains pointers.
1224 * Since we do not know if sigev_value is an int or a ptr we do
1225 * NOT cast the ptr to a user_addr_t. This means if we send
1226 * this info back to user space we need to remember sigev_value
1227 * was not expanded for the 32-bit case.
1229 * Notes: This does NOT affect us since we don't support
1230 * sigev_value yet in the aio context.
1232 if (proc_is64bit(procp
)) {
1234 struct user64_sigevent sigevent64
;
1236 result
= copyin(sigp
, &sigevent64
, sizeof(sigevent64
));
1238 sigev
->sigev_notify
= sigevent64
.sigev_notify
;
1239 sigev
->sigev_signo
= sigevent64
.sigev_signo
;
1240 sigev
->sigev_value
.size_equivalent
.sival_int
= sigevent64
.sigev_value
.size_equivalent
.sival_int
;
1241 sigev
->sigev_notify_function
= sigevent64
.sigev_notify_function
;
1242 sigev
->sigev_notify_attributes
= sigevent64
.sigev_notify_attributes
;
1245 panic("64bit process on 32bit kernel is not supported");
1248 struct user32_sigevent sigevent32
;
1250 result
= copyin(sigp
, &sigevent32
, sizeof(sigevent32
));
1252 sigev
->sigev_notify
= sigevent32
.sigev_notify
;
1253 sigev
->sigev_signo
= sigevent32
.sigev_signo
;
1254 sigev
->sigev_value
.size_equivalent
.sival_int
= sigevent32
.sigev_value
.sival_int
;
1255 sigev
->sigev_notify_function
= CAST_USER_ADDR_T(sigevent32
.sigev_notify_function
);
1256 sigev
->sigev_notify_attributes
= CAST_USER_ADDR_T(sigevent32
.sigev_notify_attributes
);
1269 * validate user_sigevent. at this point we only support
1270 * sigev_notify equal to SIGEV_SIGNAL or SIGEV_NONE. this means
1271 * sigev_value, sigev_notify_function, and sigev_notify_attributes
1272 * are ignored, since SIGEV_THREAD is unsupported. This is consistent
1273 * with no [RTS] (Realtime Signals) option group support.
1276 aio_sigev_validate(const struct user_sigevent
*sigev
)
1278 switch (sigev
->sigev_notify
) {
1283 /* make sure we have a valid signal number */
1284 signum
= sigev
->sigev_signo
;
1285 if (signum
<= 0 || signum
>= NSIG
||
1286 signum
== SIGKILL
|| signum
== SIGSTOP
) {
1296 /* Unsupported [RTS] */
1307 * aio_try_enqueue_work_locked
1309 * Queue up the entry on the aio asynchronous work queue in priority order
1310 * based on the relative priority of the request. We calculate the relative
1311 * priority using the nice value of the caller and the value
1313 * Parameters: procp Process queueing the I/O
1314 * entryp The work queue entry being queued
1315 * leader The work leader if any
1317 * Returns: Whether the enqueue was successful
1319 * Notes: This function is used for both lio_listio and aio
1321 * XXX: At some point, we may have to consider thread priority
1322 * rather than process priority, but we don't maintain the
1323 * adjusted priority for threads the POSIX way.
1325 * Called with proc locked.
1328 aio_try_enqueue_work_locked(proc_t procp
, aio_workq_entry
*entryp
,
1329 aio_workq_entry
*leader
)
1331 aio_workq_t queue
= aio_entry_workq(entryp
);
1333 ASSERT_AIO_PROC_LOCK_OWNED(procp
);
1335 /* Onto proc queue */
1336 if (!aio_try_proc_insert_active_locked(procp
, entryp
)) {
1341 aio_entry_ref(leader
); /* consumed in do_aio_completion_and_unlock */
1342 leader
->lio_pending
++;
1343 entryp
->lio_leader
= leader
;
1346 /* And work queue */
1347 aio_entry_ref(entryp
); /* consumed in do_aio_completion_and_unlock */
1348 aio_workq_lock_spin(queue
);
1349 aio_workq_add_entry_locked(queue
, entryp
);
1350 waitq_wakeup64_one(&queue
->aioq_waitq
, CAST_EVENT64_T(queue
),
1351 THREAD_AWAKENED
, WAITQ_ALL_PRIORITIES
);
1352 aio_workq_unlock(queue
);
1354 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_AIO
, AIO_work_queued
) | DBG_FUNC_START
,
1355 VM_KERNEL_ADDRPERM(procp
), VM_KERNEL_ADDRPERM(entryp
->uaiocbp
),
1356 entryp
->flags
, entryp
->aiocb
.aio_fildes
, 0);
1357 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_AIO
, AIO_work_queued
) | DBG_FUNC_END
,
1358 entryp
->aiocb
.aio_offset
, 0, entryp
->aiocb
.aio_nbytes
, 0, 0);
1364 * lio_listio - initiate a list of IO requests. We process the list of
1365 * aiocbs either synchronously (mode == LIO_WAIT) or asynchronously
1366 * (mode == LIO_NOWAIT).
1368 * The caller gets error and return status for each aiocb in the list
1369 * via aio_error and aio_return. We must keep completed requests until
1370 * released by the aio_return call.
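/*
 * Illustrative userspace sketch (an assumed example, not part of this file;
 * the helper name is hypothetical, and <aio.h> and <unistd.h> are assumed):
 * submit two reads as one batch and block until both finish. Even with
 * LIO_WAIT, each aiocb must still be reaped with aio_return() afterwards,
 * per the comment above. Real code should also check aio_error() on each
 * aiocb when lio_listio() fails, since some requests may have been queued.
 *
 *	static int
 *	read_pair_blocking(int fd, char *a, char *b, size_t len)
 *	{
 *		struct aiocb cb0 = {
 *			.aio_fildes = fd, .aio_buf = a,
 *			.aio_nbytes = len, .aio_offset = 0,
 *			.aio_lio_opcode = LIO_READ,
 *		};
 *		struct aiocb cb1 = {
 *			.aio_fildes = fd, .aio_buf = b,
 *			.aio_nbytes = len, .aio_offset = (off_t)len,
 *			.aio_lio_opcode = LIO_READ,
 *		};
 *		struct aiocb *list[] = { &cb0, &cb1 };
 *
 *		if (lio_listio(LIO_WAIT, list, 2, NULL) != 0) {
 *			return -1;
 *		}
 *		(void)aio_return(&cb0);
 *		(void)aio_return(&cb1);
 *		return 0;
 *	}
 */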
1373 lio_listio(proc_t p
, struct lio_listio_args
*uap
, int *retval __unused
)
1375 aio_workq_entry
*entries
[AIO_LISTIO_MAX
] = { };
1376 user_addr_t aiocbpp
[AIO_LISTIO_MAX
];
1377 struct user_sigevent aiosigev
= { };
1381 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_listio
) | DBG_FUNC_START
,
1382 VM_KERNEL_ADDRPERM(p
), uap
->nent
, uap
->mode
, 0, 0);
1384 if (!(uap
->mode
== LIO_NOWAIT
|| uap
->mode
== LIO_WAIT
)) {
1389 if (uap
->nent
< 1 || uap
->nent
> AIO_LISTIO_MAX
) {
1395 * Use sigevent passed in to lio_listio for each of our calls, but
1396 * only do completion notification after the last request completes.
1398 if (uap
->sigp
!= USER_ADDR_NULL
) {
1399 result
= aio_copy_in_sigev(p
, uap
->sigp
, &aiosigev
);
1403 result
= aio_sigev_validate(&aiosigev
);
1409 if (aio_copy_in_list(p
, uap
->aiocblist
, aiocbpp
, uap
->nent
)) {
1415 * allocate/parse all entries
1417 for (int i
= 0; i
< uap
->nent
; i
++) {
1418 aio_workq_entry
*entryp
;
1420 /* NULL elements are legal so check for 'em */
1421 if (aiocbpp
[i
] == USER_ADDR_NULL
) {
1425 entryp
= aio_create_queue_entry(p
, aiocbpp
[i
], AIO_LIO
);
1426 if (entryp
== NULL
) {
1432 * This refcount is cleaned up on exit if the entry
1435 entries
[lio_count
++] = entryp
;
1436 if (uap
->mode
== LIO_NOWAIT
) {
1437 /* Set signal handler, if any */
1438 entryp
->aiocb
.aio_sigevent
= aiosigev
;
1442 if (lio_count
== 0) {
1443 /* There's nothing to submit */
1448 * Past this point we're committed and will not bail out
1450 * - keep a reference on the leader for LIO_WAIT
1451 * - perform the submissions and optionally wait
1454 aio_workq_entry
*leader
= entries
[0];
1455 if (uap
->mode
== LIO_WAIT
) {
1456 aio_entry_ref(leader
); /* consumed below */
1459 aio_proc_lock_spin(p
);
1461 for (int i
= 0; i
< lio_count
; i
++) {
1462 if (aio_try_enqueue_work_locked(p
, entries
[i
], leader
)) {
1463 entries
[i
] = NULL
; /* the entry was submitted */
1469 if (uap
->mode
== LIO_WAIT
&& result
== 0) {
1470 leader
->flags
|= AIO_LIO_WAIT
;
1472 while (leader
->lio_pending
) {
1473 /* If we were interrupted, fail out (even if all finished) */
1474 if (msleep(leader
, aio_proc_mutex(p
),
1475 PCATCH
| PRIBIO
| PSPIN
, "lio_listio", 0) != 0) {
1481 leader
->flags
&= ~AIO_LIO_WAIT
;
1486 if (uap
->mode
== LIO_WAIT
) {
1487 aio_entry_unref(leader
);
1491 /* Consume unsubmitted entries */
1492 for (int i
= 0; i
< lio_count
; i
++) {
1494 aio_entry_unref(entries
[i
]);
1498 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_listio
) | DBG_FUNC_END
,
1499 VM_KERNEL_ADDRPERM(p
), result
, 0, 0, 0);
1506 * aio worker thread. this is where all the real work gets done.
1507 * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
1508 * after new work is queued up.
1510 __attribute__((noreturn
))
1512 aio_work_thread(void *arg __unused
, wait_result_t wr __unused
)
1514 aio_workq_entry
*entryp
;
1516 vm_map_t currentmap
;
1517 vm_map_t oldmap
= VM_MAP_NULL
;
1518 task_t oldaiotask
= TASK_NULL
;
1519 struct uthread
*uthreadp
= NULL
;
1524 * returns with the entry ref'ed.
1525 * sleeps until work is available.
1527 entryp
= aio_get_some_work();
1530 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_worker_thread
) | DBG_FUNC_START
,
1531 VM_KERNEL_ADDRPERM(p
), VM_KERNEL_ADDRPERM(entryp
->uaiocbp
),
1532 entryp
->flags
, 0, 0);
1535 * Assume the target's address space identity for the duration
1536 * of the IO. Note: don't need to have the entryp locked,
1537 * because the proc and map don't change until it's freed.
1539 currentmap
= get_task_map((current_proc())->task
);
1540 if (currentmap
!= entryp
->aio_map
) {
1541 uthreadp
= (struct uthread
*) get_bsdthread_info(current_thread());
1542 oldaiotask
= uthreadp
->uu_aio_task
;
1544 * workq entries at this stage cause _aio_exec() and _aio_exit() to
1545 * block until we hit `do_aio_completion_and_unlock()` below,
1546 * which means that it is safe to dereference p->task without
1547 * holding a lock or taking references.
1549 uthreadp
->uu_aio_task
= p
->task
;
1550 oldmap
= vm_map_switch(entryp
->aio_map
);
1553 if ((entryp
->flags
& AIO_READ
) != 0) {
1554 error
= do_aio_read(entryp
);
1555 } else if ((entryp
->flags
& AIO_WRITE
) != 0) {
1556 error
= do_aio_write(entryp
);
1557 } else if ((entryp
->flags
& (AIO_FSYNC
| AIO_DSYNC
)) != 0) {
1558 error
= do_aio_fsync(entryp
);
1563 /* Restore old map */
1564 if (currentmap
!= entryp
->aio_map
) {
1565 vm_map_switch(oldmap
);
1566 uthreadp
->uu_aio_task
= oldaiotask
;
1569 /* liberate unused map */
1570 vm_map_deallocate(entryp
->aio_map
);
1571 entryp
->aio_map
= VM_MAP_NULL
;
1573 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_worker_thread
) | DBG_FUNC_END
,
1574 VM_KERNEL_ADDRPERM(p
), VM_KERNEL_ADDRPERM(entryp
->uaiocbp
),
1575 entryp
->errorval
, entryp
->returnval
, 0);
1577 /* we're done with the IO request so pop it off the active queue and */
1578 /* push it on the done queue */
1580 entryp
->errorval
= error
;
1581 do_aio_completion_and_unlock(p
, entryp
);
1587 * aio_get_some_work - get the next async IO request that is ready to be executed.
1588 * aio_fsync complicates matters a bit since we cannot do the fsync until all async
1589 * IO requests at the time the aio_fsync call came in have completed.
1590 * NOTE - AIO_LOCK must be held by caller
1592 static aio_workq_entry
*
1593 aio_get_some_work(void)
1595 aio_workq_entry
*entryp
= NULL
;
1596 aio_workq_t queue
= NULL
;
1598 /* Just one queue for the moment. In the future there will be many. */
1599 queue
= &aio_anchor
.aio_async_workqs
[0];
1600 aio_workq_lock_spin(queue
);
1603 * Hold the queue lock.
1605 * pop some work off the work queue and add to our active queue
1606 * Always start with the queue lock held.
1608 while ((entryp
= TAILQ_FIRST(&queue
->aioq_entries
))) {
1610 * Pull off of the work queue. Once it's off, it can't be cancelled,
1611 * so we can take our ref once we drop the queue lock.
1614 aio_workq_remove_entry_locked(queue
, entryp
);
1616 aio_workq_unlock(queue
);
1619 * Check if it's an fsync that must be delayed. No need to lock the entry;
1620 * that flag would have been set at initialization.
1622 if ((entryp
->flags
& AIO_FSYNC
) != 0) {
1624 * Check for unfinished operations on the same file
1625 * in this proc's queue.
1627 aio_proc_lock_spin(entryp
->procp
);
1628 if (aio_delay_fsync_request(entryp
)) {
1629 /* It needs to be delayed. Put it back on the end of the work queue */
1630 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_fsync_delay
) | DBG_FUNC_NONE
,
1631 VM_KERNEL_ADDRPERM(p
), VM_KERNEL_ADDRPERM(entryp
->uaiocbp
),
1634 aio_proc_unlock(entryp
->procp
);
1636 aio_workq_lock_spin(queue
);
1637 aio_workq_add_entry_locked(queue
, entryp
);
1640 aio_proc_unlock(entryp
->procp
);
1646 /* We will wake up when someone enqueues something */
1647 waitq_assert_wait64(&queue
->aioq_waitq
, CAST_EVENT64_T(queue
), THREAD_UNINT
, 0);
1648 aio_workq_unlock(queue
);
1649 thread_block(aio_work_thread
);
1651 __builtin_unreachable();
1655 * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed.
1656 * A big, simple hammer: only send it off if it's the most recently filed IO which has
1657 * not been completed.
1660 aio_delay_fsync_request(aio_workq_entry
*entryp
)
1662 if (proc_in_teardown(entryp
->procp
)) {
1664 * we can't delay FSYNCs when in teardown as it will confuse _aio_exit;
1665 * if it was dequeued, then we must now commit to it
1670 if (entryp
== TAILQ_FIRST(&entryp
->procp
->p_aio_activeq
)) {
1677 static aio_workq_entry
*
1678 aio_create_queue_entry(proc_t procp
, user_addr_t aiocbp
, aio_entry_flags_t flags
)
1680 aio_workq_entry
*entryp
;
1682 entryp
= zalloc_flags(aio_workq_zonep
, Z_WAITOK
| Z_ZERO
);
1683 entryp
->procp
= procp
;
1684 entryp
->uaiocbp
= aiocbp
;
1685 entryp
->flags
= flags
;
1686 /* consumed in aio_return or _aio_exit */
1687 os_ref_init(&entryp
->aio_refcount
, &aio_refgrp
);
1689 if (proc_is64bit(procp
)) {
1690 struct user64_aiocb aiocb64
;
1692 if (copyin(aiocbp
, &aiocb64
, sizeof(aiocb64
)) != 0) {
1695 do_munge_aiocb_user64_to_user(&aiocb64
, &entryp
->aiocb
);
1697 struct user32_aiocb aiocb32
;
1699 if (copyin(aiocbp
, &aiocb32
, sizeof(aiocb32
)) != 0) {
1702 do_munge_aiocb_user32_to_user(&aiocb32
, &entryp
->aiocb
);
1705 /* do some more validation on the aiocb and embedded file descriptor */
1706 if (aio_validate(procp
, entryp
) != 0) {
1710 /* get a reference to the user land map in order to keep it around */
1711 entryp
->aio_map
= get_task_map(procp
->task
);
1712 vm_map_reference(entryp
->aio_map
);
1714 /* get a reference on the current_thread, which is passed in vfs_context. */
1715 entryp
->thread
= current_thread();
1716 thread_reference(entryp
->thread
);
1720 zfree(aio_workq_zonep
, entryp
);
1726 * aio_queue_async_request - queue up an async IO request on our work queue then
1727 * wake up one of our worker threads to do the actual work. We get a reference
1728 * to our caller's user land map in order to keep it around while we are
1729 * processing the request.
1732 aio_queue_async_request(proc_t procp
, user_addr_t aiocbp
,
1733 aio_entry_flags_t flags
)
1735 aio_workq_entry
*entryp
;
1738 entryp
= aio_create_queue_entry(procp
, aiocbp
, flags
);
1739 if (entryp
== NULL
) {
1744 aio_proc_lock_spin(procp
);
1745 if (!aio_try_enqueue_work_locked(procp
, entryp
, NULL
)) {
1749 aio_proc_unlock(procp
);
1754 * This entry has not been queued up so no worries about
1755 * unlocked state and aio_map
1757 aio_proc_unlock(procp
);
1758 aio_free_request(entryp
);
1765 * aio_free_request - remove our reference on the user land map and
1766 * free the work queue entry resources. The entry is off all lists
1767 * and has zero refcount, so no one can have a pointer to it.
1770 aio_free_request(aio_workq_entry
*entryp
)
1772 if (entryp
->aio_proc_link
.tqe_prev
|| entryp
->aio_workq_link
.tqe_prev
) {
1773 panic("aio_workq_entry %p being freed while still enqueued", entryp
);
1776 /* remove our reference to the user land map. */
1777 if (VM_MAP_NULL
!= entryp
->aio_map
) {
1778 vm_map_deallocate(entryp
->aio_map
);
1781 /* remove our reference to thread which enqueued the request */
1782 if (NULL
!= entryp
->thread
) {
1783 thread_deallocate(entryp
->thread
);
1786 zfree(aio_workq_zonep
, entryp
);
1793 * validate the aiocb passed in by one of the aio syscalls.
1796 aio_validate(proc_t p
, aio_workq_entry
*entryp
)
1798 struct fileproc
*fp
;
1804 if ((entryp
->flags
& AIO_LIO
) != 0) {
1805 if (entryp
->aiocb
.aio_lio_opcode
== LIO_READ
) {
1806 entryp
->flags
|= AIO_READ
;
1807 } else if (entryp
->aiocb
.aio_lio_opcode
== LIO_WRITE
) {
1808 entryp
->flags
|= AIO_WRITE
;
1809 } else if (entryp
->aiocb
.aio_lio_opcode
== LIO_NOP
) {
1817 if ((entryp
->flags
& (AIO_WRITE
| AIO_FSYNC
| AIO_DSYNC
)) != 0) {
1821 if ((entryp
->flags
& (AIO_READ
| AIO_WRITE
)) != 0) {
1822 if (entryp
->aiocb
.aio_nbytes
> INT_MAX
||
1823 entryp
->aiocb
.aio_buf
== USER_ADDR_NULL
||
1824 entryp
->aiocb
.aio_offset
< 0) {
1829 result
= aio_sigev_validate(&entryp
->aiocb
.aio_sigevent
);
1834 /* validate the file descriptor and that the file was opened
1835 * for the appropriate read / write access.
1839 fp
= fp_get_noref_locked(p
, entryp
->aiocb
.aio_fildes
);
1842 } else if ((fp
->fp_glob
->fg_flag
& flag
) == 0) {
1843 /* we don't have read or write access */
1845 } else if (FILEGLOB_DTYPE(fp
->fp_glob
) != DTYPE_VNODE
) {
1846 /* this is not a file */
1849 fp
->fp_flags
|= FP_AIOISSUED
;
1858 * do_aio_completion_and_unlock. Handle async IO completion.
1861 do_aio_completion_and_unlock(proc_t p
, aio_workq_entry
*entryp
)
1863 aio_workq_entry
*leader
= entryp
->lio_leader
;
1864 int lio_pending
= 0;
1865 bool do_signal
= false;
1867 ASSERT_AIO_PROC_LOCK_OWNED(p
);
1869 aio_proc_move_done_locked(p
, entryp
);
1872 lio_pending
= --leader
->lio_pending
;
1873 if (lio_pending
< 0) {
1874 panic("lio_pending accounting mistake");
1876 if (lio_pending
== 0 && (leader
->flags
& AIO_LIO_WAIT
)) {
1879 entryp
->lio_leader
= NULL
; /* no dangling pointers please */
1883 * need to handle case where a process is trying to exit, exec, or
1884 * close and is currently waiting for active aio requests to complete.
1885 * If AIO_CLEANUP_WAIT is set then we need to look to see if there are any
1886 * other requests in the active queue for this process. If there are
1887 * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel.
1888 * If there are some still active then do nothing - we only want to
1889 * wakeup when all active aio requests for the process are complete.
1891 if (__improbable(entryp
->flags
& AIO_EXIT_WAIT
)) {
1892 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_completion_cleanup_wait
) | DBG_FUNC_NONE
,
1893 VM_KERNEL_ADDRPERM(p
), VM_KERNEL_ADDRPERM(entryp
->uaiocbp
),
1896 if (!aio_has_active_requests_for_process(p
)) {
1898 * no active aio requests for this process, continue exiting. In this
1899 * case, there should be no one else waiting on the proc in AIO...
1901 wakeup_one((caddr_t
)&p
->AIO_CLEANUP_SLEEP_CHAN
);
1903 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_completion_cleanup_wake
) | DBG_FUNC_NONE
,
1904 VM_KERNEL_ADDRPERM(p
), VM_KERNEL_ADDRPERM(entryp
->uaiocbp
),
1907 } else if (entryp
->aiocb
.aio_sigevent
.sigev_notify
== SIGEV_SIGNAL
) {
1909 * If this was the last request in the group, or not part of
1910 * a group, and that a signal is desired, send one.
1912 do_signal
= (lio_pending
== 0);
1915 if (__improbable(entryp
->flags
& AIO_CLOSE_WAIT
)) {
1916 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_completion_cleanup_wait
) | DBG_FUNC_NONE
,
1917 VM_KERNEL_ADDRPERM(p
), VM_KERNEL_ADDRPERM(entryp
->uaiocbp
),
1920 if (!aio_proc_has_active_requests_for_file(p
, entryp
->aiocb
.aio_fildes
)) {
1921 /* Can't wakeup_one(); multiple closes might be in progress. */
1922 wakeup(&p
->AIO_CLEANUP_SLEEP_CHAN
);
1924 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_completion_cleanup_wake
) | DBG_FUNC_NONE
,
1925 VM_KERNEL_ADDRPERM(p
), VM_KERNEL_ADDRPERM(entryp
->uaiocbp
),
1933 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_completion_sig
) | DBG_FUNC_NONE
,
1934 VM_KERNEL_ADDRPERM(p
), VM_KERNEL_ADDRPERM(entryp
->uaiocbp
),
1935 entryp
->aiocb
.aio_sigevent
.sigev_signo
, 0, 0);
1937 psignal(p
, entryp
->aiocb
.aio_sigevent
.sigev_signo
);
1941 * A thread in aio_suspend() wants to know about completed IOs. If it checked
1942 * the done list before we moved our AIO there, then it already asserted its wait,
1943 * and we can wake it up without holding the lock. If it checked the list after
1944 * we did our move, then it has already seen the AIO that we moved. Either way, we
1945 * can do our wakeup without holding the lock.
1947 wakeup(&p
->AIO_SUSPEND_SLEEP_CHAN
);
1948 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO
, AIO_completion_suspend_wake
) | DBG_FUNC_NONE
,
1949 VM_KERNEL_ADDRPERM(p
), VM_KERNEL_ADDRPERM(entryp
->uaiocbp
), 0, 0, 0);
1951 aio_entry_unref(entryp
); /* see aio_try_enqueue_work_locked */
1953 aio_entry_unref(leader
); /* see lio_listio */
1962 do_aio_read(aio_workq_entry
*entryp
)
1964 struct proc
*p
= entryp
->procp
;
1965 struct fileproc
*fp
;
1968 if ((error
= fp_lookup(p
, entryp
->aiocb
.aio_fildes
, &fp
, 0))) {
1972 if (fp
->fp_glob
->fg_flag
& FREAD
) {
1973 struct vfs_context context
= {
1974 .vc_thread
= entryp
->thread
, /* XXX */
1975 .vc_ucred
= fp
->fp_glob
->fg_cred
,
1978 error
= dofileread(&context
, fp
,
1979 entryp
->aiocb
.aio_buf
,
1980 entryp
->aiocb
.aio_nbytes
,
1981 entryp
->aiocb
.aio_offset
, FOF_OFFSET
,
1982 &entryp
->returnval
);
1987 fp_drop(p
, entryp
->aiocb
.aio_fildes
, fp
, 0);
1996 do_aio_write(aio_workq_entry
*entryp
)
1998 struct proc
*p
= entryp
->procp
;
1999 struct fileproc
*fp
;
2002 if ((error
= fp_lookup(p
, entryp
->aiocb
.aio_fildes
, &fp
, 0))) {
2006 if (fp
->fp_glob
->fg_flag
& FWRITE
) {
2007 struct vfs_context context
= {
2008 .vc_thread
= entryp
->thread
, /* XXX */
2009 .vc_ucred
= fp
->fp_glob
->fg_cred
,
2011 int flags
= FOF_PCRED
;
2013 if ((fp
->fp_glob
->fg_flag
& O_APPEND
) == 0) {
2014 flags
|= FOF_OFFSET
;
2017 /* NB: tell dofilewrite the offset, and to use the proc cred */
2018 error
= dofilewrite(&context
,
2020 entryp
->aiocb
.aio_buf
,
2021 entryp
->aiocb
.aio_nbytes
,
2022 entryp
->aiocb
.aio_offset
,
2024 &entryp
->returnval
);
2029 fp_drop(p
, entryp
->aiocb
.aio_fildes
, fp
, 0);
2035 * aio_has_active_requests_for_process - return whether the process has active requests pending.
2039 aio_has_active_requests_for_process(proc_t procp
)
2041 return !TAILQ_EMPTY(&procp
->p_aio_activeq
);
2045 * Called with the proc locked.
2048 aio_proc_has_active_requests_for_file(proc_t procp
, int fd
)
2050 aio_workq_entry
*entryp
;
2052 TAILQ_FOREACH(entryp
, &procp
->p_aio_activeq
, aio_proc_link
) {
2053 if (entryp
->aiocb
.aio_fildes
== fd
) {
2066 do_aio_fsync(aio_workq_entry
*entryp
)
2068 struct proc
*p
= entryp
->procp
;
2070 struct fileproc
*fp
;
2075 * We are never called unless either AIO_FSYNC or AIO_DSYNC are set.
2077 * If AIO_DSYNC is set, we can tell the lower layers that it is OK
2078 * to mark for update the metadata not strictly necessary for data
2079 * retrieval, rather than forcing it to disk.
2081 * If AIO_FSYNC is set, we also have to wait until metadata not strictly
2082 * necessary for data retrieval is committed to stable storage (e.g.
2083 * atime, mtime, ctime, etc.).
2085 * Metadata necessary for data retrieval must be committed to stable
2086 * storage in either case (file length, etc.).
2088 if (entryp
->flags
& AIO_FSYNC
) {
2089 sync_flag
= MNT_WAIT
;
2091 sync_flag
= MNT_DWAIT
;
2094 error
= fp_get_ftype(p
, entryp
->aiocb
.aio_fildes
, DTYPE_VNODE
, ENOTSUP
, &fp
);
2096 entryp
->returnval
= -1;
2099 vp
= fp
->fp_glob
->fg_data
;
2101 if ((error
= vnode_getwithref(vp
)) == 0) {
2102 struct vfs_context context
= {
2103 .vc_thread
= entryp
->thread
, /* XXX */
2104 .vc_ucred
= fp
->fp_glob
->fg_cred
,
2107 error
= VNOP_FSYNC(vp
, sync_flag
, &context
);
2109 (void)vnode_put(vp
);
2111 entryp
->returnval
= -1;
2114 fp_drop(p
, entryp
->aiocb
.aio_fildes
, fp
, 0);
2120 * is_already_queued - runs through our queues to see if the given
2121 * aiocbp / process is there. Returns TRUE if there is a match
2122 * on any of our aio queues.
2124 * Called with proc aio lock held (can be held spin)
2127 is_already_queued(proc_t procp
, user_addr_t aiocbp
)
2129 aio_workq_entry
*entryp
;
2134 /* look for matches on our queue of async IO requests that have completed */
2135 TAILQ_FOREACH(entryp
, &procp
->p_aio_doneq
, aio_proc_link
) {
2136 if (aiocbp
== entryp
->uaiocbp
) {
2138 goto ExitThisRoutine
;
2142 /* look for matches on our queue of active async IO requests */
2143 TAILQ_FOREACH(entryp
, &procp
->p_aio_activeq
, aio_proc_link
) {
2144 if (aiocbp
== entryp
->uaiocbp
) {
2146 goto ExitThisRoutine
;
2156 * aio initialization
2158 __private_extern__
void
2161 for (int i
= 0; i
< AIO_NUM_WORK_QUEUES
; i
++) {
2162 aio_workq_init(&aio_anchor
.aio_async_workqs
[i
]);
2165 _aio_create_worker_threads(aio_worker_threads
);
2170 * aio worker threads created here.
2172 __private_extern__
void
2173 _aio_create_worker_threads(int num
)
2177 /* create some worker threads to handle the async IO requests */
2178 for (i
= 0; i
< num
; i
++) {
2181 if (KERN_SUCCESS
!= kernel_thread_start(aio_work_thread
, NULL
, &myThread
)) {
2182 printf("%s - failed to create a work thread \n", __FUNCTION__
);
2184 thread_deallocate(myThread
);
2190 * Return the current activation utask
2195 return ((struct uthread
*)get_bsdthread_info(current_thread()))->uu_aio_task
;
2200 * In the case of an aiocb from a
2201 * 32-bit process we need to expand some longs and pointers to the correct
2202 * sizes in order to let downstream code always work on the same type of
2203 * aiocb (in our case that is a user_aiocb)
2206 do_munge_aiocb_user32_to_user(struct user32_aiocb
*my_aiocbp
, struct user_aiocb
*the_user_aiocbp
)
2208 the_user_aiocbp
->aio_fildes
= my_aiocbp
->aio_fildes
;
2209 the_user_aiocbp
->aio_offset
= my_aiocbp
->aio_offset
;
2210 the_user_aiocbp
->aio_buf
= CAST_USER_ADDR_T(my_aiocbp
->aio_buf
);
2211 the_user_aiocbp
->aio_nbytes
= my_aiocbp
->aio_nbytes
;
2212 the_user_aiocbp
->aio_reqprio
= my_aiocbp
->aio_reqprio
;
2213 the_user_aiocbp
->aio_lio_opcode
= my_aiocbp
->aio_lio_opcode
;
2215 /* special case here. since we do not know if sigev_value is an */
2216 /* int or a ptr we do NOT cast the ptr to a user_addr_t. This */
2217 /* means if we send this info back to user space we need to remember */
2218 /* sigev_value was not expanded for the 32-bit case. */
2219 /* NOTE - this does NOT affect us since we don't support sigev_value */
2220 /* yet in the aio context. */
2222 the_user_aiocbp
->aio_sigevent
.sigev_notify
= my_aiocbp
->aio_sigevent
.sigev_notify
;
2223 the_user_aiocbp
->aio_sigevent
.sigev_signo
= my_aiocbp
->aio_sigevent
.sigev_signo
;
2224 the_user_aiocbp
->aio_sigevent
.sigev_value
.size_equivalent
.sival_int
=
2225 my_aiocbp
->aio_sigevent
.sigev_value
.sival_int
;
2226 the_user_aiocbp
->aio_sigevent
.sigev_notify_function
=
2227 CAST_USER_ADDR_T(my_aiocbp
->aio_sigevent
.sigev_notify_function
);
2228 the_user_aiocbp
->aio_sigevent
.sigev_notify_attributes
=
2229 CAST_USER_ADDR_T(my_aiocbp
->aio_sigevent
.sigev_notify_attributes
);
2232 /* Similar for 64-bit user process, so that we don't need to satisfy
2233 * the alignment constraints of the original user64_aiocb
2239 do_munge_aiocb_user64_to_user(struct user64_aiocb
*my_aiocbp
, struct user_aiocb
*the_user_aiocbp
)
2242 the_user_aiocbp
->aio_fildes
= my_aiocbp
->aio_fildes
;
2243 the_user_aiocbp
->aio_offset
= my_aiocbp
->aio_offset
;
2244 the_user_aiocbp
->aio_buf
= my_aiocbp
->aio_buf
;
2245 the_user_aiocbp
->aio_nbytes
= my_aiocbp
->aio_nbytes
;
2246 the_user_aiocbp
->aio_reqprio
= my_aiocbp
->aio_reqprio
;
2247 the_user_aiocbp
->aio_lio_opcode
= my_aiocbp
->aio_lio_opcode
;
2249 the_user_aiocbp
->aio_sigevent
.sigev_notify
= my_aiocbp
->aio_sigevent
.sigev_notify
;
2250 the_user_aiocbp
->aio_sigevent
.sigev_signo
= my_aiocbp
->aio_sigevent
.sigev_signo
;
2251 the_user_aiocbp
->aio_sigevent
.sigev_value
.size_equivalent
.sival_int
=
2252 my_aiocbp
->aio_sigevent
.sigev_value
.size_equivalent
.sival_int
;
2253 the_user_aiocbp
->aio_sigevent
.sigev_notify_function
=
2254 my_aiocbp
->aio_sigevent
.sigev_notify_function
;
2255 the_user_aiocbp
->aio_sigevent
.sigev_notify_attributes
=
2256 my_aiocbp
->aio_sigevent
.sigev_notify_attributes
;
2258 #pragma unused(my_aiocbp, the_user_aiocbp)
2259 panic("64bit process on 32bit kernel is not supported");