/*
 * Copyright (c) 2003-2016 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * 1) ramesh is looking into how to replace taking a reference on
 *    the user's map (vm_map_reference()) since it is believed that
 *    would not hold the process for us.
 * 2) david is looking into a way for us to set the priority of the
 *    worker threads to match that of the user's thread when the
 *    async IO was queued.
 */
/*
 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
 */
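/*
 * For orientation, a minimal userspace sketch of the call sequence this
 * facility services (illustrative only, not part of this file; assumes an
 * already-open readable descriptor "fd" and <aio.h>/<errno.h> in scope):
 *
 *	struct aiocb cb = { 0 };
 *	cb.aio_fildes = fd;
 *	cb.aio_buf    = buffer;
 *	cb.aio_nbytes = sizeof(buffer);
 *	cb.aio_offset = 0;
 *
 *	if (aio_read(&cb) == 0) {                 // queued; serviced by aio_work_thread below
 *		while (aio_error(&cb) == EINPROGRESS)
 *			;                         // or block in aio_suspend()
 *		ssize_t nread = aio_return(&cb);  // reaps the request and frees kernel state
 *	}
 */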
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file_internal.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/vnode_internal.h>
#include <sys/malloc.h>
#include <sys/mount_internal.h>
#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/sysctl.h>
#include <sys/unistd.h>

#include <sys/aio_kern.h>
#include <sys/sysproto.h>

#include <machine/limits.h>

#include <mach/mach_types.h>
#include <kern/kern_types.h>
#include <kern/waitq.h>
#include <kern/zalloc.h>
#include <kern/task.h>
#include <kern/sched_prim.h>

#include <vm/vm_map.h>

#include <libkern/OSAtomic.h>

#include <sys/kdebug.h>
#define AIO_work_queued			1
#define AIO_worker_wake			2
#define AIO_completion_sig		3
#define AIO_completion_cleanup_wait	4
#define AIO_completion_cleanup_wake	5
#define AIO_completion_suspend_wake	6
#define AIO_fsync_delay			7
#define AIO_cancel_async_workq		11
#define AIO_cancel_sync_workq		12
#define AIO_cancel_activeq		13
#define AIO_cancel_doneq		14
#define AIO_error_val			61
#define AIO_error_activeq		62
#define AIO_error_workq			63
#define AIO_return_val			71
#define AIO_return_activeq		72
#define AIO_return_workq		73
#define AIO_exit_sleep			91
#define AIO_close			100
#define AIO_close_sleep			101
#define AIO_suspend			110
#define AIO_suspend_sleep		111
#define AIO_worker_thread		120

#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
/*
 * aio requests queue up on the aio_async_workq or lio_sync_workq (for
 * lio_listio LIO_WAIT). Requests then move to the per-process aio_activeq
 * (proc.aio_activeq) when one of our worker threads starts the IO.
 * And finally, requests move to the per-process aio_doneq (proc.aio_doneq)
 * when the IO request completes. The request remains on aio_doneq until the
 * user process calls aio_return or the process exits; either way, that is our
 * trigger to release aio resources.
 */
typedef struct aio_workq {
    TAILQ_HEAD(, aio_workq_entry)	aioq_entries;
    int					aioq_count;
    lck_mtx_t				aioq_mtx;
    struct waitq			aioq_waitq;
} *aio_workq_t;
#define AIO_NUM_WORK_QUEUES 1
struct aio_anchor_cb {
    volatile int32_t	aio_inflight_count;	/* entries that have been taken from a workq */
    volatile int32_t	aio_done_count;		/* entries on all done queues (proc.aio_doneq) */
    volatile int32_t	aio_total_count;	/* total extant entries */

    /* Hash table of queues here */
    struct aio_workq	aio_async_workqs[AIO_NUM_WORK_QUEUES];
};
typedef struct aio_anchor_cb aio_anchor_cb;
struct aio_lio_context {
    int		io_waiter;
    int		io_issued;
    int		io_completed;
};
typedef struct aio_lio_context aio_lio_context;
/*
 * Notes on aio sleep / wake channels.
 * We currently pick a couple of fields within the proc structure that give us
 * sleep channels that do not collide with any other kernel routines.
 * At this time, for binary compatibility reasons, we cannot create new proc fields.
 */
#define AIO_SUSPEND_SLEEP_CHAN  p_aio_active_count
#define AIO_CLEANUP_SLEEP_CHAN  p_aio_total_count

#define ASSERT_AIO_FROM_PROC(aiop, theproc) 	\
    if ((aiop)->procp != (theproc)) { 		\
	panic("AIO on a proc list that does not belong to that proc.\n"); \
    }
static void		aio_proc_lock(proc_t procp);
static void		aio_proc_lock_spin(proc_t procp);
static void		aio_proc_unlock(proc_t procp);
static lck_mtx_t*	aio_proc_mutex(proc_t procp);
static void		aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp);
static void		aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp);
static int		aio_get_process_count(proc_t procp);
static int		aio_active_requests_for_process(proc_t procp);
static int		aio_proc_active_requests_for_file(proc_t procp, int fd);
static boolean_t	is_already_queued(proc_t procp, user_addr_t aiocbp);
static boolean_t	should_cancel(aio_workq_entry *entryp, user_addr_t aiocbp, int fd);

static void		aio_entry_lock(aio_workq_entry *entryp);
static void		aio_entry_lock_spin(aio_workq_entry *entryp);
static aio_workq_t	aio_entry_workq(aio_workq_entry *entryp);
static lck_mtx_t*	aio_entry_mutex(__unused aio_workq_entry *entryp);
static void		aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
static void		aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
static void		aio_entry_ref_locked(aio_workq_entry *entryp);
static void		aio_entry_unref_locked(aio_workq_entry *entryp);
static void		aio_entry_ref(aio_workq_entry *entryp);
static void		aio_entry_unref(aio_workq_entry *entryp);
static void		aio_entry_update_for_cancel(aio_workq_entry *entryp, boolean_t cancelled,
			    int wait_for_completion, boolean_t disable_notification);
static int		aio_entry_try_workq_remove(aio_workq_entry *entryp);
static boolean_t	aio_delay_fsync_request(aio_workq_entry *entryp);
static int		aio_free_request(aio_workq_entry *entryp);

static void		aio_workq_init(aio_workq_t wq);
static void		aio_workq_lock_spin(aio_workq_t wq);
static void		aio_workq_unlock(aio_workq_t wq);
static lck_mtx_t*	aio_workq_mutex(aio_workq_t wq);

static void		aio_work_thread(void);
static aio_workq_entry	*aio_get_some_work(void);

static int		aio_get_all_queues_count(void);
static int		aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO);
static int		aio_validate(aio_workq_entry *entryp);
static int		aio_increment_total_count(void);
static int		aio_decrement_total_count(void);

static int		do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp, int wait_for_completion, boolean_t disable_notification);
static void		do_aio_completion(aio_workq_entry *entryp);
static int		do_aio_fsync(aio_workq_entry *entryp);
static int		do_aio_read(aio_workq_entry *entryp);
static int		do_aio_write(aio_workq_entry *entryp);
static void		do_munge_aiocb_user32_to_user(struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp);
static void		do_munge_aiocb_user64_to_user(struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp);
static int		lio_create_entry(proc_t procp, user_addr_t aiocbp, void *group_tag,
			    aio_workq_entry **entrypp);
static aio_workq_entry	*aio_create_queue_entry(proc_t procp, user_addr_t aiocbp, void *group_tag, int kindOfIO);
static user_addr_t	*aio_copy_in_list(proc_t procp, user_addr_t aiocblist, int nent);
static void		free_lio_context(aio_lio_context *context);
static void		aio_enqueue_work(proc_t procp, aio_workq_entry *entryp, int proc_locked);

#define ASSERT_AIO_PROC_LOCK_OWNED(p)	lck_mtx_assert(aio_proc_mutex((p)), LCK_MTX_ASSERT_OWNED)
#define ASSERT_AIO_WORKQ_LOCK_OWNED(q)	lck_mtx_assert(aio_workq_mutex((q)), LCK_MTX_ASSERT_OWNED)
#define ASSERT_AIO_ENTRY_LOCK_OWNED(e)	lck_mtx_assert(aio_entry_mutex((e)), LCK_MTX_ASSERT_OWNED)
/*
 * EXTERNAL PROTOTYPES
 */

/* in ...bsd/kern/sys_generic.c */
extern int dofileread(vfs_context_t ctx, struct fileproc *fp,
    user_addr_t bufp, user_size_t nbyte,
    off_t offset, int flags, user_ssize_t *retval);
extern int dofilewrite(vfs_context_t ctx, struct fileproc *fp,
    user_addr_t bufp, user_size_t nbyte, off_t offset,
    int flags, user_ssize_t *retval);
static uint32_t lio_contexts_alloced = 0;

/*
 * aio external global variables.
 */
extern int aio_max_requests;			/* AIO_MAX - configurable */
extern int aio_max_requests_per_process;	/* AIO_PROCESS_MAX - configurable */
extern int aio_worker_threads;			/* AIO_THREAD_COUNT - configurable */

/*
 * aio static variables.
 */
static aio_anchor_cb	aio_anchor;
static lck_grp_t	*aio_proc_lock_grp;
static lck_grp_t	*aio_entry_lock_grp;
static lck_grp_t	*aio_queue_lock_grp;
static lck_attr_t	*aio_lock_attr;
static lck_grp_attr_t	*aio_lock_grp_attr;
static struct zone	*aio_workq_zonep;
static lck_mtx_t	aio_entry_mtx;
static lck_mtx_t	aio_proc_mtx;
static void
aio_entry_lock(__unused aio_workq_entry *entryp)
{
    lck_mtx_lock(&aio_entry_mtx);
}

static void
aio_entry_lock_spin(__unused aio_workq_entry *entryp)
{
    lck_mtx_lock_spin(&aio_entry_mtx);
}

static void
aio_entry_unlock(__unused aio_workq_entry *entryp)
{
    lck_mtx_unlock(&aio_entry_mtx);
}

static aio_workq_t
aio_entry_workq(__unused aio_workq_entry *entryp)
{
    return &aio_anchor.aio_async_workqs[0];
}

static lck_mtx_t*
aio_entry_mutex(__unused aio_workq_entry *entryp)
{
    return &aio_entry_mtx;
}
static void
aio_workq_init(aio_workq_t wq)
{
    TAILQ_INIT(&wq->aioq_entries);
    wq->aioq_count = 0;
    lck_mtx_init(&wq->aioq_mtx, aio_queue_lock_grp, aio_lock_attr);
    waitq_init(&wq->aioq_waitq, SYNC_POLICY_FIFO);
}
/*
 * Can be passed a queue which is locked spin.
 */
static void
aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
{
    ASSERT_AIO_WORKQ_LOCK_OWNED(queue);

    if (entryp->aio_workq_link.tqe_prev == NULL) {
	panic("Trying to remove an entry from a work queue, but it is not on a queue\n");
    }

    TAILQ_REMOVE(&queue->aioq_entries, entryp, aio_workq_link);
    queue->aioq_count--;
    entryp->aio_workq_link.tqe_prev = NULL; /* Not on a workq */

    if (queue->aioq_count < 0) {
	panic("Negative count on a queue.\n");
    }
}
static void
aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
{
    ASSERT_AIO_WORKQ_LOCK_OWNED(queue);

    TAILQ_INSERT_TAIL(&queue->aioq_entries, entryp, aio_workq_link);
    if (queue->aioq_count < 0) {
	panic("Negative count on a queue.\n");
    }
    queue->aioq_count++;
}
static void
aio_proc_lock(proc_t procp)
{
    lck_mtx_lock(aio_proc_mutex(procp));
}

static void
aio_proc_lock_spin(proc_t procp)
{
    lck_mtx_lock_spin(aio_proc_mutex(procp));
}

static void
aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp)
{
    ASSERT_AIO_PROC_LOCK_OWNED(procp);

    TAILQ_REMOVE(&procp->p_aio_activeq, entryp, aio_proc_link);
    TAILQ_INSERT_TAIL(&procp->p_aio_doneq, entryp, aio_proc_link);
    procp->p_aio_active_count--;
    OSIncrementAtomic(&aio_anchor.aio_done_count);
}

static void
aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp)
{
    TAILQ_REMOVE(&procp->p_aio_doneq, entryp, aio_proc_link);
    OSDecrementAtomic(&aio_anchor.aio_done_count);
    aio_decrement_total_count();
    procp->p_aio_total_count--;
}

static void
aio_proc_unlock(proc_t procp)
{
    lck_mtx_unlock(aio_proc_mutex(procp));
}

static lck_mtx_t*
aio_proc_mutex(proc_t procp)
{
    return &procp->p_mlock;
}
static void
aio_entry_ref_locked(aio_workq_entry *entryp)
{
    ASSERT_AIO_ENTRY_LOCK_OWNED(entryp);

    if (entryp->aio_refcount < 0) {
	panic("AIO workq entry with a negative refcount.\n");
    }
    entryp->aio_refcount++;
}


/* Return 1 if you've freed it */
static void
aio_entry_unref_locked(aio_workq_entry *entryp)
{
    ASSERT_AIO_ENTRY_LOCK_OWNED(entryp);

    entryp->aio_refcount--;
    if (entryp->aio_refcount < 0) {
	panic("AIO workq entry with a negative refcount.\n");
    }
}

static void
aio_entry_ref(aio_workq_entry *entryp)
{
    aio_entry_lock_spin(entryp);
    aio_entry_ref_locked(entryp);
    aio_entry_unlock(entryp);
}

static void
aio_entry_unref(aio_workq_entry *entryp)
{
    aio_entry_lock_spin(entryp);
    aio_entry_unref_locked(entryp);

    if ((entryp->aio_refcount == 0) && ((entryp->flags & AIO_DO_FREE) != 0)) {
	aio_entry_unlock(entryp);
	aio_free_request(entryp);
    } else {
	aio_entry_unlock(entryp);
    }
}
static void
aio_entry_update_for_cancel(aio_workq_entry *entryp, boolean_t cancelled, int wait_for_completion, boolean_t disable_notification)
{
    aio_entry_lock_spin(entryp);

    if (cancelled) {
	aio_entry_ref_locked(entryp);
	entryp->errorval = ECANCELED;
	entryp->returnval = -1;
    }

    if (wait_for_completion) {
	entryp->flags |= wait_for_completion; /* flag for special completion processing */
    }

    if (disable_notification) {
	entryp->flags |= AIO_DISABLE; /* Don't want a signal */
    }

    aio_entry_unlock(entryp);
}
static int
aio_entry_try_workq_remove(aio_workq_entry *entryp)
{
    /* Can only be cancelled if it's still on a work queue */
    if (entryp->aio_workq_link.tqe_prev != NULL) {
	aio_workq_t queue;

	/* Will have to check again under the lock */
	queue = aio_entry_workq(entryp);
	aio_workq_lock_spin(queue);
	if (entryp->aio_workq_link.tqe_prev != NULL) {
	    aio_workq_remove_entry_locked(queue, entryp);
	    aio_workq_unlock(queue);
	    return 1;
	} else {
	    aio_workq_unlock(queue);
	}
    }

    return 0;
}
static void
aio_workq_lock_spin(aio_workq_t wq)
{
    lck_mtx_lock_spin(aio_workq_mutex(wq));
}

static void
aio_workq_unlock(aio_workq_t wq)
{
    lck_mtx_unlock(aio_workq_mutex(wq));
}

static lck_mtx_t*
aio_workq_mutex(aio_workq_t wq)
{
    return &wq->aioq_mtx;
}
/*
 * aio_cancel - attempt to cancel one or more async IO requests currently
 * outstanding against file descriptor uap->fd. If uap->aiocbp is not
 * NULL then only one specific IO is cancelled (if possible). If uap->aiocbp
 * is NULL then all outstanding async IO requests for the given file
 * descriptor are cancelled (if possible).
 */
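/*
 * Illustrative userspace call (sketch only, not part of this file): cancel
 * everything still pending against an open descriptor "fd".
 *
 *	int r = aio_cancel(fd, NULL);
 *	if (r == AIO_CANCELED || r == AIO_ALLDONE) {
 *		// nothing left in flight
 *	} else if (r == AIO_NOTCANCELED) {
 *		// some requests are active; poll aio_error() per aiocb
 *	}
 */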
int
aio_cancel(proc_t p, struct aio_cancel_args *uap, int *retval)
{
    struct user_aiocb	my_aiocb;
    int			result;

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START,
	(int)p, (int)uap->aiocbp, 0, 0, 0);

    /* quick check to see if there are any async IO requests queued up */
    if (aio_get_all_queues_count() < 1) {
	result = 0;
	*retval = AIO_ALLDONE;
	goto ExitRoutine;
    }

    result = 0;
    if (uap->aiocbp != USER_ADDR_NULL) {
	if (proc_is64bit(p)) {
	    struct user64_aiocb aiocb64;

	    result = copyin(uap->aiocbp, &aiocb64, sizeof(aiocb64));
	    if (result == 0) {
		do_munge_aiocb_user64_to_user(&aiocb64, &my_aiocb);
	    }
	} else {
	    struct user32_aiocb aiocb32;

	    result = copyin(uap->aiocbp, &aiocb32, sizeof(aiocb32));
	    if (result == 0) {
		do_munge_aiocb_user32_to_user(&aiocb32, &my_aiocb);
	    }
	}

	if (result != 0) {
	    result = EAGAIN;
	    goto ExitRoutine;
	}

	/* NOTE - POSIX standard says a mismatch between the file */
	/* descriptor passed in and the file descriptor embedded in */
	/* the aiocb causes unspecified results. We return EBADF in */
	/* that situation. */
	if (uap->fd != my_aiocb.aio_fildes) {
	    result = EBADF;
	    goto ExitRoutine;
	}
    }

    aio_proc_lock(p);
    result = do_aio_cancel_locked(p, uap->fd, uap->aiocbp, 0, FALSE);
    ASSERT_AIO_PROC_LOCK_OWNED(p);
    aio_proc_unlock(p);

    if (result != -1) {
	*retval = result;
	result = 0;
	goto ExitRoutine;
    }

    result = EBADF;

ExitRoutine:
    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END,
	(int)p, (int)uap->aiocbp, result, 0, 0);

    return result;
}
/*
 * _aio_close - internal function used to clean up async IO requests for
 * a file descriptor that is closing.
 */
__private_extern__ void
_aio_close(proc_t p, int fd)
{
    int error;

    /* quick check to see if there are any async IO requests queued up */
    if (aio_get_all_queues_count() < 1) {
	return;
    }

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_START,
	(int)p, fd, 0, 0, 0);

    /* cancel all async IO requests on our todo queues for this file descriptor */
    aio_proc_lock(p);
    error = do_aio_cancel_locked(p, fd, 0, AIO_CLOSE_WAIT, FALSE);
    ASSERT_AIO_PROC_LOCK_OWNED(p);
    if (error == AIO_NOTCANCELED) {
	/*
	 * AIO_NOTCANCELED is returned when we find an aio request for this process
	 * and file descriptor on the active async IO queue. Active requests cannot
	 * be cancelled so we must wait for them to complete. We will get a special
	 * wake up call on our channel used to sleep for ALL active requests to
	 * complete. This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
	 * when we must wait for all active aio requests.
	 */

	KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) | DBG_FUNC_NONE,
	    (int)p, fd, 0, 0, 0);

	while (aio_proc_active_requests_for_file(p, fd) > 0) {
	    msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_close", 0);
	}
    }
    aio_proc_unlock(p);

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_END,
	(int)p, fd, 0, 0, 0);
}
/*
 * aio_error - return the error status associated with the async IO
 * request referred to by uap->aiocbp. The error status is the errno
 * value that would be set by the corresponding IO request (read, write,
 * fdatasync, or sync).
 */
int
aio_error(proc_t p, struct aio_error_args *uap, int *retval)
{
    aio_workq_entry	*entryp;
    int			error;

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_START,
	(int)p, (int)uap->aiocbp, 0, 0, 0);

    /* see if there are any aios to check */
    if (aio_get_all_queues_count() < 1) {
	return EINVAL;
    }

    aio_proc_lock(p);
    error = EINVAL;

    /* look for a match on our queue of async IO requests that have completed */
    TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
	if (entryp->uaiocbp == uap->aiocbp) {
	    ASSERT_AIO_FROM_PROC(entryp, p);

	    aio_entry_lock_spin(entryp);
	    *retval = entryp->errorval;
	    error = 0;
	    aio_entry_unlock(entryp);
	    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) | DBG_FUNC_NONE,
		(int)p, (int)uap->aiocbp, *retval, 0, 0);
	    goto ExitRoutine;
	}
    }

    /* look for a match on our queue of active async IO requests */
    TAILQ_FOREACH(entryp, &p->p_aio_activeq, aio_proc_link) {
	if (entryp->uaiocbp == uap->aiocbp) {
	    ASSERT_AIO_FROM_PROC(entryp, p);
	    *retval = EINPROGRESS;
	    error = 0;
	    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) | DBG_FUNC_NONE,
		(int)p, (int)uap->aiocbp, *retval, 0, 0);
	    goto ExitRoutine;
	}
    }

ExitRoutine:
    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_END,
	(int)p, (int)uap->aiocbp, error, 0, 0);
    aio_proc_unlock(p);

    return error;
}
/*
 * aio_fsync - asynchronously force all IO operations associated
 * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
 * queued at the time of the call to the synchronized completion state.
 * NOTE - we do not support op O_DSYNC at this point since we do not support the
 * fdatasync() call.
 */
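/*
 * Illustrative userspace call sequence (sketch only, not part of this file;
 * assumes "cb" is an aiocb whose aio_fildes names an open, written file):
 *
 *	if (aio_fsync(O_SYNC, &cb) == 0) {
 *		while (aio_error(&cb) == EINPROGRESS)
 *			;                       // or wait in aio_suspend()
 *		(void)aio_return(&cb);          // 0 on success, -1 on error
 *	}
 */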
int
aio_fsync(proc_t p, struct aio_fsync_args *uap, int *retval)
{
    int		error;
    int		fsync_kind;

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_START,
	(int)p, (int)uap->aiocbp, uap->op, 0, 0);

    *retval = 0;
    /* 0 := O_SYNC for binary backward compatibility with Panther */
    if (uap->op == O_SYNC || uap->op == 0) {
	fsync_kind = AIO_FSYNC;
    } else if (uap->op == O_DSYNC) {
	fsync_kind = AIO_DSYNC;
    } else {
	*retval = -1;
	error = EINVAL;
	goto ExitRoutine;
    }

    error = aio_queue_async_request(p, uap->aiocbp, fsync_kind);
    if (error != 0) {
	*retval = -1;
    }

ExitRoutine:
    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_END,
	(int)p, (int)uap->aiocbp, error, 0, 0);

    return error;
}
/*
 * aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
 * file descriptor (uap->aiocbp->aio_fildes) into the buffer
 * (uap->aiocbp->aio_buf).
 */
int
aio_read(proc_t p, struct aio_read_args *uap, int *retval)
{
    int error;

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_START,
	(int)p, (int)uap->aiocbp, 0, 0, 0);

    *retval = 0;

    error = aio_queue_async_request(p, uap->aiocbp, AIO_READ);
    if (error != 0) {
	*retval = -1;
    }

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_END,
	(int)p, (int)uap->aiocbp, error, 0, 0);

    return error;
}
/*
 * aio_return - return the return status associated with the async IO
 * request referred to by uap->aiocbp. The return status is the value
 * that would be returned by the corresponding IO request (read, write,
 * fdatasync, or sync). This is where we release kernel resources
 * held for the async IO call associated with the given aiocb pointer.
 */
int
aio_return(proc_t p, struct aio_return_args *uap, user_ssize_t *retval)
{
    aio_workq_entry	*entryp;
    int			error = EINVAL;
    boolean_t		proc_lock_held = FALSE;

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_START,
	(int)p, (int)uap->aiocbp, 0, 0, 0);

    /* See if there are any entries to check */
    if (aio_get_all_queues_count() < 1) {
	goto ExitRoutine;
    }

    aio_proc_lock(p);
    proc_lock_held = TRUE;
    *retval = 0;

    /* look for a match on our queue of async IO requests that have completed */
    TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
	ASSERT_AIO_FROM_PROC(entryp, p);
	if (entryp->uaiocbp == uap->aiocbp) {
	    /* Done and valid for aio_return(), pull it off the list */
	    aio_proc_remove_done_locked(p, entryp);

	    /* Drop the proc lock, but keep the entry locked */
	    aio_entry_lock(entryp);
	    aio_proc_unlock(p);
	    proc_lock_held = FALSE;

	    *retval = entryp->returnval;
	    error = 0;

	    /* No references and off all lists, safe to free */
	    if (entryp->aio_refcount == 0) {
		aio_entry_unlock(entryp);
		aio_free_request(entryp);
	    } else {
		/* Whoever has the refcount will have to free it */
		entryp->flags |= AIO_DO_FREE;
		aio_entry_unlock(entryp);
	    }

	    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) | DBG_FUNC_NONE,
		(int)p, (int)uap->aiocbp, *retval, 0, 0);
	    goto ExitRoutine;
	}
    }

    /* look for a match on our queue of active async IO requests */
    TAILQ_FOREACH(entryp, &p->p_aio_activeq, aio_proc_link) {
	ASSERT_AIO_FROM_PROC(entryp, p);
	if (entryp->uaiocbp == uap->aiocbp) {
	    error = EINPROGRESS;
	    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) | DBG_FUNC_NONE,
		(int)p, (int)uap->aiocbp, *retval, 0, 0);
	    goto ExitRoutine;
	}
    }

ExitRoutine:
    if (proc_lock_held) {
	aio_proc_unlock(p);
    }

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_END,
	(int)p, (int)uap->aiocbp, error, 0, 0);

    return error;
}
/*
 * _aio_exec - internal function used to clean up async IO requests for
 * a process that is going away due to exec(). We cancel any async IOs
 * we can and wait for those already active. We also disable signaling
 * for cancelled or active aio requests that complete.
 * This routine MAY block!
 */
__private_extern__ void
_aio_exec(proc_t p)
{
    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_START,
	(int)p, 0, 0, 0, 0);

    _aio_exit(p);

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_END,
	(int)p, 0, 0, 0, 0);
}
/*
 * _aio_exit - internal function used to clean up async IO requests for
 * a process that is terminating (via exit() or exec()). We cancel any async IOs
 * we can and wait for those already active. We also disable signaling
 * for cancelled or active aio requests that complete. This routine MAY block!
 */
__private_extern__ void
_aio_exit(proc_t p)
{
    int			error;
    aio_workq_entry	*entryp;

    /* quick check to see if there are any async IO requests queued up */
    if (aio_get_all_queues_count() < 1) {
	return;
    }

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_START,
	(int)p, 0, 0, 0, 0);

    aio_proc_lock(p);

    /*
     * cancel async IO requests on the todo work queue and wait for those
     * already active to complete.
     */
    error = do_aio_cancel_locked(p, 0, 0, AIO_EXIT_WAIT, TRUE);
    ASSERT_AIO_PROC_LOCK_OWNED(p);
    if (error == AIO_NOTCANCELED) {
	/*
	 * AIO_NOTCANCELED is returned when we find an aio request for this process
	 * on the active async IO queue. Active requests cannot be cancelled so we
	 * must wait for them to complete. We will get a special wake up call on
	 * our channel used to sleep for ALL active requests to complete. This sleep
	 * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
	 * active aio requests.
	 */

	KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) | DBG_FUNC_NONE,
	    (int)p, 0, 0, 0, 0);

	while (p->p_aio_active_count != 0) {
	    msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_exit", 0);
	}
    }

    if (p->p_aio_active_count != 0) {
	panic("Exiting process has %d active AIOs after cancellation has completed.\n", p->p_aio_active_count);
    }

    /* release all aio resources used by this process */
    entryp = TAILQ_FIRST(&p->p_aio_doneq);
    while (entryp != NULL) {
	ASSERT_AIO_FROM_PROC(entryp, p);
	aio_workq_entry		*next_entryp;

	next_entryp = TAILQ_NEXT(entryp, aio_proc_link);
	aio_proc_remove_done_locked(p, entryp);

	/* we cannot free requests that are still completing */
	aio_entry_lock_spin(entryp);
	if (entryp->aio_refcount == 0) {
	    aio_entry_unlock(entryp);
	    aio_free_request(entryp);

	    /* need to start over since aio_doneq may have been */
	    /* changed while we were away. */
	    entryp = TAILQ_FIRST(&p->p_aio_doneq);
	    continue;
	} else {
	    /* whoever has the reference will have to do the free */
	    entryp->flags |= AIO_DO_FREE;
	}

	aio_entry_unlock(entryp);
	entryp = next_entryp;
    }

    aio_proc_unlock(p);

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END,
	(int)p, 0, 0, 0, 0);
}
static boolean_t
should_cancel(aio_workq_entry *entryp, user_addr_t aiocbp, int fd)
{
    if ((aiocbp == USER_ADDR_NULL && fd == 0) ||
	(aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
	(aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes)) {
	return TRUE;
    }

    return FALSE;
}
/*
 * do_aio_cancel_locked - cancel async IO requests (if possible). We get called by
 * aio_cancel, close, and at exit.
 * There are three modes of operation: 1) cancel all async IOs for a process -
 * fd is 0 and aiocbp is NULL 2) cancel all async IOs for a file descriptor - fd
 * is > 0 and aiocbp is NULL 3) cancel one async IO associated with the given
 * aiocbp.
 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
 * target async IO requests, and AIO_ALLDONE if all target async IO requests
 * were already complete.
 * WARNING - do not dereference aiocbp in this routine, it may point to user
 * land data that has not been copied in (when called from aio_cancel()).
 *
 * Called with proc locked, and returns the same way.
 */
static int
do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp,
    int wait_for_completion, boolean_t disable_notification)
{
    ASSERT_AIO_PROC_LOCK_OWNED(p);

    aio_workq_entry	*entryp;
    int			result;

    result = -1;

    /* look for a match on our queue of async todo work. */
    entryp = TAILQ_FIRST(&p->p_aio_activeq);
    while (entryp != NULL) {
	ASSERT_AIO_FROM_PROC(entryp, p);
	aio_workq_entry		*next_entryp;

	next_entryp = TAILQ_NEXT(entryp, aio_proc_link);
	if (!should_cancel(entryp, aiocbp, fd)) {
	    entryp = next_entryp;
	    continue;
	}

	/* Can only be cancelled if it's still on a work queue */
	if (aio_entry_try_workq_remove(entryp) != 0) {
	    /* Have removed from workq. Update entry state and take a ref */
	    aio_entry_update_for_cancel(entryp, TRUE, 0, disable_notification);

	    /* Put on the proc done queue and update counts, then unlock the proc */
	    aio_proc_move_done_locked(p, entryp);
	    aio_proc_unlock(p);

	    /* Now it's officially cancelled. Do the completion */
	    result = AIO_CANCELED;
	    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) | DBG_FUNC_NONE,
		(int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0);
	    do_aio_completion(entryp);

	    /* This will free if the aio_return() has already happened ... */
	    aio_entry_unref(entryp);
	    aio_proc_lock(p);

	    if (aiocbp != USER_ADDR_NULL) {
		return result;
	    }

	    /*
	     * Restart from the head of the proc active queue since it
	     * may have been changed while we were away doing completion
	     * processing.
	     *
	     * Note that if we found an uncancellable AIO before, we will
	     * either find it again or discover that it's been completed,
	     * so resetting the result will not cause us to return success
	     * despite outstanding AIOs.
	     */
	    entryp = TAILQ_FIRST(&p->p_aio_activeq);
	    result = -1; /* As if beginning anew */
	} else {
	    /*
	     * It's been taken off the active queue already, i.e. is in flight.
	     * All we can do is ask for notification.
	     */
	    result = AIO_NOTCANCELED;

	    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE,
		(int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0);

	    /* Mark for waiting and such; will not take a ref if "cancelled" arg is FALSE */
	    aio_entry_update_for_cancel(entryp, FALSE, wait_for_completion, disable_notification);

	    if (aiocbp != USER_ADDR_NULL) {
		return result;
	    }
	    entryp = next_entryp;
	}
    } /* while... */

    /*
     * if we didn't find any matches on the todo or active queues then look for a
     * match on our queue of async IO requests that have completed and if found
     * return AIO_ALLDONE result.
     *
     * Proc AIO lock is still held.
     */
    if (result == -1) {
	TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
	    ASSERT_AIO_FROM_PROC(entryp, p);
	    if (should_cancel(entryp, aiocbp, fd)) {
		result = AIO_ALLDONE;
		KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE,
		    (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0);

		if (aiocbp != USER_ADDR_NULL) {
		    return result;
		}
	    }
	}
    }

    return result;
} /* do_aio_cancel_locked */
/*
 * aio_suspend - suspend the calling thread until at least one of the async
 * IO operations referenced by uap->aiocblist has completed, until a signal
 * interrupts the function, or uap->timeoutp time interval (optional) has
 * elapsed.
 * Returns 0 if one or more async IOs have completed else -1 and errno is
 * set appropriately - EAGAIN if timeout elapses or EINTR if an interrupt
 * occurs.
 */
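/*
 * Illustrative userspace wait (sketch only, not part of this file; assumes
 * "cb" is an aiocb already queued with aio_read()/aio_write()):
 *
 *	const struct aiocb *list[1] = { &cb };
 *	struct timespec timeout = { .tv_sec = 5, .tv_nsec = 0 };
 *
 *	if (aio_suspend(list, 1, &timeout) == -1) {
 *		// errno is EAGAIN on timeout, EINTR if a signal interrupted us
 *	}
 */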
int
aio_suspend(proc_t p, struct aio_suspend_args *uap, int *retval)
{
    __pthread_testcancel(1);
    return aio_suspend_nocancel(p, (struct aio_suspend_nocancel_args *)uap, retval);
}


int
aio_suspend_nocancel(proc_t p, struct aio_suspend_nocancel_args *uap, int *retval)
{
    int			error;
    int			i;
    int			count;
    uint64_t		abstime;
    struct user_timespec ts;
    aio_workq_entry	*entryp;
    user_addr_t		*aiocbpp;

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START,
	(int)p, uap->nent, 0, 0, 0);

    *retval = -1;
    abstime = 0;
    aiocbpp = NULL;

    count = aio_get_all_queues_count();
    if (count < 1) {
	error = EINVAL;
	goto ExitThisRoutine;
    }

    if (uap->nent < 1 || uap->nent > aio_max_requests_per_process) {
	error = EINVAL;
	goto ExitThisRoutine;
    }

    if (uap->timeoutp != USER_ADDR_NULL) {
	if (proc_is64bit(p)) {
	    struct user64_timespec temp;
	    error = copyin(uap->timeoutp, &temp, sizeof(temp));
	    if (error == 0) {
		ts.tv_sec = temp.tv_sec;
		ts.tv_nsec = temp.tv_nsec;
	    }
	} else {
	    struct user32_timespec temp;
	    error = copyin(uap->timeoutp, &temp, sizeof(temp));
	    if (error == 0) {
		ts.tv_sec = temp.tv_sec;
		ts.tv_nsec = temp.tv_nsec;
	    }
	}
	if (error != 0) {
	    error = EAGAIN;
	    goto ExitThisRoutine;
	}

	if (ts.tv_sec < 0 || ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000) {
	    error = EINVAL;
	    goto ExitThisRoutine;
	}

	nanoseconds_to_absolutetime((uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
	    &abstime);
	clock_absolutetime_interval_to_deadline(abstime, &abstime);
    }

    aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent);
    if (aiocbpp == NULL) {
	error = EAGAIN;
	goto ExitThisRoutine;
    }

    /* check list of aio requests to see if any have completed */
check_for_our_aiocbp:
    aio_proc_lock_spin(p);
    for (i = 0; i < uap->nent; i++) {
	user_addr_t	aiocbp;

	/* NULL elements are legal so check for 'em */
	aiocbp = *(aiocbpp + i);
	if (aiocbp == USER_ADDR_NULL) {
	    continue;
	}

	/* return immediately if any aio request in the list is done */
	TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
	    ASSERT_AIO_FROM_PROC(entryp, p);
	    if (entryp->uaiocbp == aiocbp) {
		aio_proc_unlock(p);
		*retval = 0;
		error = 0;
		goto ExitThisRoutine;
	    }
	}
    } /* for ( ; i < uap->nent; ) */

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE,
	(int)p, uap->nent, 0, 0, 0);

    /*
     * wait for an async IO to complete or a signal fires or timeout expires.
     * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
     * interrupts us. If an async IO completes before a signal fires or our
     * timeout expires, we get a wakeup call from aio_work_thread().
     */

    error = msleep1(&p->AIO_SUSPEND_SLEEP_CHAN, aio_proc_mutex(p),
	PCATCH | PWAIT | PDROP, "aio_suspend", abstime); /* XXX better priority? */
    if (error == 0) {
	/*
	 * got our wakeup call from aio_work_thread().
	 * Since we can get a wakeup on this channel from another thread in the
	 * same process we head back up to make sure this is for the correct aiocbp.
	 * If it is the correct aiocbp we will return from where we do the check
	 * (see entryp->uaiocbp == aiocbp after check_for_our_aiocbp label)
	 * else we will fall out and just sleep again.
	 */
	goto check_for_our_aiocbp;
    } else if (error == EWOULDBLOCK) {
	/* our timeout expired */
	error = EAGAIN;
    } else {
	/* we were interrupted */
	error = EINTR;
    }

ExitThisRoutine:
    if (aiocbpp != NULL) {
	FREE(aiocbpp, M_TEMP);
    }

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_END,
	(int)p, uap->nent, error, 0, 0);

    return error;
}
/*
 * aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
 * file descriptor (uap->aiocbp->aio_fildes) from the buffer
 * (uap->aiocbp->aio_buf).
 */
int
aio_write(proc_t p, struct aio_write_args *uap, int *retval)
{
    int error;

    *retval = 0;

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_START,
	(int)p, (int)uap->aiocbp, 0, 0, 0);

    error = aio_queue_async_request(p, uap->aiocbp, AIO_WRITE);
    if (error != 0) {
	*retval = -1;
    }

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_END,
	(int)p, (int)uap->aiocbp, error, 0, 0);

    return error;
}
static user_addr_t *
aio_copy_in_list(proc_t procp, user_addr_t aiocblist, int nent)
{
    user_addr_t	*aiocbpp;
    int		i, result;

    /* we reserve enough space for largest possible pointer size */
    MALLOC(aiocbpp, user_addr_t *, (nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK);
    if (aiocbpp == NULL) {
	goto err;
    }

    /* copyin our aiocb pointers from list */
    result = copyin(aiocblist, aiocbpp,
	proc_is64bit(procp) ? (nent * sizeof(user64_addr_t))
	: (nent * sizeof(user32_addr_t)));
    if (result) {
	FREE(aiocbpp, M_TEMP);
	aiocbpp = NULL;
	goto err;
    }

    /*
     * We depend on a list of user_addr_t's so we need to
     * munge and expand when these pointers came from a
     * 32-bit process.
     */
    if (!proc_is64bit(procp)) {
	/* copy from last to first to deal with overlap */
	user32_addr_t	*my_ptrp = ((user32_addr_t *)aiocbpp) + (nent - 1);
	user_addr_t	*my_addrp = aiocbpp + (nent - 1);

	for (i = 0; i < nent; i++, my_ptrp--, my_addrp--) {
	    *my_addrp = (user_addr_t) (*my_ptrp);
	}
    }

err:
    return aiocbpp;
}
static int
aio_copy_in_sigev(proc_t procp, user_addr_t sigp, struct user_sigevent *sigev)
{
    int	result = 0;

    if (sigp == USER_ADDR_NULL) {
	goto out;
    }

    /*
     * We need to munge aio_sigevent since it contains pointers.
     * Since we do not know if sigev_value is an int or a ptr we do
     * NOT cast the ptr to a user_addr_t. This means if we send
     * this info back to user space we need to remember sigev_value
     * was not expanded for the 32-bit case.
     *
     * Notes: This does NOT affect us since we don't support
     * sigev_value yet in the aio context.
     */
    if (proc_is64bit(procp)) {
	struct user64_sigevent sigevent64;

	result = copyin(sigp, &sigevent64, sizeof(sigevent64));
	if (result == 0) {
	    sigev->sigev_notify = sigevent64.sigev_notify;
	    sigev->sigev_signo = sigevent64.sigev_signo;
	    sigev->sigev_value.size_equivalent.sival_int = sigevent64.sigev_value.size_equivalent.sival_int;
	    sigev->sigev_notify_function = sigevent64.sigev_notify_function;
	    sigev->sigev_notify_attributes = sigevent64.sigev_notify_attributes;
	}
    } else {
	struct user32_sigevent sigevent32;

	result = copyin(sigp, &sigevent32, sizeof(sigevent32));
	if (result == 0) {
	    sigev->sigev_notify = sigevent32.sigev_notify;
	    sigev->sigev_signo = sigevent32.sigev_signo;
	    sigev->sigev_value.size_equivalent.sival_int = sigevent32.sigev_value.sival_int;
	    sigev->sigev_notify_function = CAST_USER_ADDR_T(sigevent32.sigev_notify_function);
	    sigev->sigev_notify_attributes = CAST_USER_ADDR_T(sigevent32.sigev_notify_attributes);
	}
    }

    if (result != 0) {
	result = EAGAIN;
    }

out:
    return result;
}
/*
 * validate user_sigevent. at this point we only support
 * sigev_notify equal to SIGEV_SIGNAL or SIGEV_NONE. this means
 * sigev_value, sigev_notify_function, and sigev_notify_attributes
 * are ignored, since SIGEV_THREAD is unsupported. This is consistent
 * with no [RTS] (RealTime Signal) option group support.
 */
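/*
 * Illustrative userspace setup accepted by this validation (sketch only,
 * not part of this file): request SIGUSR1 on completion of an aiocb "cb".
 *
 *	cb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
 *	cb.aio_sigevent.sigev_signo  = SIGUSR1;
 *	// SIGEV_THREAD (sigev_notify_function) would be rejected here
 */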
static int
aio_sigev_validate(const struct user_sigevent *sigev)
{
    switch (sigev->sigev_notify) {
    case SIGEV_SIGNAL:
    {
	int signum;

	/* make sure we have a valid signal number */
	signum = sigev->sigev_signo;
	if (signum <= 0 || signum >= NSIG ||
	    signum == SIGKILL || signum == SIGSTOP) {
	    return EINVAL;
	}
    }
    break;

    case SIGEV_NONE:
	break;

    case SIGEV_THREAD:
    /* Unsupported [RTS] */

    default:
	return EINVAL;
    }

    return 0;
}
/*
 * Queue up the entry on the aio asynchronous work queue in priority order
 * based on the relative priority of the request. We calculate the relative
 * priority using the nice value of the caller and the value of the request
 * (aio_reqprio).
 *
 * Parameters:	procp			Process queueing the I/O
 *		entryp			The work queue entry being queued
 *
 * Returns:	(void)			No failure modes
 *
 * Notes:	This function is used for both lio_listio and aio
 *
 *	XXX:	At some point, we may have to consider thread priority
 *		rather than process priority, but we don't maintain the
 *		adjusted priority for threads the POSIX way.
 *
 * Called with proc locked.
 */
static void
aio_enqueue_work(proc_t procp, aio_workq_entry *entryp, int proc_locked)
{
    aio_workq_entry	*my_entryp;	/* used for insertion sort */
    aio_workq_t		queue = aio_entry_workq(entryp);

    if (proc_locked == 0) {
	aio_proc_lock(procp);
    }

    ASSERT_AIO_PROC_LOCK_OWNED(procp);

    /* Onto proc queue */
    TAILQ_INSERT_TAIL(&procp->p_aio_activeq, entryp, aio_proc_link);
    procp->p_aio_active_count++;
    procp->p_aio_total_count++;

    /* And work queue */
    aio_workq_lock_spin(queue);
    aio_workq_add_entry_locked(queue, entryp);
    waitq_wakeup64_one(&queue->aioq_waitq, CAST_EVENT64_T(queue),
	THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
    aio_workq_unlock(queue);

    if (proc_locked == 0) {
	aio_proc_unlock(procp);
    }

    /*
     * (1) The nice value is in the range PRIO_MIN..PRIO_MAX [-20..20]
     * (2) The normalized nice value is in the range 0..((2 * NZERO) - 1)
     *     which is [0..39], with 0 not being used. In nice values, the
     *     lower the nice value, the higher the priority.
     * (3) The normalized scheduling priority is the highest nice value
     *     minus the current nice value. In I/O scheduling priority, the
     *     higher the value the lower the priority, so it is the inverse
     *     of the nice value (the higher the number, the higher the I/O
     *     priority).
     * (4) From the normalized scheduling priority, we subtract the
     *     request priority to get the request priority value number;
     *     this means that requests are only capable of depressing their
     *     priority relative to other requests.
     */
    entryp->priority = (((2 * NZERO) - 1) - procp->p_nice);

    /* only permit depressing the priority */
    if (entryp->aiocb.aio_reqprio < 0) {
	entryp->aiocb.aio_reqprio = 0;
    }
    if (entryp->aiocb.aio_reqprio > 0) {
	entryp->priority -= entryp->aiocb.aio_reqprio;
	if (entryp->priority < 0) {
	    entryp->priority = 0;
	}
    }
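    /*
     * Worked example of the calculation above (illustrative comment only):
     * with NZERO == 20, a caller at nice 0 gets a normalized priority of
     * ((2 * 20) - 1) - 0 == 39; an aio_reqprio of 5 then depresses that to
     * 34, and a request can never raise itself above its process's value.
     */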
    /* Insertion sort the entry; lowest ->priority to highest */
    TAILQ_FOREACH(my_entryp, &aio_anchor.aio_async_workq, aio_workq_link) {
	if (entryp->priority <= my_entryp->priority) {
	    TAILQ_INSERT_BEFORE(my_entryp, entryp, aio_workq_link);
	    break;
	}
    }
    if (my_entryp == NULL) {
	TAILQ_INSERT_TAIL(&aio_anchor.aio_async_workq, entryp, aio_workq_link);
    }
}
/*
 * lio_listio - initiate a list of IO requests. We process the list of
 * aiocbs either synchronously (mode == LIO_WAIT) or asynchronously
 * (mode == LIO_NOWAIT).
 *
 * The caller gets error and return status for each aiocb in the list
 * via aio_error and aio_return. We must keep completed requests until
 * released by the aio_return call.
 */
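/*
 * Illustrative userspace call (sketch only, not part of this file; assumes
 * two aiocbs "rd" and "wr" filled in with LIO_READ / LIO_WRITE opcodes):
 *
 *	struct aiocb *list[2] = { &rd, &wr };
 *
 *	if (lio_listio(LIO_WAIT, list, 2, NULL) == 0) {
 *		// both IOs are complete; reap each with aio_return()
 *		ssize_t r = aio_return(&rd);
 *		ssize_t w = aio_return(&wr);
 *	}
 */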
int
lio_listio(proc_t p, struct lio_listio_args *uap, int *retval)
{
    int			i;
    int			call_result;
    int			result;
    int			old_count;
    aio_workq_entry	**entryp_listp;
    user_addr_t		*aiocbpp;
    struct user_sigevent aiosigev;
    aio_lio_context	*lio_context;
    boolean_t		free_context = FALSE;
    uint32_t		*paio_offset;
    uint32_t		*paio_nbytes;

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START,
	(int)p, uap->nent, uap->mode, 0, 0);

    entryp_listp = NULL;
    lio_context = NULL;
    aiocbpp = NULL;
    call_result = -1;
    *retval = -1;

    if (!(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT)) {
	call_result = EINVAL;
	goto ExitRoutine;
    }
    if (uap->nent < 1 || uap->nent > AIO_LISTIO_MAX) {
	call_result = EINVAL;
	goto ExitRoutine;
    }

    /*
     * allocate a list of aio_workq_entry pointers that we will use
     * to queue up all our requests at once while holding our lock.
     */
    MALLOC(entryp_listp, void *, (uap->nent * sizeof(aio_workq_entry *)), M_TEMP, M_WAITOK);
    if (entryp_listp == NULL) {
	call_result = EAGAIN;
	goto ExitRoutine;
    }

    MALLOC(lio_context, aio_lio_context*, sizeof(aio_lio_context), M_TEMP, M_WAITOK);
    if (lio_context == NULL) {
	call_result = EAGAIN;
	goto ExitRoutine;
    }

    OSIncrementAtomic(&lio_contexts_alloced);

    free_context = TRUE;
    bzero(lio_context, sizeof(aio_lio_context));

    aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent);
    if (aiocbpp == NULL) {
	call_result = EAGAIN;
	goto ExitRoutine;
    }

    /*
     * Use sigevent passed in to lio_listio for each of our calls, but
     * only do completion notification after the last request completes.
     */
    bzero(&aiosigev, sizeof(aiosigev));
    /* Only copy in an sigev if the user supplied one */
    if (uap->sigp != USER_ADDR_NULL) {
	call_result = aio_copy_in_sigev(p, uap->sigp, &aiosigev);
	if (call_result) {
	    goto ExitRoutine;
	}
	call_result = aio_sigev_validate(&aiosigev);
	if (call_result) {
	    goto ExitRoutine;
	}
    }

    /* process list of aio requests */
    free_context = FALSE;
    lio_context->io_issued = uap->nent;
    lio_context->io_waiter = uap->mode == LIO_WAIT ? 1 : 0; /* Should it be freed by last AIO */
    for (i = 0; i < uap->nent; i++) {
	user_addr_t		my_aiocbp;
	aio_workq_entry		*entryp;

	*(entryp_listp + i) = NULL;
	my_aiocbp = *(aiocbpp + i);

	/* NULL elements are legal so check for 'em */
	if (my_aiocbp == USER_ADDR_NULL) {
	    aio_proc_lock_spin(p);
	    lio_context->io_issued--;
	    aio_proc_unlock(p);
	    continue;
	}

	/*
	 * We use lio_context to mark IO requests for delayed completion
	 * processing which means we wait until all IO requests in the
	 * group have completed before we either return to the caller
	 * when mode is LIO_WAIT or signal user when mode is LIO_NOWAIT.
	 *
	 * We use the address of the lio_context for this, since it is
	 * unique in the address space.
	 */
	result = lio_create_entry(p, my_aiocbp, lio_context, (entryp_listp + i));
	if (result != 0 && call_result == -1) {
	    call_result = result;
	}

	/* NULL elements are legal so check for 'em */
	entryp = *(entryp_listp + i);
	if (entryp == NULL) {
	    aio_proc_lock_spin(p);
	    lio_context->io_issued--;
	    aio_proc_unlock(p);
	    continue;
	}

	if (uap->mode == LIO_NOWAIT) {
	    /* Set signal handler, if any */
	    entryp->aiocb.aio_sigevent = aiosigev;
	} else {
	    /* flag that this thread blocks pending completion */
	    entryp->flags |= AIO_LIO_NOTIFY;
	}

	/* check our aio limits to throttle bad or rude user land behavior */
	old_count = aio_increment_total_count();

	aio_proc_lock_spin(p);
	if (old_count >= aio_max_requests ||
	    aio_get_process_count(entryp->procp) >= aio_max_requests_per_process ||
	    is_already_queued(entryp->procp, entryp->uaiocbp) == TRUE) {
	    lio_context->io_issued--;
	    aio_proc_unlock(p);

	    aio_decrement_total_count();

	    if (call_result == -1) {
		call_result = EAGAIN;
	    }
	    aio_free_request(entryp);
	    entryp_listp[i] = NULL;
	    continue;
	}

	lck_mtx_convert_spin(aio_proc_mutex(p));
	aio_enqueue_work(p, entryp, 1);
	aio_proc_unlock(p);

	KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_START,
	    (int)p, (int)entryp->uaiocbp, entryp->flags, entryp->aiocb.aio_fildes, 0);
	paio_offset = (uint32_t*) &entryp->aiocb.aio_offset;
	paio_nbytes = (uint32_t*) &entryp->aiocb.aio_nbytes;
	KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_END,
	    paio_offset[0], (sizeof(entryp->aiocb.aio_offset) == sizeof(uint64_t) ? paio_offset[1] : 0),
	    paio_nbytes[0], (sizeof(entryp->aiocb.aio_nbytes) == sizeof(uint64_t) ? paio_nbytes[1] : 0),
	    0);
    }

    aio_proc_lock_spin(p);
    switch (uap->mode) {
    case LIO_WAIT:
	while (lio_context->io_completed < lio_context->io_issued) {
	    result = msleep(lio_context, aio_proc_mutex(p), PCATCH | PRIBIO | PSPIN, "lio_listio", 0);

	    /* If we were interrupted, fail out (even if all finished) */
	    if (result != 0) {
		call_result = EINTR;
		lio_context->io_waiter = 0;
		break;
	    }
	}

	/* If all IOs have finished must free it */
	if (lio_context->io_completed == lio_context->io_issued) {
	    free_context = TRUE;
	}
	break;

    case LIO_NOWAIT:
	/* If no IOs were issued must free it (rdar://problem/45717887) */
	if (lio_context->io_issued == 0) {
	    free_context = TRUE;
	}
	break;
    }
    aio_proc_unlock(p);

    /* call_result == -1 means we had no trouble queueing up requests */
    if (call_result == -1) {
	call_result = 0;
	*retval = 0;
    }

ExitRoutine:
    if (entryp_listp != NULL) {
	FREE(entryp_listp, M_TEMP);
    }
    if (aiocbpp != NULL) {
	FREE(aiocbpp, M_TEMP);
    }
    if (free_context) {
	free_lio_context(lio_context);
    }

    KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END,
	(int)p, call_result, 0, 0, 0);

    return call_result;
}
/*
 * aio worker thread. this is where all the real work gets done.
 * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
 * after new work is queued up.
 */
__attribute__((noreturn))
static void
aio_work_thread(void)
{
    aio_workq_entry	*entryp;
    int			error;
    vm_map_t		currentmap;
    vm_map_t		oldmap = VM_MAP_NULL;
    task_t		oldaiotask = TASK_NULL;
    struct uthread	*uthreadp = NULL;

    for (;;) {
	/*
	 * returns with the entry ref'ed.
	 * sleeps until work is available.
	 */
	entryp = aio_get_some_work();

	KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START,
	    (int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0);

	/*
	 * Assume the target's address space identity for the duration
	 * of the IO. Note: don't need to have the entryp locked,
	 * because the proc and map don't change until it's freed.
	 */
	currentmap = get_task_map((current_proc())->task);
	if (currentmap != entryp->aio_map) {
	    uthreadp = (struct uthread *) get_bsdthread_info(current_thread());
	    oldaiotask = uthreadp->uu_aio_task;
	    uthreadp->uu_aio_task = entryp->procp->task;
	    oldmap = vm_map_switch(entryp->aio_map);
	}

	if ((entryp->flags & AIO_READ) != 0) {
	    error = do_aio_read(entryp);
	} else if ((entryp->flags & AIO_WRITE) != 0) {
	    error = do_aio_write(entryp);
	} else if ((entryp->flags & (AIO_FSYNC | AIO_DSYNC)) != 0) {
	    error = do_aio_fsync(entryp);
	} else {
	    printf("%s - unknown aio request - flags 0x%02X \n",
		__FUNCTION__, entryp->flags);
	    error = EINVAL;
	}

	/* Restore old map */
	if (currentmap != entryp->aio_map) {
	    (void) vm_map_switch(oldmap);
	    uthreadp->uu_aio_task = oldaiotask;
	}

	KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END,
	    (int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval,
	    entryp->returnval, 0);

	aio_entry_lock_spin(entryp);
	entryp->errorval = error;
	aio_entry_unlock(entryp);

	/* we're done with the IO request so pop it off the active queue and */
	/* push it on the done queue */
	aio_proc_lock(entryp->procp);
	aio_proc_move_done_locked(entryp->procp, entryp);
	aio_proc_unlock(entryp->procp);

	OSDecrementAtomic(&aio_anchor.aio_inflight_count);

	/* remove our reference to the user land map. */
	if (VM_MAP_NULL != entryp->aio_map) {
	    vm_map_t	my_map;

	    my_map = entryp->aio_map;
	    entryp->aio_map = VM_MAP_NULL;
	    vm_map_deallocate(my_map);
	}

	/* Provide notifications */
	do_aio_completion(entryp);

	/* Will free if needed */
	aio_entry_unref(entryp);
    }
} /* aio_work_thread */
/*
 * aio_get_some_work - get the next async IO request that is ready to be executed.
 * aio_fsync complicates matters a bit since we cannot do the fsync until all async
 * IO requests at the time the aio_fsync call came in have completed.
 * NOTE - AIO_LOCK must be held by caller
 */
static aio_workq_entry *
aio_get_some_work( void )
{
    aio_workq_entry	*entryp = NULL;
    aio_workq_t		queue = NULL;

    /* Just one queue for the moment. In the future there will be many. */
    queue = &aio_anchor.aio_async_workqs[0];
    aio_workq_lock_spin(queue);
    if (queue->aioq_count == 0) {
	goto nowork;
    }

    /*
     * Hold the queue lock.
     *
     * pop some work off the work queue and add to our active queue
     * Always start with the queue lock held.
     */
    for (;;) {
	/*
	 * Pull off of work queue. Once it's off, it can't be cancelled,
	 * so we can take our ref once we drop the queue lock.
	 */
	entryp = TAILQ_FIRST(&queue->aioq_entries);

	/*
	 * If there's no work or only fsyncs that need delay, go to sleep
	 * and then start anew from aio_work_thread
	 */
	if (entryp == NULL) {
	    goto nowork;
	}

	aio_workq_remove_entry_locked(queue, entryp);

	aio_workq_unlock(queue);

	/*
	 * Check if it's an fsync that must be delayed. No need to lock the entry;
	 * that flag would have been set at initialization.
	 */
	if ((entryp->flags & AIO_FSYNC) != 0) {
	    /*
	     * Check for unfinished operations on the same file
	     * in this proc's queue.
	     */
	    aio_proc_lock_spin(entryp->procp);
	    if (aio_delay_fsync_request(entryp)) {
		/* It needs to be delayed. Put it back on the end of the work queue */
		KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) | DBG_FUNC_NONE,
		    (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0);

		aio_proc_unlock(entryp->procp);

		aio_workq_lock_spin(queue);
		aio_workq_add_entry_locked(queue, entryp);
		continue;
	    }
	    aio_proc_unlock(entryp->procp);
	}

	break;
    }

    aio_entry_ref(entryp);

    OSIncrementAtomic(&aio_anchor.aio_inflight_count);
    return entryp;

nowork:
    /* We will wake up when someone enqueues something */
    waitq_assert_wait64(&queue->aioq_waitq, CAST_EVENT64_T(queue), THREAD_UNINT, 0);
    aio_workq_unlock(queue);
    thread_block((thread_continue_t)aio_work_thread);

    /* NOT REACHED */
    return NULL;
}
/*
 * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed.
 * A big, simple hammer: only send it off if it's the most recently filed IO which has
 * not been completed.
 */
static boolean_t
aio_delay_fsync_request(aio_workq_entry *entryp)
{
    if (entryp == TAILQ_FIRST(&entryp->procp->p_aio_activeq)) {
	return FALSE;
    }

    return TRUE;
} /* aio_delay_fsync_request */
static aio_workq_entry *
aio_create_queue_entry(proc_t procp, user_addr_t aiocbp, void *group_tag, int kindOfIO)
{
    aio_workq_entry	*entryp;
    int			result = 0;

    entryp = (aio_workq_entry *) zalloc(aio_workq_zonep);
    if (entryp == NULL) {
	result = EAGAIN;
	goto error_exit;
    }

    bzero(entryp, sizeof(*entryp));

    /* fill in the rest of the aio_workq_entry */
    entryp->procp = procp;
    entryp->uaiocbp = aiocbp;
    entryp->flags |= kindOfIO;
    entryp->group_tag = group_tag;
    entryp->aio_map = VM_MAP_NULL;
    entryp->aio_refcount = 0;

    if (proc_is64bit(procp)) {
	struct user64_aiocb aiocb64;

	result = copyin(aiocbp, &aiocb64, sizeof(aiocb64));
	if (result == 0) {
	    do_munge_aiocb_user64_to_user(&aiocb64, &entryp->aiocb);
	}
    } else {
	struct user32_aiocb aiocb32;

	result = copyin(aiocbp, &aiocb32, sizeof(aiocb32));
	if (result == 0) {
	    do_munge_aiocb_user32_to_user(&aiocb32, &entryp->aiocb);
	}
    }

    if (result != 0) {
	result = EAGAIN;
	goto error_exit;
    }

    /* get a reference to the user land map in order to keep it around */
    entryp->aio_map = get_task_map(procp->task);
    vm_map_reference(entryp->aio_map);

    /* do some more validation on the aiocb and embedded file descriptor */
    result = aio_validate(entryp);
    if (result != 0) {
	goto error_exit_with_ref;
    }

    /* get a reference on the current_thread, which is passed in vfs_context. */
    entryp->thread = current_thread();
    thread_reference(entryp->thread);
    return entryp;

error_exit_with_ref:
    if (VM_MAP_NULL != entryp->aio_map) {
	vm_map_deallocate(entryp->aio_map);
    }
error_exit:
    if (result && entryp != NULL) {
	zfree(aio_workq_zonep, entryp);
	entryp = NULL;
    }

    return entryp;
}
/*
 * aio_queue_async_request - queue up an async IO request on our work queue then
 * wake up one of our worker threads to do the actual work. We get a reference
 * to our caller's user land map in order to keep it around while we are
 * processing the request.
 */
static int
aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO)
{
    aio_workq_entry	*entryp;
    int			result;
    int			old_count;
    uint32_t		*paio_offset;
    uint32_t		*paio_nbytes;

    old_count = aio_increment_total_count();
    if (old_count >= aio_max_requests) {
	result = EAGAIN;
	goto error_noalloc;
    }

    entryp = aio_create_queue_entry(procp, aiocbp, 0, kindOfIO);
    if (entryp == NULL) {
	result = EAGAIN;
	goto error_noalloc;
    }

    aio_proc_lock_spin(procp);

    if (is_already_queued(entryp->procp, entryp->uaiocbp) == TRUE) {
	result = EAGAIN;
	goto error_exit;
    }

    /* check our aio limits to throttle bad or rude user land behavior */
    if (aio_get_process_count(procp) >= aio_max_requests_per_process) {
	printf("aio_queue_async_request(): too many in flight for proc: %d.\n", procp->p_aio_total_count);
	result = EAGAIN;
	goto error_exit;
    }

    /* Add the IO to proc and work queues, wake up threads as appropriate */
    lck_mtx_convert_spin(aio_proc_mutex(procp));
    aio_enqueue_work(procp, entryp, 1);

    aio_proc_unlock(procp);

    paio_offset = (uint32_t*) &entryp->aiocb.aio_offset;
    paio_nbytes = (uint32_t*) &entryp->aiocb.aio_nbytes;
    KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_START,
	(int)procp, (int)aiocbp, entryp->flags, entryp->aiocb.aio_fildes, 0);
    KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_END,
	paio_offset[0], (sizeof(entryp->aiocb.aio_offset) == sizeof(uint64_t) ? paio_offset[1] : 0),
	paio_nbytes[0], (sizeof(entryp->aiocb.aio_nbytes) == sizeof(uint64_t) ? paio_nbytes[1] : 0),
	0);

    return 0;

error_exit:
    /*
     * This entry has not been queued up so no worries about
     * unlocked state and aio_map
     */
    aio_proc_unlock(procp);
    aio_free_request(entryp);

error_noalloc:
    aio_decrement_total_count();

    return result;
} /* aio_queue_async_request */
/*
 * Allocate an aio_workq_entry and fill it in. If all goes well return 0
 * and pass the aio_workq_entry pointer back to our caller.
 *
 * Parameters:	procp			The process making the request
 *		aiocbp			The aio context buffer pointer
 *		group_tag		The group tag used to indicate a
 *					group of operations has completed
 *		entrypp			Pointer to the pointer to receive the
 *					address of the created aio_workq_entry
 *
 * Returns:	0			Successfully created
 *		EAGAIN			Try again (usually resource shortage)
 *
 * Notes:	We get a reference to our caller's user land map in order
 *		to keep it around while we are processing the request.
 *
 *		lio_listio calls behave differently at completion: they do
 *		completion notification when all async IO requests have
 *		completed. We use group_tag to tag IO requests that behave
 *		in the delay notification manner.
 *
 *		All synchronous operations are considered to not have a
 *		signal routine associated with them (sigp == USER_ADDR_NULL).
 */
static int
lio_create_entry(proc_t procp, user_addr_t aiocbp, void *group_tag,
    aio_workq_entry **entrypp)
{
    aio_workq_entry	*entryp;
    int			result;

    entryp = aio_create_queue_entry(procp, aiocbp, group_tag, AIO_LIO);
    if (entryp == NULL) {
	result = EAGAIN;
	goto error_exit;
    }

    /*
     * Look for lio_listio LIO_NOP requests and ignore them; this is
     * not really an error, but we need to free our aio_workq_entry.
     */
    if (entryp->aiocb.aio_lio_opcode == LIO_NOP) {
	result = 0;
	goto error_exit;
    }

    *entrypp = entryp;
    return 0;

error_exit:
    if (entryp != NULL) {
	/*
	 * This entry has not been queued up so no worries about
	 * unlocked state and aio_map
	 */
	aio_free_request(entryp);
    }

    return result;
} /* lio_create_entry */
/*
 * aio_free_request - remove our reference on the user land map and
 * free the work queue entry resources. The entry is off all lists
 * and has zero refcount, so no one can have a pointer to it.
 */
static int
aio_free_request(aio_workq_entry *entryp)
{
    /* remove our reference to the user land map. */
    if (VM_MAP_NULL != entryp->aio_map) {
	vm_map_deallocate(entryp->aio_map);
    }

    /* remove our reference to thread which enqueued the request */
    if (NULL != entryp->thread) {
	thread_deallocate(entryp->thread);
    }

    entryp->aio_refcount = -1; /* A bit of poisoning in case of bad refcounting. */

    zfree(aio_workq_zonep, entryp);

    return 0;
} /* aio_free_request */
2141 * validate the aiocb passed in by one of the aio syscalls.
2144 aio_validate( aio_workq_entry *entryp )
2146 	struct fileproc *fp;
2152 	if ((entryp->flags & AIO_LIO) != 0) {
2153 		if (entryp->aiocb.aio_lio_opcode == LIO_READ) {
2154 			entryp->flags |= AIO_READ;
2155 		} else if (entryp->aiocb.aio_lio_opcode == LIO_WRITE) {
2156 			entryp->flags |= AIO_WRITE;
2157 		} else if (entryp->aiocb.aio_lio_opcode == LIO_NOP) {
2165 	if ((entryp->flags & (AIO_WRITE | AIO_FSYNC | AIO_DSYNC)) != 0) {
2169 	if ((entryp->flags & (AIO_READ | AIO_WRITE)) != 0) {
2170 		if (entryp->aiocb.aio_nbytes > INT_MAX ||
2171 		    entryp->aiocb.aio_buf == USER_ADDR_NULL ||
2172 		    entryp->aiocb.aio_offset < 0) {
2177 	result = aio_sigev_validate(&entryp->aiocb.aio_sigevent);
2182 	/* validate the file descriptor and that the file was opened
2183 	 * for the appropriate read / write access.
2185 	proc_fdlock(entryp->procp);
2187 	result = fp_lookup( entryp->procp, entryp->aiocb.aio_fildes, &fp, 1);
2189 	if ((fp->f_fglob->fg_flag & flag) == 0) {
2190 		/* we don't have read or write access */
2192 	} else if (FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_VNODE) {
2193 		/* this is not a file */
2196 		fp->f_flags |= FP_AIOISSUED;
2199 	fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 1);
2204 	proc_fdunlock(entryp->procp);
2207 } /* aio_validate */
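/*
 * Illustrative user-space sketch (not part of this file): an aiocb that
 * satisfies the checks above -- a byte count no larger than INT_MAX, a
 * non-NULL buffer, a non-negative offset, and a descriptor opened for the
 * requested access.  The file name and buffer size are arbitrary example
 * values; validation failures simply surface as an error from aio_read().
 */
#if 0	/* example only; never compiled as part of this file */
#include <aio.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	static char buf[4096];
	struct aiocb cb;
	const struct aiocb *list[1];
	int fd = open("input.dat", O_RDONLY);	/* read access for an AIO_READ request */

	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_buf = buf;			/* must not be NULL */
	cb.aio_nbytes = sizeof(buf);		/* must not exceed INT_MAX */
	cb.aio_offset = 0;			/* must not be negative */

	if (aio_read(&cb) != 0) {
		perror("aio_read");
		return 1;
	}

	list[0] = &cb;
	while (aio_error(&cb) == EINPROGRESS) {
		aio_suspend(list, 1, NULL);	/* block until the request completes */
	}
	printf("transferred %zd bytes\n", aio_return(&cb));
	close(fd);
	return 0;
}
#endif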
2210 aio_increment_total_count()
2212 	return OSIncrementAtomic(&aio_anchor.aio_total_count);
2216 aio_decrement_total_count()
2218 	int old = OSDecrementAtomic(&aio_anchor.aio_total_count);
2220 		panic("Negative total AIO count!\n");
2227 aio_get_process_count(proc_t procp)
2229 	return procp->p_aio_total_count;
2230 } /* aio_get_process_count */
2233 aio_get_all_queues_count( void )
2235 	return aio_anchor.aio_total_count;
2236 } /* aio_get_all_queues_count */
2240 * do_aio_completion.  Handle async IO completion.
2243 do_aio_completion( aio_workq_entry *entryp )
2245 	boolean_t lastLioCompleted = FALSE;
2246 	aio_lio_context *lio_context = NULL;
2249 	lio_context = (aio_lio_context *)entryp->group_tag;
2251 	if (lio_context != NULL) {
2252 		aio_proc_lock_spin(entryp->procp);
2254 		/* Account for this I/O completing. */
2255 		lio_context->io_completed++;
2257 		/* Are we done with this lio context? */
2258 		if (lio_context->io_issued == lio_context->io_completed) {
2259 			lastLioCompleted = TRUE;
2262 		waiter = lio_context->io_waiter;
2264 		/* explicit wakeup of lio_listio() waiting in LIO_WAIT */
2265 		if ((entryp->flags & AIO_LIO_NOTIFY) && (lastLioCompleted) && (waiter != 0)) {
2266 			/* wake up the waiter */
2267 			wakeup(lio_context);
2270 		aio_proc_unlock(entryp->procp);
2273 	if (entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
2274 	    (entryp->flags & AIO_DISABLE) == 0) {
2275 		boolean_t performSignal = FALSE;
2276 		if (lio_context == NULL) {
2277 			performSignal = TRUE;
2280 			 * If this was the last request in the group and a signal
2281 			 * is desired, send one.
2283 			performSignal = lastLioCompleted;
2286 		if (performSignal) {
2287 			KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) | DBG_FUNC_NONE,
2288 			    (int)entryp->procp, (int)entryp->uaiocbp,
2289 			    entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 );
2291 			psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );
2295 	if ((entryp->flags & AIO_EXIT_WAIT) && (entryp->flags & AIO_CLOSE_WAIT)) {
2296 		panic("Close and exit flags set at the same time\n");
2300 	 * need to handle case where a process is trying to exit, exec, or
2301 	 * close and is currently waiting for active aio requests to complete.
2302 	 * If AIO_CLEANUP_WAIT is set then we need to look to see if there are any
2303 	 * other requests in the active queue for this process.  If there are
2304 	 * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel.
2305 	 * If there are some still active then do nothing - we only want to
2306 	 * wakeup when all active aio requests for the process are complete.
2308 	 * Don't need to lock the entry or proc to check the cleanup flag.  It can only be
2309 	 * set for cancellation, while the entryp is still on a proc list; now it's
2310 	 * off, so that flag is already set if it's going to be.
2312 	if ((entryp->flags & AIO_EXIT_WAIT) != 0) {
2313 		int active_requests;
2315 		KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
2316 		    (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
2318 		aio_proc_lock_spin(entryp->procp);
2319 		active_requests = aio_active_requests_for_process( entryp->procp );
2320 		if (active_requests < 1) {
2322 			 * no active aio requests for this process, continue exiting.  In this
2323 			 * case, there should be no one else waiting on the proc in AIO...
2325 			wakeup_one((caddr_t)&entryp->procp->AIO_CLEANUP_SLEEP_CHAN);
2326 			aio_proc_unlock(entryp->procp);
2328 			KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
2329 			    (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
2331 			aio_proc_unlock(entryp->procp);
2335 	if ((entryp->flags & AIO_CLOSE_WAIT) != 0) {
2336 		int active_requests;
2338 		KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
2339 		    (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
2341 		aio_proc_lock_spin(entryp->procp);
2342 		active_requests = aio_proc_active_requests_for_file( entryp->procp, entryp->aiocb.aio_fildes);
2343 		if (active_requests < 1) {
2344 			/* Can't wakeup_one(); multiple closes might be in progress. */
2345 			wakeup(&entryp->procp->AIO_CLEANUP_SLEEP_CHAN);
2346 			aio_proc_unlock(entryp->procp);
2348 			KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
2349 			    (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
2351 			aio_proc_unlock(entryp->procp);
2355 	 * A thread in aio_suspend() wants to know about completed IOs.  If it checked
2356 	 * the done list before we moved our AIO there, then it already asserted its wait,
2357 	 * and we can wake it up without holding the lock.  If it checked the list after
2358 	 * we did our move, then it already has seen the AIO that we moved.  Hence, we
2359 	 * can do our wakeup without holding the lock.
2361 	wakeup((caddr_t) &entryp->procp->AIO_SUSPEND_SLEEP_CHAN);
2362 	KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE,
2363 	    (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
2366 	 * free the LIO context if the last lio completed and no thread is
2369 	if (lastLioCompleted && (waiter == 0)) {
2370 		free_lio_context(lio_context);
2372 } /* do_aio_completion */
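/*
 * Illustrative user-space sketch (not part of this file): requesting the
 * SIGEV_SIGNAL completion path handled above, so the kernel delivers a
 * signal to the process when the request finishes.  The signal number and
 * file name are arbitrary example choices.
 */
#if 0	/* example only; never compiled as part of this file */
#include <aio.h>
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static volatile sig_atomic_t aio_done;

static void
on_aio_signal(int signo)
{
	(void)signo;
	aio_done = 1;			/* async-signal-safe: just record the event */
}

int
main(void)
{
	static char buf[1024];
	struct aiocb cb;
	struct sigaction sa;
	sigset_t block_set, wait_mask;
	int fd = open("input.dat", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = on_aio_signal;
	sigaction(SIGUSR1, &sa, NULL);

	/* block SIGUSR1 so the completion signal cannot race our wait loop */
	sigemptyset(&block_set);
	sigaddset(&block_set, SIGUSR1);
	sigprocmask(SIG_BLOCK, &block_set, &wait_mask);

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_buf = buf;
	cb.aio_nbytes = sizeof(buf);
	cb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;	/* ask for a signal on completion */
	cb.aio_sigevent.sigev_signo = SIGUSR1;

	if (aio_read(&cb) != 0) {
		perror("aio_read");
		return 1;
	}
	while (!aio_done) {
		sigsuspend(&wait_mask);	/* atomically unblock SIGUSR1 and wait */
	}
	printf("read %zd bytes\n", aio_return(&cb));
	close(fd);
	return 0;
}
#endif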
2379 do_aio_read( aio_workq_entry *entryp )
2381 	struct fileproc *fp;
2383 	struct vfs_context context;
2385 	if ((error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp, 0))) {
2388 	if ((fp->f_fglob->fg_flag & FREAD) == 0) {
2389 		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2393 	context.vc_thread = entryp->thread;	/* XXX */
2394 	context.vc_ucred = fp->f_fglob->fg_cred;
2396 	error = dofileread(&context, fp,
2397 	    entryp->aiocb.aio_buf,
2398 	    entryp->aiocb.aio_nbytes,
2399 	    entryp->aiocb.aio_offset, FOF_OFFSET,
2400 	    &entryp->returnval);
2401 	fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2411 do_aio_write( aio_workq_entry *entryp )
2413 	struct fileproc *fp;
2415 	struct vfs_context context;
2417 	if ((error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp, 0))) {
2420 	if ((fp->f_fglob->fg_flag & FWRITE) == 0) {
2421 		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2426 	if ((fp->f_fglob->fg_flag & O_APPEND) == 0) {
2427 		flags |= FOF_OFFSET;
2430 	context.vc_thread = entryp->thread;	/* XXX */
2431 	context.vc_ucred = fp->f_fglob->fg_cred;
2433 	/* NB: tell dofilewrite the offset, and to use the proc cred */
2434 	error = dofilewrite(&context,
2436 	    entryp->aiocb.aio_buf,
2437 	    entryp->aiocb.aio_nbytes,
2438 	    entryp->aiocb.aio_offset,
2440 	    &entryp->returnval);
2442 	if (entryp->returnval) {
2443 		fp_drop_written(entryp->procp, entryp->aiocb.aio_fildes, fp);
2445 		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2449 } /* do_aio_write */
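/*
 * Illustrative user-space sketch (not part of this file): an aio_write()
 * followed by aio_suspend() and aio_return(), matching the worker read/write
 * paths above.  The aiocb's offset is honored only when the descriptor was
 * not opened O_APPEND, mirroring the FOF_OFFSET handling in do_aio_write().
 * The file name is an arbitrary example value.
 */
#if 0	/* example only; never compiled as part of this file */
#include <aio.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	static char msg[] = "hello from POSIX AIO\n";
	struct aiocb cb;
	const struct aiocb *list[1];
	int err;
	int fd = open("output.dat", O_WRONLY | O_CREAT, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_buf = msg;
	cb.aio_nbytes = sizeof(msg) - 1;
	cb.aio_offset = 0;			/* used because O_APPEND is not set */

	if (aio_write(&cb) != 0) {
		perror("aio_write");
		return 1;
	}

	list[0] = &cb;
	if (aio_suspend(list, 1, NULL) != 0) {	/* block until the write completes */
		perror("aio_suspend");
		return 1;
	}

	err = aio_error(&cb);			/* check status before retiring it */
	if (err == 0) {
		printf("wrote %zd bytes\n", aio_return(&cb));
	} else {
		fprintf(stderr, "aio_write failed: %s\n", strerror(err));
	}
	close(fd);
	return 0;
}
#endif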
2453 * aio_active_requests_for_process - return number of active async IO
2454 * requests for the given process.
2457 aio_active_requests_for_process(proc_t procp)
2459 	return procp->p_aio_active_count;
2460 } /* aio_active_requests_for_process */
2463 * Called with the proc locked.
2466 aio_proc_active_requests_for_file(proc_t procp, int fd)
2469 	aio_workq_entry *entryp;
2470 	TAILQ_FOREACH(entryp, &procp->p_aio_activeq, aio_proc_link) {
2471 		if (entryp->aiocb.aio_fildes == fd) {
2477 } /* aio_proc_active_requests_for_file */
2485 do_aio_fsync( aio_workq_entry *entryp )
2487 	struct vfs_context context;
2489 	struct fileproc *fp;
2494 	 * We are never called unless either AIO_FSYNC or AIO_DSYNC are set.
2496 	 * If AIO_DSYNC is set, we can tell the lower layers that it is OK
2497 	 * to mark for update the metadata not strictly necessary for data
2498 	 * retrieval, rather than forcing it to disk.
2500 	 * If AIO_FSYNC is set, we also have to wait until metadata not really
2501 	 * necessary for data retrieval is committed to stable storage (e.g.
2502 	 * atime, mtime, ctime, etc.).
2504 	 * Metadata necessary for data retrieval must be committed to stable
2505 	 * storage in either case (file length, etc.).
2507 	if (entryp->flags & AIO_FSYNC) {
2508 		sync_flag = MNT_WAIT;
2510 		sync_flag = MNT_DWAIT;
2513 	error = fp_getfvp( entryp->procp, entryp->aiocb.aio_fildes, &fp, &vp);
2515 	if ((error = vnode_getwithref(vp))) {
2516 		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2517 		entryp->returnval = -1;
2520 	context.vc_thread = current_thread();
2521 	context.vc_ucred = fp->f_fglob->fg_cred;
2523 	error = VNOP_FSYNC( vp, sync_flag, &context);
2525 	(void)vnode_put(vp);
2527 	fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2530 		entryp->returnval = -1;
2534 } /* do_aio_fsync */
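/*
 * Illustrative user-space sketch (not part of this file): the two aio_fsync()
 * flavors that reach this routine -- O_SYNC corresponds to the
 * AIO_FSYNC/MNT_WAIT case, O_DSYNC to the AIO_DSYNC/MNT_DWAIT case.  The file
 * name and the simple polling wait are arbitrary example choices.
 */
#if 0	/* example only; never compiled as part of this file */
#include <aio.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int
sync_async(int fd, int op)			/* op is O_SYNC or O_DSYNC */
{
	struct aiocb cb;

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;

	if (aio_fsync(op, &cb) != 0) {
		return -1;
	}
	while (aio_error(&cb) == EINPROGRESS) {
		usleep(1000);			/* simple polling wait */
	}
	return (int)aio_return(&cb);
}

int
main(void)
{
	int fd = open("output.dat", O_WRONLY | O_CREAT, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	(void)sync_async(fd, O_DSYNC);	/* data plus metadata needed to read it back */
	(void)sync_async(fd, O_SYNC);	/* additionally flushes timestamps and the like */
	close(fd);
	return 0;
}
#endif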
2538 * is_already_queued - runs through our queues to see if the given
2539 * aiocbp / process is there.  Returns TRUE if there is a match
2540 * on any of our aio queues.
2542 * Called with proc aio lock held (can be held spin)
2545 is_already_queued(proc_t procp,
2546     user_addr_t aiocbp)
2548 	aio_workq_entry *entryp;
2553 	/* look for matches on our queue of async IO requests that have completed */
2554 	TAILQ_FOREACH( entryp, &procp->p_aio_doneq, aio_proc_link) {
2555 		if (aiocbp == entryp->uaiocbp) {
2557 			goto ExitThisRoutine;
2561 	/* look for matches on our queue of active async IO requests */
2562 	TAILQ_FOREACH( entryp, &procp->p_aio_activeq, aio_proc_link) {
2563 		if (aiocbp == entryp->uaiocbp) {
2565 			goto ExitThisRoutine;
2571 } /* is_already_queued */
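/*
 * Illustrative user-space sketch (not part of this file): because of the
 * duplicate check above, an aiocb should not be resubmitted while a prior
 * request using the same control block is still active or waiting to be
 * collected with aio_return().  Both helpers below assume 'cb' was already
 * submitted once with aio_read().
 */
#if 0	/* example only; never compiled as part of this file */
#include <aio.h>
#include <errno.h>
#include <stdio.h>

static void
resubmit_too_early(struct aiocb *cb)
{
	if (aio_read(cb) != 0) {
		/* a duplicate control block is rejected with an error */
		perror("aio_read (duplicate aiocb)");
	}
}

/* Safe pattern: retire the previous request before reusing the aiocb. */
static void
resubmit_safely(struct aiocb *cb)
{
	const struct aiocb *list[1] = { cb };

	while (aio_error(cb) == EINPROGRESS) {
		aio_suspend(list, 1, NULL);	/* wait without busy polling */
	}
	(void)aio_return(cb);			/* retire the old request */
	if (aio_read(cb) != 0) {		/* now the aiocb may be reused */
		perror("aio_read");
	}
}
#endif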
2575 free_lio_context(aio_lio_context* context)
2578 	OSDecrementAtomic(&lio_contexts_alloced);
2581 	FREE( context, M_TEMP );
2582 } /* free_lio_context */
2586 * aio initialization
2588 __private_extern__ void
2593 	aio_lock_grp_attr = lck_grp_attr_alloc_init();
2594 	aio_proc_lock_grp = lck_grp_alloc_init("aio_proc", aio_lock_grp_attr);
2595 	aio_entry_lock_grp = lck_grp_alloc_init("aio_entry", aio_lock_grp_attr);
2596 	aio_queue_lock_grp = lck_grp_alloc_init("aio_queue", aio_lock_grp_attr);
2597 	aio_lock_attr = lck_attr_alloc_init();
2599 	lck_mtx_init(&aio_entry_mtx, aio_entry_lock_grp, aio_lock_attr);
2600 	lck_mtx_init(&aio_proc_mtx, aio_proc_lock_grp, aio_lock_attr);
2602 	aio_anchor.aio_inflight_count = 0;
2603 	aio_anchor.aio_done_count = 0;
2604 	aio_anchor.aio_total_count = 0;
2605 	aio_anchor.aio_num_workqs = AIO_NUM_WORK_QUEUES;
2607 	for (i = 0; i < AIO_NUM_WORK_QUEUES; i++) {
2608 		aio_workq_init(&aio_anchor.aio_async_workqs[i]);
2612 	i = sizeof(aio_workq_entry);
2613 	aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );
2615 	_aio_create_worker_threads( aio_worker_threads );
2620 * aio worker threads created here.
2622 __private_extern__ void
2623 _aio_create_worker_threads( int num )
2627 	/* create some worker threads to handle the async IO requests */
2628 	for (i = 0; i < num; i++) {
2631 		if (KERN_SUCCESS != kernel_thread_start((thread_continue_t)aio_work_thread, NULL, &myThread)) {
2632 			printf( "%s - failed to create a work thread \n", __FUNCTION__ );
2634 			thread_deallocate(myThread);
2639 } /* _aio_create_worker_threads */
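/*
 * Illustrative user-space sketch (not part of this file): the tunables used
 * during initialization (aio_max_requests, aio_max_requests_per_process,
 * aio_worker_threads) are conventionally surfaced through sysctl.  The names
 * below (kern.aiomax, kern.aioprocmax, kern.aiothreads) are the commonly
 * used ones and should be treated as an assumption, not something this file
 * guarantees.
 */
#if 0	/* example only; never compiled as part of this file */
#include <stdio.h>
#include <sys/sysctl.h>
#include <sys/types.h>

static void
show(const char *name)
{
	int value = 0;
	size_t len = sizeof(value);

	if (sysctlbyname(name, &value, &len, NULL, 0) == 0) {
		printf("%-18s %d\n", name, value);
	} else {
		perror(name);
	}
}

int
main(void)
{
	show("kern.aiomax");		/* assumed system-wide request limit */
	show("kern.aioprocmax");	/* assumed per-process request limit */
	show("kern.aiothreads");	/* assumed number of AIO worker threads */
	return 0;
}
#endif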
2642 * Return the current activation utask
2647 	return ((struct uthread *)get_bsdthread_info(current_thread()))->uu_aio_task;
2652 * In the case of an aiocb from a
2653 * 32-bit process we need to expand some longs and pointers to the correct
2654 * sizes in order to let downstream code always work on the same type of
2655 * aiocb (in our case that is a user_aiocb)
2658 do_munge_aiocb_user32_to_user( struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
2660 	the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2661 	the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2662 	the_user_aiocbp->aio_buf = CAST_USER_ADDR_T(my_aiocbp->aio_buf);
2663 	the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2664 	the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2665 	the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
2667 	/* special case here.  Since we do not know if sigev_value is an */
2668 	/* int or a ptr we do NOT cast the ptr to a user_addr_t.  This */
2669 	/* means if we send this info back to user space we need to remember */
2670 	/* sigev_value was not expanded for the 32-bit case. */
2671 	/* NOTE - this does NOT affect us since we don't support sigev_value */
2672 	/* yet in the aio context. */
2674 	the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2675 	the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
2676 	the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2677 	    my_aiocbp->aio_sigevent.sigev_value.sival_int;
2678 	the_user_aiocbp->aio_sigevent.sigev_notify_function =
2679 	    CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function);
2680 	the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2681 	    CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_attributes);
2684 /* Similar for 64-bit user process, so that we don't need to satisfy
2685 * the alignment constraints of the original user64_aiocb
2688 do_munge_aiocb_user64_to_user( struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
2690 	the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2691 	the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2692 	the_user_aiocbp->aio_buf = my_aiocbp->aio_buf;
2693 	the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2694 	the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2695 	the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
2697 	the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2698 	the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
2699 	the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2700 	    my_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int;
2701 	the_user_aiocbp->aio_sigevent.sigev_notify_function =
2702 	    my_aiocbp->aio_sigevent.sigev_notify_function;
2703 	the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2704 	    my_aiocbp->aio_sigevent.sigev_notify_attributes;
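/*
 * Illustrative sketch (not part of this file): the general "munge" pattern
 * the two routines above implement -- widening 32-bit pointers and sizes from
 * a 32-bit caller's layout into the kernel's wider canonical layout.  The
 * struct names and fields below are hypothetical, chosen only to show the
 * widening; they are not the real user32_aiocb / user_aiocb definitions.
 */
#if 0	/* example only; never compiled as part of this file */
#include <stdint.h>
#include <string.h>

struct example_aiocb32 {		/* hypothetical 32-bit wire layout */
	int32_t  fildes;
	uint32_t buf;			/* 32-bit user pointer */
	uint32_t nbytes;		/* 32-bit size */
};

struct example_aiocb_wide {		/* hypothetical canonical wide layout */
	int32_t  fildes;
	uint64_t buf;			/* widened to a 64-bit user address */
	uint64_t nbytes;
};

static void
munge_example(const struct example_aiocb32 *in, struct example_aiocb_wide *out)
{
	memset(out, 0, sizeof(*out));
	out->fildes = in->fildes;
	out->buf = (uint64_t)in->buf;		/* zero-extend the 32-bit pointer */
	out->nbytes = (uint64_t)in->nbytes;	/* zero-extend the 32-bit size */
}
#endif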