/*
 * Copyright (c) 2003-2016 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * 1) ramesh is looking into how to replace taking a reference on
 *    the user's map (vm_map_reference()) since it is believed that
 *    would not hold the process for us.
 * 2) david is looking into a way for us to set the priority of the
 *    worker threads to match that of the user's thread when the
 *    async IO was queued.
 */

/*
 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
 */
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file_internal.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/vnode_internal.h>
#include <sys/malloc.h>
#include <sys/mount_internal.h>
#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/sysctl.h>
#include <sys/unistd.h>

#include <sys/aio_kern.h>
#include <sys/sysproto.h>

#include <machine/limits.h>

#include <mach/mach_types.h>
#include <kern/kern_types.h>
#include <kern/waitq.h>
#include <kern/zalloc.h>
#include <kern/task.h>
#include <kern/sched_prim.h>

#include <vm/vm_map.h>

#include <libkern/OSAtomic.h>

#include <sys/kdebug.h>
#define AIO_work_queued			1
#define AIO_worker_wake			2
#define AIO_completion_sig		3
#define AIO_completion_cleanup_wait	4
#define AIO_completion_cleanup_wake	5
#define AIO_completion_suspend_wake	6
#define AIO_fsync_delay			7
#define AIO_cancel_async_workq		11
#define AIO_cancel_sync_workq		12
#define AIO_cancel_activeq		13
#define AIO_cancel_doneq		14
#define AIO_error_val			61
#define AIO_error_activeq		62
#define AIO_error_workq			63
#define AIO_return_val			71
#define AIO_return_activeq		72
#define AIO_return_workq		73
#define AIO_exit_sleep			91
#define AIO_close			100
#define AIO_close_sleep			101
#define AIO_suspend			110
#define AIO_suspend_sleep		111
#define AIO_worker_thread		120

#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
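/*
 * These constants are only used as kdebug trace codes; each call site below
 * combines one of them with BSDDBG_CODE(DBG_BSD_AIO, ...) and a function
 * qualifier, for example:
 *
 *	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
 *		     (int)procp, (int)aiocbp, 0, 0, 0 );
 *
 * so the requests show up as BSD/AIO events in a kdebug trace.
 */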
/*
 * aio requests queue up on the aio_async_workq or lio_sync_workq (for
 * lio_listio LIO_WAIT).  Requests then move to the per-process aio_activeq
 * (proc.aio_activeq) when one of our worker threads starts the IO.
 * And finally, requests move to the per-process aio_doneq (proc.aio_doneq)
 * when the IO request completes.  The request remains on aio_doneq until
 * the user process calls aio_return or the process exits; either way, that
 * is our trigger to release aio resources.
 */
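/*
 * In short, a request moves through three stages:
 *
 *	aio_async_workq  --(worker thread picks it up)-->  proc.p_aio_activeq
 *	                 --(IO completes)-->               proc.p_aio_doneq
 *	                 --(aio_return / process exit)-->  freed
 *
 * and aio_anchor below keeps the global counts that mirror these transitions
 * (aio_inflight_count, aio_done_count, aio_total_count).
 */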
typedef struct aio_workq {
	TAILQ_HEAD(, aio_workq_entry)	aioq_entries;
	int			aioq_count;
	lck_mtx_t		aioq_mtx;
	struct waitq		aioq_waitq;
} *aio_workq_t;
#define AIO_NUM_WORK_QUEUES 1

struct aio_anchor_cb
{
	volatile int32_t	aio_inflight_count;	/* entries that have been taken from a workq */
	volatile int32_t	aio_done_count;		/* entries on all done queues (proc.aio_doneq) */
	volatile int32_t	aio_total_count;	/* total extant entries */

	/* Hash table of queues here */
	struct aio_workq	aio_async_workqs[AIO_NUM_WORK_QUEUES];
};
typedef struct aio_anchor_cb aio_anchor_cb;
struct aio_lio_context
{
	int		io_waiter;
	int		io_issued;
	int		io_completed;
};
typedef struct aio_lio_context aio_lio_context;
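/*
 * As used below: io_issued is set to the number of aiocbs submitted by a
 * lio_listio() call and decremented for slots that fail to queue;
 * io_completed is advanced by do_aio_completion() as each request in the
 * group finishes; io_waiter is nonzero while a LIO_WAIT caller is sleeping
 * on the context.  When io_completed catches up with io_issued the last
 * completion wakes that waiter, and the context is marked to be freed by
 * whichever side finishes last.
 */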
/*
 * Notes on aio sleep / wake channels.
 * We currently pick a couple of fields within the proc structure that give us
 * sleep channels that do not collide with any other kernel routines.
 * At this time, for binary compatibility reasons, we cannot create new proc fields.
 */
#define AIO_SUSPEND_SLEEP_CHAN	p_aio_active_count
#define AIO_CLEANUP_SLEEP_CHAN	p_aio_total_count
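/*
 * For example, _aio_exit() below sleeps with
 *	msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_exit", 0);
 * and aio_suspend_nocancel() sleeps on &p->AIO_SUSPEND_SLEEP_CHAN; the
 * completion path is expected to wakeup() those addresses when the relevant
 * per-process counts drop.
 */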
#define ASSERT_AIO_FROM_PROC(aiop, theproc)	\
	if ((aiop)->procp != (theproc)) {	\
		panic("AIO on a proc list that does not belong to that proc.\n"); \
	}
/*
 * LOCAL PROTOTYPES
 */
static void		aio_proc_lock(proc_t procp);
static void		aio_proc_lock_spin(proc_t procp);
static void		aio_proc_unlock(proc_t procp);
static lck_mtx_t*	aio_proc_mutex(proc_t procp);
static void		aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp);
static void		aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp);
static int		aio_get_process_count(proc_t procp);
static int		aio_active_requests_for_process(proc_t procp);
static int		aio_proc_active_requests_for_file(proc_t procp, int fd);
static boolean_t	is_already_queued(proc_t procp, user_addr_t aiocbp);
static boolean_t	should_cancel(aio_workq_entry *entryp, user_addr_t aiocbp, int fd);

static void		aio_entry_lock(aio_workq_entry *entryp);
static void		aio_entry_lock_spin(aio_workq_entry *entryp);
static aio_workq_t	aio_entry_workq(aio_workq_entry *entryp);
static lck_mtx_t*	aio_entry_mutex(__unused aio_workq_entry *entryp);
static void		aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
static void		aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
static void		aio_entry_ref_locked(aio_workq_entry *entryp);
static void		aio_entry_unref_locked(aio_workq_entry *entryp);
static void		aio_entry_ref(aio_workq_entry *entryp);
static void		aio_entry_unref(aio_workq_entry *entryp);
static void		aio_entry_update_for_cancel(aio_workq_entry *entryp, boolean_t cancelled,
					int wait_for_completion, boolean_t disable_notification);
static int		aio_entry_try_workq_remove(aio_workq_entry *entryp);
static boolean_t	aio_delay_fsync_request( aio_workq_entry *entryp );
static int		aio_free_request(aio_workq_entry *entryp);

static void		aio_workq_init(aio_workq_t wq);
static void		aio_workq_lock_spin(aio_workq_t wq);
static void		aio_workq_unlock(aio_workq_t wq);
static lck_mtx_t*	aio_workq_mutex(aio_workq_t wq);

static void		aio_work_thread( void );
static aio_workq_entry	*aio_get_some_work( void );

static int		aio_get_all_queues_count( void );
static int		aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO );
static int		aio_validate( aio_workq_entry *entryp );
static int		aio_increment_total_count(void);
static int		aio_decrement_total_count(void);

static int		do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp, int wait_for_completion, boolean_t disable_notification);
static void		do_aio_completion( aio_workq_entry *entryp );
static int		do_aio_fsync( aio_workq_entry *entryp );
static int		do_aio_read( aio_workq_entry *entryp );
static int		do_aio_write( aio_workq_entry *entryp );
static void		do_munge_aiocb_user32_to_user( struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
static void		do_munge_aiocb_user64_to_user( struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
static int		lio_create_entry(proc_t procp,
					 user_addr_t aiocbp,
					 void *group_tag,
					 aio_workq_entry **entrypp );

static aio_workq_entry	*aio_create_queue_entry(proc_t procp,
					 user_addr_t aiocbp,
					 void *group_tag,
					 int kindOfIO);
static user_addr_t	*aio_copy_in_list(proc_t procp, user_addr_t aiocblist, int nent);
static void		free_lio_context(aio_lio_context* context);
static void		aio_enqueue_work( proc_t procp, aio_workq_entry *entryp, int proc_locked);

#define ASSERT_AIO_PROC_LOCK_OWNED(p)	lck_mtx_assert(aio_proc_mutex((p)), LCK_MTX_ASSERT_OWNED)
#define ASSERT_AIO_WORKQ_LOCK_OWNED(q)	lck_mtx_assert(aio_workq_mutex((q)), LCK_MTX_ASSERT_OWNED)
#define ASSERT_AIO_ENTRY_LOCK_OWNED(e)	lck_mtx_assert(aio_entry_mutex((e)), LCK_MTX_ASSERT_OWNED)
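/*
 * Locking note (as used in this file): the per-process AIO lock is simply the
 * proc's own p_mlock (see aio_proc_mutex()), each work queue has its own
 * aioq_mtx, and all entries currently share the single global aio_entry_mtx.
 * Code that holds the proc lock may take a work queue or entry lock on top of
 * it (do_aio_cancel_locked() does both), and the asserts above are sprinkled
 * through the helpers to document which lock each one expects to be held.
 */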
/*
 * EXTERNAL PROTOTYPES
 */

/* in ...bsd/kern/sys_generic.c */
extern int dofileread(vfs_context_t ctx, struct fileproc *fp,
		      user_addr_t bufp, user_size_t nbyte,
		      off_t offset, int flags, user_ssize_t *retval );
extern int dofilewrite(vfs_context_t ctx, struct fileproc *fp,
		       user_addr_t bufp, user_size_t nbyte, off_t offset,
		       int flags, user_ssize_t *retval );
static uint32_t lio_contexts_alloced = 0;

/*
 * aio external global variables.
 */
extern int aio_max_requests;			/* AIO_MAX - configurable */
extern int aio_max_requests_per_process;	/* AIO_PROCESS_MAX - configurable */
extern int aio_worker_threads;			/* AIO_THREAD_COUNT - configurable */

/*
 * aio static variables.
 */
static aio_anchor_cb	aio_anchor;
static lck_grp_t	*aio_proc_lock_grp;
static lck_grp_t	*aio_entry_lock_grp;
static lck_grp_t	*aio_queue_lock_grp;
static lck_attr_t	*aio_lock_attr;
static lck_grp_attr_t	*aio_lock_grp_attr;
static struct zone	*aio_workq_zonep;
static lck_mtx_t	aio_entry_mtx;
static lck_mtx_t	aio_proc_mtx;
static void
aio_entry_lock(__unused aio_workq_entry *entryp)
{
	lck_mtx_lock(&aio_entry_mtx);
}

static void
aio_entry_lock_spin(__unused aio_workq_entry *entryp)
{
	lck_mtx_lock_spin(&aio_entry_mtx);
}

void
aio_entry_unlock(__unused aio_workq_entry *entryp)
{
	lck_mtx_unlock(&aio_entry_mtx);
}

static aio_workq_t
aio_entry_workq(__unused aio_workq_entry *entryp)
{
	return &aio_anchor.aio_async_workqs[0];
}

static lck_mtx_t*
aio_entry_mutex(__unused aio_workq_entry *entryp)
{
	return &aio_entry_mtx;
}
static void
aio_workq_init(aio_workq_t wq)
{
	TAILQ_INIT(&wq->aioq_entries);
	lck_mtx_init(&wq->aioq_mtx, aio_queue_lock_grp, aio_lock_attr);
	waitq_init(&wq->aioq_waitq, SYNC_POLICY_FIFO);
}
/*
 * Can be passed a queue which is locked spin.
 */
static void
aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
{
	ASSERT_AIO_WORKQ_LOCK_OWNED(queue);

	if (entryp->aio_workq_link.tqe_prev == NULL) {
		panic("Trying to remove an entry from a work queue, but it is not on a queue\n");
	}

	TAILQ_REMOVE(&queue->aioq_entries, entryp, aio_workq_link);
	entryp->aio_workq_link.tqe_prev = NULL; /* Not on a workq */

	if (queue->aioq_count < 0) {
		panic("Negative count on a queue.\n");
	}
}
static void
aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
{
	ASSERT_AIO_WORKQ_LOCK_OWNED(queue);

	TAILQ_INSERT_TAIL(&queue->aioq_entries, entryp, aio_workq_link);
	if (queue->aioq_count < 0) {
		panic("Negative count on a queue.\n");
	}
}
static void
aio_proc_lock(proc_t procp)
{
	lck_mtx_lock(aio_proc_mutex(procp));
}

static void
aio_proc_lock_spin(proc_t procp)
{
	lck_mtx_lock_spin(aio_proc_mutex(procp));
}
static void
aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp)
{
	ASSERT_AIO_PROC_LOCK_OWNED(procp);

	TAILQ_REMOVE(&procp->p_aio_activeq, entryp, aio_proc_link );
	TAILQ_INSERT_TAIL( &procp->p_aio_doneq, entryp, aio_proc_link);
	procp->p_aio_active_count--;
	OSIncrementAtomic(&aio_anchor.aio_done_count);
}
static void
aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp)
{
	TAILQ_REMOVE(&procp->p_aio_doneq, entryp, aio_proc_link);
	OSDecrementAtomic(&aio_anchor.aio_done_count);
	aio_decrement_total_count();
	procp->p_aio_total_count--;
}
static void
aio_proc_unlock(proc_t procp)
{
	lck_mtx_unlock(aio_proc_mutex(procp));
}

static lck_mtx_t*
aio_proc_mutex(proc_t procp)
{
	return &procp->p_mlock;
}
static void
aio_entry_ref_locked(aio_workq_entry *entryp)
{
	ASSERT_AIO_ENTRY_LOCK_OWNED(entryp);

	if (entryp->aio_refcount < 0) {
		panic("AIO workq entry with a negative refcount.\n");
	}
	entryp->aio_refcount++;
}
/* Return 1 if you've freed it */
static void
aio_entry_unref_locked(aio_workq_entry *entryp)
{
	ASSERT_AIO_ENTRY_LOCK_OWNED(entryp);

	entryp->aio_refcount--;
	if (entryp->aio_refcount < 0) {
		panic("AIO workq entry with a negative refcount.\n");
	}
}
static void
aio_entry_ref(aio_workq_entry *entryp)
{
	aio_entry_lock_spin(entryp);
	aio_entry_ref_locked(entryp);
	aio_entry_unlock(entryp);
}

static void
aio_entry_unref(aio_workq_entry *entryp)
{
	aio_entry_lock_spin(entryp);
	aio_entry_unref_locked(entryp);

	if ((entryp->aio_refcount == 0) && ((entryp->flags & AIO_DO_FREE) != 0)) {
		aio_entry_unlock(entryp);
		aio_free_request(entryp);
	} else {
		aio_entry_unlock(entryp);
	}
}
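/*
 * Reference counting convention, as it appears in this file: an entry starts
 * with aio_refcount == 0; a worker or canceller that needs the entry to
 * survive while it is off the queues takes a reference (aio_entry_ref()), and
 * the final unref only frees the entry if AIO_DO_FREE has been set.
 * aio_return() and _aio_exit() set AIO_DO_FREE when they find a nonzero
 * refcount, so whoever drops the last reference performs the free.
 */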
static void
aio_entry_update_for_cancel(aio_workq_entry *entryp, boolean_t cancelled, int wait_for_completion, boolean_t disable_notification)
{
	aio_entry_lock_spin(entryp);

	if (cancelled) {
		aio_entry_ref_locked(entryp);
		entryp->errorval = ECANCELED;
		entryp->returnval = -1;
	}

	if ( wait_for_completion ) {
		entryp->flags |= wait_for_completion; /* flag for special completion processing */
	}

	if ( disable_notification ) {
		entryp->flags |= AIO_DISABLE; /* Don't want a signal */
	}

	aio_entry_unlock(entryp);
}
static int
aio_entry_try_workq_remove(aio_workq_entry *entryp)
{
	/* Can only be cancelled if it's still on a work queue */
	if (entryp->aio_workq_link.tqe_prev != NULL) {
		aio_workq_t queue;

		/* Will have to check again under the lock */
		queue = aio_entry_workq(entryp);
		aio_workq_lock_spin(queue);
		if (entryp->aio_workq_link.tqe_prev != NULL) {
			aio_workq_remove_entry_locked(queue, entryp);
			aio_workq_unlock(queue);
			return 1;
		} else {
			aio_workq_unlock(queue);
		}
	}

	return 0;
}
static void
aio_workq_lock_spin(aio_workq_t wq)
{
	lck_mtx_lock_spin(aio_workq_mutex(wq));
}

static void
aio_workq_unlock(aio_workq_t wq)
{
	lck_mtx_unlock(aio_workq_mutex(wq));
}

static lck_mtx_t*
aio_workq_mutex(aio_workq_t wq)
{
	return &wq->aioq_mtx;
}
/*
 * aio_cancel - attempt to cancel one or more async IO requests currently
 * outstanding against file descriptor uap->fd.  If uap->aiocbp is not
 * NULL then only one specific IO is cancelled (if possible).  If uap->aiocbp
 * is NULL then all outstanding async IO requests for the given file
 * descriptor are cancelled (if possible).
 */
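/*
 * Illustrative user-space view (not part of this file): a caller typically
 * issues something like
 *
 *	struct aiocb cb = { .aio_fildes = fd, ... };
 *	aio_read(&cb);
 *	...
 *	int r = aio_cancel(fd, &cb);   // or aio_cancel(fd, NULL) for all IOs on fd
 *
 * and r comes back as AIO_CANCELED, AIO_NOTCANCELED, or AIO_ALLDONE, mirroring
 * the results computed by do_aio_cancel_locked() below.
 */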
int
aio_cancel(proc_t p, struct aio_cancel_args *uap, int *retval )
{
	struct user_aiocb	my_aiocb;
	int			result;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START,
		     (int)p, (int)uap->aiocbp, 0, 0, 0 );

	/* quick check to see if there are any async IO requests queued up */
	if (aio_get_all_queues_count() < 1) {
		*retval = AIO_ALLDONE;
	}

	if ( uap->aiocbp != USER_ADDR_NULL ) {
		if ( proc_is64bit(p) ) {
			struct user64_aiocb aiocb64;

			result = copyin( uap->aiocbp, &aiocb64, sizeof(aiocb64) );
			do_munge_aiocb_user64_to_user(&aiocb64, &my_aiocb);
		} else {
			struct user32_aiocb aiocb32;

			result = copyin( uap->aiocbp, &aiocb32, sizeof(aiocb32) );
			do_munge_aiocb_user32_to_user( &aiocb32, &my_aiocb );
		}

		/*
		 * NOTE - POSIX standard says a mismatch between the file
		 * descriptor passed in and the file descriptor embedded in
		 * the aiocb causes unspecified results.  We return EBADF in
		 * that situation.
		 */
		if ( uap->fd != my_aiocb.aio_fildes ) {
		}
	}

	result = do_aio_cancel_locked( p, uap->fd, uap->aiocbp, 0, FALSE );
	ASSERT_AIO_PROC_LOCK_OWNED(p);

	if ( result != -1 ) {
	}

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END,
		     (int)p, (int)uap->aiocbp, result, 0, 0 );
}
/*
 * _aio_close - internal function used to clean up async IO requests for
 * a file descriptor that is closing.
 */
__private_extern__ void
_aio_close(proc_t p, int fd )
{
	int		error;

	/* quick check to see if there are any async IO requests queued up */
	if (aio_get_all_queues_count() < 1) {
		return;
	}

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_START,
		     (int)p, fd, 0, 0, 0 );

	/* cancel all async IO requests on our todo queues for this file descriptor */
	error = do_aio_cancel_locked( p, fd, 0, AIO_CLOSE_WAIT, FALSE );
	ASSERT_AIO_PROC_LOCK_OWNED(p);
	if ( error == AIO_NOTCANCELED ) {
		/*
		 * AIO_NOTCANCELED is returned when we find an aio request for this process
		 * and file descriptor on the active async IO queue.  Active requests cannot
		 * be cancelled so we must wait for them to complete.  We will get a special
		 * wake up call on our channel used to sleep for ALL active requests to
		 * complete.  This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
		 * when we must wait for all active aio requests.
		 */

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) | DBG_FUNC_NONE,
			     (int)p, fd, 0, 0, 0 );

		while (aio_proc_active_requests_for_file(p, fd) > 0) {
			msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_close", 0 );
		}
	}

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_END,
		     (int)p, fd, 0, 0, 0 );
}
/*
 * aio_error - return the error status associated with the async IO
 * request referred to by uap->aiocbp.  The error status is the errno
 * value that would be set by the corresponding IO request (read, write,
 * fdatasync, or sync).
 */
int
aio_error(proc_t p, struct aio_error_args *uap, int *retval )
{
	aio_workq_entry	*entryp;
	int		error;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_START,
		     (int)p, (int)uap->aiocbp, 0, 0, 0 );

	/* see if there are any aios to check */
	if (aio_get_all_queues_count() < 1) {
	}

	/* look for a match on our queue of async IO requests that have completed */
	TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
		if ( entryp->uaiocbp == uap->aiocbp ) {
			ASSERT_AIO_FROM_PROC(entryp, p);

			aio_entry_lock_spin(entryp);
			*retval = entryp->errorval;
			aio_entry_unlock(entryp);
			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) | DBG_FUNC_NONE,
				     (int)p, (int)uap->aiocbp, *retval, 0, 0 );
		}
	}

	/* look for a match on our queue of active async IO requests */
	TAILQ_FOREACH( entryp, &p->p_aio_activeq, aio_proc_link) {
		if ( entryp->uaiocbp == uap->aiocbp ) {
			ASSERT_AIO_FROM_PROC(entryp, p);
			*retval = EINPROGRESS;
			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) | DBG_FUNC_NONE,
				     (int)p, (int)uap->aiocbp, *retval, 0, 0 );
		}
	}

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_END,
		     (int)p, (int)uap->aiocbp, error, 0, 0 );
}
/*
 * aio_fsync - asynchronously force all IO operations associated
 * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
 * queued at the time of the call to the synchronized completion state.
 * NOTE - we do not support op O_DSYNC at this point since we do not support the
 * fdatasync() call.
 */
int
aio_fsync(proc_t p, struct aio_fsync_args *uap, int *retval )
{
	int		error;
	int		fsync_kind;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_START,
		     (int)p, (int)uap->aiocbp, uap->op, 0, 0 );

	/* 0 := O_SYNC for binary backward compatibility with Panther */
	if (uap->op == O_SYNC || uap->op == 0)
		fsync_kind = AIO_FSYNC;
	else if ( uap->op == O_DSYNC )
		fsync_kind = AIO_DSYNC;

	error = aio_queue_async_request( p, uap->aiocbp, fsync_kind );

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_END,
		     (int)p, (int)uap->aiocbp, error, 0, 0 );
}
/*
 * aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
 * file descriptor (uap->aiocbp->aio_fildes) into the buffer
 * (uap->aiocbp->aio_buf).
 */
int
aio_read(proc_t p, struct aio_read_args *uap, int *retval )
{
	int		error;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_START,
		     (int)p, (int)uap->aiocbp, 0, 0, 0 );

	error = aio_queue_async_request( p, uap->aiocbp, AIO_READ );

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_END,
		     (int)p, (int)uap->aiocbp, error, 0, 0 );
}
/*
 * aio_return - return the return status associated with the async IO
 * request referred to by uap->aiocbp.  The return status is the value
 * that would be returned by the corresponding IO request (read, write,
 * fdatasync, or sync).  This is where we release kernel resources
 * held for the async IO call associated with the given aiocb pointer.
 */
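/*
 * Note on usage: aio_error() can be polled while a request is in flight (it
 * reports EINPROGRESS from the active queue above), but aio_return() is a
 * one-shot call - once it finds the request on the done queue it pulls the
 * entry off the list and releases it, so a second aio_return() for the same
 * aiocb will not find it.
 */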
int
aio_return(proc_t p, struct aio_return_args *uap, user_ssize_t *retval )
{
	aio_workq_entry	*entryp;
	int		error;
	boolean_t	proc_lock_held = FALSE;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_START,
		     (int)p, (int)uap->aiocbp, 0, 0, 0 );

	/* See if there are any entries to check */
	if (aio_get_all_queues_count() < 1) {
	}

	proc_lock_held = TRUE;

	/* look for a match on our queue of async IO requests that have completed */
	TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
		ASSERT_AIO_FROM_PROC(entryp, p);
		if ( entryp->uaiocbp == uap->aiocbp ) {
			/* Done and valid for aio_return(), pull it off the list */
			aio_proc_remove_done_locked(p, entryp);

			/* Drop the proc lock, but keep the entry locked */
			aio_entry_lock(entryp);
			proc_lock_held = FALSE;

			*retval = entryp->returnval;

			/* No references and off all lists, safe to free */
			if (entryp->aio_refcount == 0) {
				aio_entry_unlock(entryp);
				aio_free_request(entryp);
			} else {
				/* Whoever has the refcount will have to free it */
				entryp->flags |= AIO_DO_FREE;
				aio_entry_unlock(entryp);
			}

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) | DBG_FUNC_NONE,
				     (int)p, (int)uap->aiocbp, *retval, 0, 0 );
		}
	}

	/* look for a match on our queue of active async IO requests */
	TAILQ_FOREACH( entryp, &p->p_aio_activeq, aio_proc_link) {
		ASSERT_AIO_FROM_PROC(entryp, p);
		if ( entryp->uaiocbp == uap->aiocbp ) {
			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) | DBG_FUNC_NONE,
				     (int)p, (int)uap->aiocbp, *retval, 0, 0 );
		}
	}

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_END,
		     (int)p, (int)uap->aiocbp, error, 0, 0 );
}
/*
 * _aio_exec - internal function used to clean up async IO requests for
 * a process that is going away due to exec().  We cancel any async IOs
 * we can and wait for those already active.  We also disable signaling
 * for cancelled or active aio requests that complete.
 * This routine MAY block!
 */
__private_extern__ void
_aio_exec(proc_t p )
{
	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_START,
		     (int)p, 0, 0, 0, 0 );

	_aio_exit( p );

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_END,
		     (int)p, 0, 0, 0, 0 );
}
/*
 * _aio_exit - internal function used to clean up async IO requests for
 * a process that is terminating (via exit() or exec()).  We cancel any async IOs
 * we can and wait for those already active.  We also disable signaling
 * for cancelled or active aio requests that complete.  This routine MAY block!
 */
__private_extern__ void
_aio_exit(proc_t p )
{
	int			error;
	aio_workq_entry		*entryp;

	/* quick check to see if there are any async IO requests queued up */
	if (aio_get_all_queues_count() < 1) {
		return;
	}

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_START,
		     (int)p, 0, 0, 0, 0 );

	/*
	 * cancel async IO requests on the todo work queue and wait for those
	 * already active to complete.
	 */
	error = do_aio_cancel_locked( p, 0, 0, AIO_EXIT_WAIT, TRUE );
	ASSERT_AIO_PROC_LOCK_OWNED(p);
	if ( error == AIO_NOTCANCELED ) {
		/*
		 * AIO_NOTCANCELED is returned when we find an aio request for this process
		 * on the active async IO queue.  Active requests cannot be cancelled so we
		 * must wait for them to complete.  We will get a special wake up call on
		 * our channel used to sleep for ALL active requests to complete.  This sleep
		 * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
		 * active aio requests.
		 */

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) | DBG_FUNC_NONE,
			     (int)p, 0, 0, 0, 0 );

		while (p->p_aio_active_count != 0) {
			msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_exit", 0 );
		}
	}

	if (p->p_aio_active_count != 0) {
		panic("Exiting process has %d active AIOs after cancellation has completed.\n", p->p_aio_active_count);
	}

	/* release all aio resources used by this process */
	entryp = TAILQ_FIRST( &p->p_aio_doneq );
	while ( entryp != NULL ) {
		ASSERT_AIO_FROM_PROC(entryp, p);
		aio_workq_entry		*next_entryp;

		next_entryp = TAILQ_NEXT( entryp, aio_proc_link);
		aio_proc_remove_done_locked(p, entryp);

		/* we cannot free requests that are still completing */
		aio_entry_lock_spin(entryp);
		if (entryp->aio_refcount == 0) {
			aio_entry_unlock(entryp);
			aio_free_request(entryp);

			/* need to start over since aio_doneq may have been */
			/* changed while we were away. */
			entryp = TAILQ_FIRST( &p->p_aio_doneq );
			continue;
		} else {
			/* whoever has the reference will have to do the free */
			entryp->flags |= AIO_DO_FREE;
		}

		aio_entry_unlock(entryp);
		entryp = next_entryp;
	}

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END,
		     (int)p, 0, 0, 0, 0 );
}
static boolean_t
should_cancel(aio_workq_entry *entryp, user_addr_t aiocbp, int fd)
{
	if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
	     (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
	     (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
		return TRUE;
	}

	return FALSE;
}
/*
 * do_aio_cancel_locked - cancel async IO requests (if possible).  We get called by
 * aio_cancel, close, and at exit.
 * There are three modes of operation: 1) cancel all async IOs for a process -
 * fd is 0 and aiocbp is NULL 2) cancel all async IOs for file descriptor - fd
 * is > 0 and aiocbp is NULL 3) cancel one async IO associated with the given
 * aiocbp.
 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
 * target async IO requests, and AIO_ALLDONE if all target async IO requests
 * were already complete.
 * WARNING - do not dereference aiocbp in this routine, it may point to user
 * land data that has not been copied in (when called from aio_cancel())
 *
 * Called with proc locked, and returns the same way.
 */
static int
do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp,
	int wait_for_completion, boolean_t disable_notification)
{
	ASSERT_AIO_PROC_LOCK_OWNED(p);

	aio_workq_entry		*entryp;
	int			result;

	result = -1;

	/* look for a match on our queue of async todo work. */
	entryp = TAILQ_FIRST(&p->p_aio_activeq);
	while ( entryp != NULL ) {
		ASSERT_AIO_FROM_PROC(entryp, p);
		aio_workq_entry		*next_entryp;

		next_entryp = TAILQ_NEXT( entryp, aio_proc_link);
		if (!should_cancel(entryp, aiocbp, fd)) {
			entryp = next_entryp;
			continue;
		}

		/* Can only be cancelled if it's still on a work queue */
		if (aio_entry_try_workq_remove(entryp) != 0) {
			/* Have removed from workq. Update entry state and take a ref */
			aio_entry_update_for_cancel(entryp, TRUE, 0, disable_notification);

			/* Put on the proc done queue and update counts, then unlock the proc */
			aio_proc_move_done_locked(p, entryp);

			/* Now it's officially cancelled.  Do the completion */
			result = AIO_CANCELED;
			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) | DBG_FUNC_NONE,
				     (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
			do_aio_completion(entryp);

			/* This will free if the aio_return() has already happened ... */
			aio_entry_unref(entryp);

			if ( aiocbp != USER_ADDR_NULL ) {
				return( result );
			}

			/*
			 * Restart from the head of the proc active queue since it
			 * may have been changed while we were away doing completion
			 * processing.
			 *
			 * Note that if we found an uncancellable AIO before, we will
			 * either find it again or discover that it's been completed,
			 * so resetting the result will not cause us to return success
			 * despite outstanding AIOs.
			 */
			entryp = TAILQ_FIRST(&p->p_aio_activeq);
			result = -1; /* As if beginning anew */
		} else {
			/*
			 * It's been taken off the active queue already, i.e. is in flight.
			 * All we can do is ask for notification.
			 */
			result = AIO_NOTCANCELED;

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE,
				     (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

			/* Mark for waiting and such; will not take a ref if "cancelled" arg is FALSE */
			aio_entry_update_for_cancel(entryp, FALSE, wait_for_completion, disable_notification);

			if ( aiocbp != USER_ADDR_NULL ) {
				return( result );
			}
			entryp = next_entryp;
		}
	}

	/*
	 * if we didn't find any matches on the todo or active queues then look for a
	 * match on our queue of async IO requests that have completed and if found
	 * return AIO_ALLDONE result.
	 *
	 * Proc AIO lock is still held.
	 */
	if ( result == -1 ) {
		TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
			ASSERT_AIO_FROM_PROC(entryp, p);
			if (should_cancel(entryp, aiocbp, fd)) {
				result = AIO_ALLDONE;
				KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE,
					     (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

				if ( aiocbp != USER_ADDR_NULL ) {
					return( result );
				}
			}
		}
	}

	return( result );

} /* do_aio_cancel_locked */
/*
 * aio_suspend - suspend the calling thread until at least one of the async
 * IO operations referenced by uap->aiocblist has completed, until a signal
 * interrupts the function, or uap->timeoutp time interval (optional) has
 * elapsed.
 * Returns 0 if one or more async IOs have completed else -1 and errno is
 * set appropriately - EAGAIN if timeout elapses or EINTR if an interrupt
 * is caught.
 */
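/*
 * Illustrative user-space view (not part of this file):
 *
 *	const struct aiocb *list[2] = { &cb1, &cb2 };
 *	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 *	if (aio_suspend(list, 2, &ts) == -1 && errno == EAGAIN)
 *		;	// timeout - nothing in the list has finished yet
 *
 * The kernel side below copies in the aiocb pointer list, scans the done
 * queue for any match, and otherwise sleeps on AIO_SUSPEND_SLEEP_CHAN until
 * a completion, a signal, or the deadline.
 */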
int
aio_suspend(proc_t p, struct aio_suspend_args *uap, int *retval )
{
	__pthread_testcancel(1);
	return(aio_suspend_nocancel(p, (struct aio_suspend_nocancel_args *)uap, retval));
}

int
aio_suspend_nocancel(proc_t p, struct aio_suspend_nocancel_args *uap, int *retval )
{
	int			error;
	int			i, count;
	uint64_t		abstime;
	struct user_timespec	ts;
	aio_workq_entry		*entryp;
	user_addr_t		*aiocbpp;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START,
		     (int)p, uap->nent, 0, 0, 0 );

	count = aio_get_all_queues_count( );
	if ( count < 1 ) {
		goto ExitThisRoutine;
	}

	if ( uap->nent < 1 || uap->nent > aio_max_requests_per_process ) {
		goto ExitThisRoutine;
	}

	if ( uap->timeoutp != USER_ADDR_NULL ) {
		if ( proc_is64bit(p) ) {
			struct user64_timespec temp;
			error = copyin( uap->timeoutp, &temp, sizeof(temp) );
			ts.tv_sec = temp.tv_sec;
			ts.tv_nsec = temp.tv_nsec;
		} else {
			struct user32_timespec temp;
			error = copyin( uap->timeoutp, &temp, sizeof(temp) );
			ts.tv_sec = temp.tv_sec;
			ts.tv_nsec = temp.tv_nsec;
		}
		if ( error != 0 ) {
			goto ExitThisRoutine;
		}

		if ( ts.tv_sec < 0 || ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000 ) {
			goto ExitThisRoutine;
		}

		nanoseconds_to_absolutetime( (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
					     &abstime );
		clock_absolutetime_interval_to_deadline( abstime, &abstime );
	}

	aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent);
	if ( aiocbpp == NULL ) {
		goto ExitThisRoutine;
	}

	/* check list of aio requests to see if any have completed */
check_for_our_aiocbp:
	aio_proc_lock_spin(p);
	for ( i = 0; i < uap->nent; i++ ) {
		user_addr_t	aiocbp;

		/* NULL elements are legal so check for 'em */
		aiocbp = *(aiocbpp + i);
		if ( aiocbp == USER_ADDR_NULL )
			continue;

		/* return immediately if any aio request in the list is done */
		TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
			ASSERT_AIO_FROM_PROC(entryp, p);
			if ( entryp->uaiocbp == aiocbp ) {
				goto ExitThisRoutine;
			}
		}
	} /* for ( ; i < uap->nent; ) */

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE,
		     (int)p, uap->nent, 0, 0, 0 );

	/*
	 * wait for an async IO to complete or a signal fires or timeout expires.
	 * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
	 * interrupts us.  If an async IO completes before a signal fires or our
	 * timeout expires, we get a wakeup call from aio_work_thread().
	 */

	error = msleep1(&p->AIO_SUSPEND_SLEEP_CHAN, aio_proc_mutex(p), PCATCH | PWAIT | PDROP, "aio_suspend", abstime); /* XXX better priority? */
	if ( error == 0 ) {
		/*
		 * got our wakeup call from aio_work_thread().
		 * Since we can get a wakeup on this channel from another thread in the
		 * same process we head back up to make sure this is for the correct aiocbp.
		 * If it is the correct aiocbp we will return from where we do the check
		 * (see entryp->uaiocbp == aiocbp after check_for_our_aiocbp label)
		 * else we will fall out and just sleep again.
		 */
		goto check_for_our_aiocbp;
	}
	else if ( error == EWOULDBLOCK ) {
		/* our timeout expired */
		error = EAGAIN;
	}
	else {
		/* we were interrupted */
		error = EINTR;
	}

ExitThisRoutine:
	if ( aiocbpp != NULL )
		FREE( aiocbpp, M_TEMP );

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_END,
		     (int)p, uap->nent, error, 0, 0 );

	return( error );
}
/*
 * aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
 * file descriptor (uap->aiocbp->aio_fildes) from the buffer
 * (uap->aiocbp->aio_buf).
 */
int
aio_write(proc_t p, struct aio_write_args *uap, int *retval )
{
	int		error;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_START,
		     (int)p, (int)uap->aiocbp, 0, 0, 0 );

	error = aio_queue_async_request( p, uap->aiocbp, AIO_WRITE );

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_END,
		     (int)p, (int)uap->aiocbp, error, 0, 0 );
}
static user_addr_t *
aio_copy_in_list(proc_t procp, user_addr_t aiocblist, int nent)
{
	user_addr_t	*aiocbpp;
	int		i, result;

	/* we reserve enough space for largest possible pointer size */
	MALLOC( aiocbpp, user_addr_t *, (nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK );
	if ( aiocbpp == NULL )
		return( NULL );

	/* copyin our aiocb pointers from list */
	result = copyin( aiocblist, aiocbpp,
			 proc_is64bit(procp) ? (nent * sizeof(user64_addr_t))
					     : (nent * sizeof(user32_addr_t)) );
	if ( result != 0 ) {
		FREE( aiocbpp, M_TEMP );
		return( NULL );
	}

	/*
	 * We depend on a list of user_addr_t's so we need to
	 * munge and expand when these pointers came from a
	 * 32-bit process.
	 */
	if ( !proc_is64bit(procp) ) {
		/* copy from last to first to deal with overlap */
		user32_addr_t *my_ptrp = ((user32_addr_t *)aiocbpp) + (nent - 1);
		user_addr_t *my_addrp = aiocbpp + (nent - 1);

		for (i = 0; i < nent; i++, my_ptrp--, my_addrp--) {
			*my_addrp = (user_addr_t) (*my_ptrp);
		}
	}

	return( aiocbpp );
}
static int
aio_copy_in_sigev(proc_t procp, user_addr_t sigp, struct user_sigevent *sigev)
{
	int	result = 0;

	if (sigp == USER_ADDR_NULL)
		return( result );

	/*
	 * We need to munge aio_sigevent since it contains pointers.
	 * Since we do not know if sigev_value is an int or a ptr we do
	 * NOT cast the ptr to a user_addr_t.  This means if we send
	 * this info back to user space we need to remember sigev_value
	 * was not expanded for the 32-bit case.
	 *
	 * Notes: This does NOT affect us since we don't support
	 *	  sigev_value yet in the aio context.
	 */
	if ( proc_is64bit(procp) ) {
		struct user64_sigevent sigevent64;

		result = copyin( sigp, &sigevent64, sizeof(sigevent64) );
		if ( result == 0 ) {
			sigev->sigev_notify = sigevent64.sigev_notify;
			sigev->sigev_signo = sigevent64.sigev_signo;
			sigev->sigev_value.size_equivalent.sival_int = sigevent64.sigev_value.size_equivalent.sival_int;
			sigev->sigev_notify_function = sigevent64.sigev_notify_function;
			sigev->sigev_notify_attributes = sigevent64.sigev_notify_attributes;
		}
	} else {
		struct user32_sigevent sigevent32;

		result = copyin( sigp, &sigevent32, sizeof(sigevent32) );
		if ( result == 0 ) {
			sigev->sigev_notify = sigevent32.sigev_notify;
			sigev->sigev_signo = sigevent32.sigev_signo;
			sigev->sigev_value.size_equivalent.sival_int = sigevent32.sigev_value.sival_int;
			sigev->sigev_notify_function = CAST_USER_ADDR_T(sigevent32.sigev_notify_function);
			sigev->sigev_notify_attributes = CAST_USER_ADDR_T(sigevent32.sigev_notify_attributes);
		}
	}

	if ( result != 0 ) {
	}

	return( result );
}
/*
 * Queue up the entry on the aio asynchronous work queue in priority order
 * based on the relative priority of the request.  We calculate the relative
 * priority using the nice value of the caller and the value
 *
 * Parameters:	procp			Process queueing the I/O
 *		entryp			The work queue entry being queued
 *
 * Returns:	(void)			No failure modes
 *
 * Notes:	This function is used for both lio_listio and aio
 *
 * XXX:		At some point, we may have to consider thread priority
 *		rather than process priority, but we don't maintain the
 *		adjusted priority for threads the POSIX way.
 *
 * Called with proc locked.
 */
static void
aio_enqueue_work( proc_t procp, aio_workq_entry *entryp, int proc_locked)
{
	aio_workq_entry	*my_entryp;	/* used for insertion sort */
	aio_workq_t queue = aio_entry_workq(entryp);

	if (proc_locked == 0) {
		aio_proc_lock(procp);
	}

	ASSERT_AIO_PROC_LOCK_OWNED(procp);

	/* Onto proc queue */
	TAILQ_INSERT_TAIL(&procp->p_aio_activeq, entryp, aio_proc_link);
	procp->p_aio_active_count++;
	procp->p_aio_total_count++;

	/* And work queue */
	aio_workq_lock_spin(queue);
	aio_workq_add_entry_locked(queue, entryp);
	waitq_wakeup64_one(&queue->aioq_waitq, CAST_EVENT64_T(queue),
			   THREAD_AWAKENED, WAITQ_ALL_PRIORITIES );
	aio_workq_unlock(queue);

	if (proc_locked == 0) {
		aio_proc_unlock(procp);
	}
	/*
	 * (1)	The nice value is in the range PRIO_MIN..PRIO_MAX [-20..20]
	 * (2)	The normalized nice value is in the range 0..((2 * NZERO) - 1)
	 *	which is [0..39], with 0 not being used.  In nice values, the
	 *	lower the nice value, the higher the priority.
	 * (3)	The normalized scheduling priority is the highest nice value
	 *	minus the current nice value.  In I/O scheduling priority, the
	 *	higher the value the lower the priority, so it is the inverse
	 *	of the nice value (the higher the number, the higher the I/O
	 *	priority).
	 * (4)	From the normalized scheduling priority, we subtract the
	 *	request priority to get the request priority value number;
	 *	this means that requests are only capable of depressing their
	 *	priority relative to other requests.
	 */
	entryp->priority = (((2 * NZERO) - 1) - procp->p_nice);

	/* only permit depressing the priority */
	if (entryp->aiocb.aio_reqprio < 0)
		entryp->aiocb.aio_reqprio = 0;
	if (entryp->aiocb.aio_reqprio > 0) {
		entryp->priority -= entryp->aiocb.aio_reqprio;
		if (entryp->priority < 0)
			entryp->priority = 0;
	}
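	/*
	 * Worked example of the formula above: NZERO is 20, so a caller whose
	 * normalized nice value is the default 20 starts at priority
	 * ((2 * 20) - 1) - 20 = 19; an aio_reqprio of 5 then depresses that to
	 * 14, and because negative aio_reqprio values are clamped to 0 a
	 * request can never raise itself above its process's base value.
	 */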
	/* Insertion sort the entry; lowest ->priority to highest */
	TAILQ_FOREACH(my_entryp, &aio_anchor.aio_async_workq, aio_workq_link) {
		if ( entryp->priority <= my_entryp->priority) {
			TAILQ_INSERT_BEFORE(my_entryp, entryp, aio_workq_link);
			break;
		}
	}

	if (my_entryp == NULL)
		TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
}
/*
 * lio_listio - initiate a list of IO requests.  We process the list of
 * aiocbs either synchronously (mode == LIO_WAIT) or asynchronously
 * (mode == LIO_NOWAIT).
 *
 * The caller gets error and return status for each aiocb in the list
 * via aio_error and aio_return.  We must keep completed requests until
 * released by the aio_return call.
 */
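/*
 * Illustrative user-space view (not part of this file):
 *
 *	struct aiocb *list[2] = { &read_cb, &write_cb };
 *	lio_listio(LIO_WAIT, list, 2, NULL);         // blocks until both finish
 *	lio_listio(LIO_NOWAIT, list, 2, &sigevent);  // queues and returns; the optional
 *	                                             // sigevent fires after the last one
 *
 * Either way, per-request status still comes from aio_error()/aio_return().
 */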
int
lio_listio(proc_t p, struct lio_listio_args *uap, int *retval )
{
	int			i;
	int			call_result;
	int			result;
	int			old_count;
	aio_workq_entry		**entryp_listp;
	user_addr_t		*aiocbpp;
	struct user_sigevent	aiosigev;
	aio_lio_context		*lio_context;
	boolean_t		free_context = FALSE;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START,
		     (int)p, uap->nent, uap->mode, 0, 0 );

	entryp_listp = NULL;
	lio_context = NULL;
	aiocbpp = NULL;
	call_result = -1;

	if ( !(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT) ) {
		call_result = EINVAL;
	}

	if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
		call_result = EINVAL;
	}

	/*
	 * allocate a list of aio_workq_entry pointers that we will use
	 * to queue up all our requests at once while holding our lock.
	 */
	MALLOC( entryp_listp, void *, (uap->nent * sizeof(aio_workq_entry *)), M_TEMP, M_WAITOK );
	if ( entryp_listp == NULL ) {
		call_result = EAGAIN;
	}

	MALLOC( lio_context, aio_lio_context*, sizeof(aio_lio_context), M_TEMP, M_WAITOK );
	if ( lio_context == NULL ) {
		call_result = EAGAIN;
	}

	OSIncrementAtomic(&lio_contexts_alloced);

	bzero(lio_context, sizeof(aio_lio_context));

	aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent);
	if ( aiocbpp == NULL ) {
		call_result = EAGAIN;
	}

	/*
	 * Use sigevent passed in to lio_listio for each of our calls, but
	 * only do completion notification after the last request completes.
	 */
	bzero(&aiosigev, sizeof(aiosigev));
	/* Only copy in an sigev if the user supplied one */
	if (uap->sigp != USER_ADDR_NULL) {
		call_result = aio_copy_in_sigev(p, uap->sigp, &aiosigev);
	}

	/* process list of aio requests */
	lio_context->io_issued = uap->nent;
	lio_context->io_waiter = uap->mode == LIO_WAIT ? 1 : 0; /* Should it be freed by last AIO */
	for ( i = 0; i < uap->nent; i++ ) {
		user_addr_t my_aiocbp;
		aio_workq_entry		*entryp;

		*(entryp_listp + i) = NULL;
		my_aiocbp = *(aiocbpp + i);

		/* NULL elements are legal so check for 'em */
		if ( my_aiocbp == USER_ADDR_NULL ) {
			aio_proc_lock_spin(p);
			lio_context->io_issued--;
			aio_proc_unlock(p);
			continue;
		}

		/*
		 * We use lio_context to mark IO requests for delayed completion
		 * processing which means we wait until all IO requests in the
		 * group have completed before we either return to the caller
		 * when mode is LIO_WAIT or signal user when mode is LIO_NOWAIT.
		 *
		 * We use the address of the lio_context for this, since it is
		 * unique in the address space.
		 */
		result = lio_create_entry( p, my_aiocbp, lio_context, (entryp_listp + i) );
		if ( result != 0 && call_result == -1 )
			call_result = result;

		/* NULL elements are legal so check for 'em */
		entryp = *(entryp_listp + i);
		if ( entryp == NULL ) {
			aio_proc_lock_spin(p);
			lio_context->io_issued--;
			aio_proc_unlock(p);
			continue;
		}

		if ( uap->mode == LIO_NOWAIT ) {
			/* Set signal handler, if any */
			entryp->aiocb.aio_sigevent = aiosigev;
		} else {
			/* flag that this thread blocks pending completion */
			entryp->flags |= AIO_LIO_NOTIFY;
		}

		/* check our aio limits to throttle bad or rude user land behavior */
		old_count = aio_increment_total_count();

		aio_proc_lock_spin(p);
		if ( old_count >= aio_max_requests ||
		     aio_get_process_count( entryp->procp ) >= aio_max_requests_per_process ||
		     is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {

			lio_context->io_issued--;
			aio_proc_unlock(p);

			aio_decrement_total_count();

			if ( call_result == -1 )
				call_result = EAGAIN;
			aio_free_request(entryp);
			entryp_listp[i] = NULL;
			continue;
		}

		lck_mtx_convert_spin(aio_proc_mutex(p));
		aio_enqueue_work(p, entryp, 1);
		aio_proc_unlock(p);

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
			     (int)p, (int)entryp->uaiocbp, 0, 0, 0 );
	}

	if ( uap->mode == LIO_WAIT ) {
		aio_proc_lock_spin(p);
		while (lio_context->io_completed < lio_context->io_issued) {
			result = msleep(lio_context, aio_proc_mutex(p), PCATCH | PRIBIO | PSPIN, "lio_listio", 0);

			/* If we were interrupted, fail out (even if all finished) */
			if (result != 0) {
				call_result = EINTR;
				lio_context->io_waiter = 0;
				break;
			}
		}

		/* If all IOs have finished must free it */
		if (lio_context->io_completed == lio_context->io_issued) {
			free_context = TRUE;
		}

		aio_proc_unlock(p);
	}

	/* call_result == -1 means we had no trouble queueing up requests */
	if ( call_result == -1 ) {
		call_result = 0;
		*retval = 0;
	}

	if ( entryp_listp != NULL )
		FREE( entryp_listp, M_TEMP );
	if ( aiocbpp != NULL )
		FREE( aiocbpp, M_TEMP );
	if ((lio_context != NULL) && ((lio_context->io_issued == 0) || (free_context == TRUE))) {
		free_lio_context(lio_context);
	}

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END,
		     (int)p, call_result, 0, 0, 0 );

	return( call_result );

} /* lio_listio */
/*
 * aio worker thread.  this is where all the real work gets done.
 * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
 * after new work is queued up.
 */
__attribute__((noreturn))
static void
aio_work_thread(void)
{
	aio_workq_entry		*entryp;
	int			error;
	vm_map_t		currentmap;
	vm_map_t		oldmap = VM_MAP_NULL;
	task_t			oldaiotask = TASK_NULL;
	struct uthread		*uthreadp = NULL;

	for ( ;; ) {
		/*
		 * returns with the entry ref'ed.
		 * sleeps until work is available.
		 */
		entryp = aio_get_some_work();

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START,
			     (int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0 );

		/*
		 * Assume the target's address space identity for the duration
		 * of the IO.  Note: don't need to have the entryp locked,
		 * because the proc and map don't change until it's freed.
		 */
		currentmap = get_task_map( (current_proc())->task );
		if ( currentmap != entryp->aio_map ) {
			uthreadp = (struct uthread *) get_bsdthread_info(current_thread());
			oldaiotask = uthreadp->uu_aio_task;
			uthreadp->uu_aio_task = entryp->procp->task;
			oldmap = vm_map_switch( entryp->aio_map );
		}

		if ( (entryp->flags & AIO_READ) != 0 ) {
			error = do_aio_read( entryp );
		}
		else if ( (entryp->flags & AIO_WRITE) != 0 ) {
			error = do_aio_write( entryp );
		}
		else if ( (entryp->flags & (AIO_FSYNC | AIO_DSYNC)) != 0 ) {
			error = do_aio_fsync( entryp );
		}
		else {
			printf( "%s - unknown aio request - flags 0x%02X \n",
				__FUNCTION__, entryp->flags );
			error = EINVAL;
		}

		/* Restore old map */
		if ( currentmap != entryp->aio_map ) {
			(void) vm_map_switch( oldmap );
			uthreadp->uu_aio_task = oldaiotask;
		}

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END,
			     (int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval,
			     entryp->returnval, 0 );

		aio_entry_lock_spin(entryp);
		entryp->errorval = error;
		aio_entry_unlock(entryp);

		/* we're done with the IO request so pop it off the active queue and */
		/* push it on the done queue */
		aio_proc_lock(entryp->procp);
		aio_proc_move_done_locked(entryp->procp, entryp);
		aio_proc_unlock(entryp->procp);

		OSDecrementAtomic(&aio_anchor.aio_inflight_count);

		/* remove our reference to the user land map. */
		if ( VM_MAP_NULL != entryp->aio_map ) {
			vm_map_t	my_map;

			my_map = entryp->aio_map;
			entryp->aio_map = VM_MAP_NULL;
			vm_map_deallocate( my_map );
		}

		/* Provide notifications */
		do_aio_completion( entryp );

		/* Will free if needed */
		aio_entry_unref(entryp);
	}

} /* aio_work_thread */
/*
 * aio_get_some_work - get the next async IO request that is ready to be executed.
 * aio_fsync complicates matters a bit since we cannot do the fsync until all async
 * IO requests at the time the aio_fsync call came in have completed.
 * NOTE - AIO_LOCK must be held by caller
 */
static aio_workq_entry *
aio_get_some_work( void )
{
	aio_workq_entry		*entryp = NULL;
	aio_workq_t		queue = NULL;

	/* Just one queue for the moment.  In the future there will be many. */
	queue = &aio_anchor.aio_async_workqs[0];
	aio_workq_lock_spin(queue);
	if (queue->aioq_count == 0) {
		goto nowork;
	}

	/*
	 * Hold the queue lock.
	 *
	 * pop some work off the work queue and add to our active queue
	 * Always start with the queue lock held.
	 */

	/*
	 * Pull off of work queue.  Once it's off, it can't be cancelled,
	 * so we can take our ref once we drop the queue lock.
	 */
	entryp = TAILQ_FIRST(&queue->aioq_entries);

	/*
	 * If there's no work or only fsyncs that need delay, go to sleep
	 * and then start anew from aio_work_thread
	 */
	if (entryp == NULL) {
		goto nowork;
	}

	aio_workq_remove_entry_locked(queue, entryp);

	aio_workq_unlock(queue);

	/*
	 * Check if it's an fsync that must be delayed.  No need to lock the entry;
	 * that flag would have been set at initialization.
	 */
	if ( (entryp->flags & AIO_FSYNC) != 0 ) {
		/*
		 * Check for unfinished operations on the same file
		 * in this proc's queue.
		 */
		aio_proc_lock_spin(entryp->procp);
		if ( aio_delay_fsync_request( entryp ) ) {
			/* It needs to be delayed.  Put it back on the end of the work queue */
			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) | DBG_FUNC_NONE,
				     (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

			aio_proc_unlock(entryp->procp);

			aio_workq_lock_spin(queue);
			aio_workq_add_entry_locked(queue, entryp);
		}
		aio_proc_unlock(entryp->procp);
	}

	aio_entry_ref(entryp);

	OSIncrementAtomic(&aio_anchor.aio_inflight_count);
	return( entryp );

nowork:
	/* We will wake up when someone enqueues something */
	waitq_assert_wait64(&queue->aioq_waitq, CAST_EVENT64_T(queue), THREAD_UNINT, 0);
	aio_workq_unlock(queue);
	thread_block( (thread_continue_t)aio_work_thread );
}
/*
 * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed.
 * A big, simple hammer: only send it off if it's the most recently filed IO which has
 * not been completed.
 */
static boolean_t
aio_delay_fsync_request( aio_workq_entry *entryp )
{
	if (entryp == TAILQ_FIRST(&entryp->procp->p_aio_activeq)) {
		return FALSE;
	}

	return TRUE;
} /* aio_delay_fsync_request */
static aio_workq_entry *
aio_create_queue_entry(proc_t procp, user_addr_t aiocbp, void *group_tag, int kindOfIO)
{
	aio_workq_entry	*entryp;
	int		result = 0;

	entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
	if ( entryp == NULL ) {
		return( NULL );
	}

	bzero( entryp, sizeof(*entryp) );

	/* fill in the rest of the aio_workq_entry */
	entryp->procp = procp;
	entryp->uaiocbp = aiocbp;
	entryp->flags |= kindOfIO;
	entryp->group_tag = group_tag;
	entryp->aio_map = VM_MAP_NULL;
	entryp->aio_refcount = 0;

	if ( proc_is64bit(procp) ) {
		struct user64_aiocb aiocb64;

		result = copyin( aiocbp, &aiocb64, sizeof(aiocb64) );
		do_munge_aiocb_user64_to_user(&aiocb64, &entryp->aiocb);
	} else {
		struct user32_aiocb aiocb32;

		result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
		do_munge_aiocb_user32_to_user( &aiocb32, &entryp->aiocb );
	}

	if ( result != 0 ) {
	}

	/* get a reference to the user land map in order to keep it around */
	entryp->aio_map = get_task_map( procp->task );
	vm_map_reference( entryp->aio_map );

	/* do some more validation on the aiocb and embedded file descriptor */
	result = aio_validate( entryp );
	if ( result != 0 )
		goto error_exit_with_ref;

	/* get a reference on the current_thread, which is passed in vfs_context. */
	entryp->thread = current_thread();
	thread_reference( entryp->thread );

	return( entryp );

error_exit_with_ref:
	if ( VM_MAP_NULL != entryp->aio_map ) {
		vm_map_deallocate( entryp->aio_map );
	}
	if ( result && entryp != NULL ) {
		zfree( aio_workq_zonep, entryp );
		entryp = NULL;
	}

	return( entryp );
}
/*
 * aio_queue_async_request - queue up an async IO request on our work queue then
 * wake up one of our worker threads to do the actual work.  We get a reference
 * to our caller's user land map in order to keep it around while we are
 * processing the request.
 */
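/*
 * The fast path below is: bump the global total, build and validate an
 * aio_workq_entry (which also takes the map and thread references), check the
 * per-process and duplicate-aiocb limits under the proc lock, and then hand
 * the entry to aio_enqueue_work(), which wakes a worker thread.
 */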
static int
aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO )
{
	aio_workq_entry	*entryp;
	int		old_count;

	old_count = aio_increment_total_count();
	if (old_count >= aio_max_requests) {
	}

	entryp = aio_create_queue_entry( procp, aiocbp, 0, kindOfIO);
	if ( entryp == NULL ) {
	}

	aio_proc_lock_spin(procp);

	if ( is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
	}

	/* check our aio limits to throttle bad or rude user land behavior */
	if (aio_get_process_count( procp ) >= aio_max_requests_per_process) {
		printf("aio_queue_async_request(): too many in flight for proc: %d.\n", procp->p_aio_total_count);
	}

	/* Add the IO to proc and work queues, wake up threads as appropriate */
	lck_mtx_convert_spin(aio_proc_mutex(procp));
	aio_enqueue_work(procp, entryp, 1);

	aio_proc_unlock(procp);

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
		     (int)procp, (int)aiocbp, 0, 0, 0 );

	return( 0 );

	/*
	 * This entry has not been queued up so no worries about
	 * unlocked state and aio_map
	 */
	aio_proc_unlock(procp);
	aio_free_request(entryp);

	aio_decrement_total_count();

} /* aio_queue_async_request */
/*
 * Allocate an aio_workq_entry and fill it in.  If all goes well return 0
 * and pass the aio_workq_entry pointer back to our caller.
 *
 * Parameters:	procp			The process making the request
 *		aiocbp			The aio context buffer pointer
 *		group_tag		The group tag used to indicate a
 *					group of operations has completed
 *		entrypp			Pointer to the pointer to receive the
 *					address of the created aio_workq_entry
 *
 * Returns:	0			Successfully created
 *		EAGAIN			Try again (usually resource shortage)
 *
 * Notes:	We get a reference to our caller's user land map in order
 *		to keep it around while we are processing the request.
 *
 *		lio_listio calls behave differently at completion: they do
 *		completion notification when all async IO requests have
 *		completed.  We use group_tag to tag IO requests that behave
 *		in the delay notification manner.
 *
 *		All synchronous operations are considered to not have a
 *		signal routine associated with them (sigp == USER_ADDR_NULL).
 */
static int
lio_create_entry(proc_t procp, user_addr_t aiocbp, void *group_tag,
		 aio_workq_entry **entrypp )
{
	aio_workq_entry	*entryp;

	entryp = aio_create_queue_entry( procp, aiocbp, group_tag, AIO_LIO);
	if ( entryp == NULL ) {
	}

	/*
	 * Look for lio_listio LIO_NOP requests and ignore them; this is
	 * not really an error, but we need to free our aio_workq_entry.
	 */
	if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
	}

	*entrypp = entryp;
	return( 0 );

	if ( entryp != NULL ) {
		/*
		 * This entry has not been queued up so no worries about
		 * unlocked state and aio_map
		 */
		aio_free_request(entryp);
	}

} /* lio_create_entry */
/*
 * aio_free_request - remove our reference on the user land map and
 * free the work queue entry resources.  The entry is off all lists
 * and has zero refcount, so no one can have a pointer to it.
 */
static int
aio_free_request(aio_workq_entry *entryp)
{
	/* remove our reference to the user land map. */
	if ( VM_MAP_NULL != entryp->aio_map ) {
		vm_map_deallocate(entryp->aio_map);
	}

	/* remove our reference to thread which enqueued the request */
	if ( NULL != entryp->thread ) {
		thread_deallocate( entryp->thread );
	}

	entryp->aio_refcount = -1; /* A bit of poisoning in case of bad refcounting. */

	zfree( aio_workq_zonep, entryp );

	return( 0 );

} /* aio_free_request */
/*
 * aio_validate - validate the aiocb passed in by one of the aio syscalls.
 */
static int
aio_validate( aio_workq_entry *entryp )
{
	struct fileproc		*fp;
	int			flag;
	int			result;

	result = 0;

	if ( (entryp->flags & AIO_LIO) != 0 ) {
		if ( entryp->aiocb.aio_lio_opcode == LIO_READ )
			entryp->flags |= AIO_READ;
		else if ( entryp->aiocb.aio_lio_opcode == LIO_WRITE )
			entryp->flags |= AIO_WRITE;
		else if ( entryp->aiocb.aio_lio_opcode == LIO_NOP )
			return( 0 );
		else
			return( EINVAL );
	}

	flag = FREAD;
	if ( (entryp->flags & (AIO_WRITE | AIO_FSYNC | AIO_DSYNC)) != 0 ) {
		flag = FWRITE;
	}

	if ( (entryp->flags & (AIO_READ | AIO_WRITE)) != 0 ) {
		if ( entryp->aiocb.aio_nbytes > INT_MAX	     ||
		     entryp->aiocb.aio_buf == USER_ADDR_NULL ||
		     entryp->aiocb.aio_offset < 0 )
			return( EINVAL );
	}

	/*
	 * validate aiocb.aio_sigevent.  at this point we only support
	 * sigev_notify equal to SIGEV_SIGNAL or SIGEV_NONE.  this means
	 * sigev_value, sigev_notify_function, and sigev_notify_attributes
	 * are ignored, since SIGEV_THREAD is unsupported.  This is consistent
	 * with no [RTS] (RealTime Signal) option group support.
	 */
	switch ( entryp->aiocb.aio_sigevent.sigev_notify ) {
	case SIGEV_SIGNAL:
	    {
		int		signum;

		/* make sure we have a valid signal number */
		signum = entryp->aiocb.aio_sigevent.sigev_signo;
		if ( signum <= 0 || signum >= NSIG ||
		     signum == SIGKILL || signum == SIGSTOP )
			return( EINVAL );
	    }
	    break;

	case SIGEV_NONE:
		break;

	case SIGEV_THREAD:
		/* Unsupported [RTS] */

	default:
		return( EINVAL );
	}

	/* validate the file descriptor and that the file was opened
	 * for the appropriate read / write access.
	 */
	proc_fdlock(entryp->procp);

	result = fp_lookup( entryp->procp, entryp->aiocb.aio_fildes, &fp, 1);
	if ( result == 0 ) {
		if ( (fp->f_fglob->fg_flag & flag) == 0 ) {
			/* we don't have read or write access */
			result = EBADF;
		}
		else if ( FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_VNODE ) {
			/* this is not a file */
			result = ESPIPE;
		} else
			fp->f_flags |= FP_AIOISSUED;

		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 1);
	}
	else {
		result = EBADF;
	}

	proc_fdunlock(entryp->procp);

	return( result );

} /* aio_validate */
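/*
 * Illustrative sketch (not part of this file): the only notification styles
 * aio_validate() accepts are SIGEV_NONE and SIGEV_SIGNAL, so a user-space
 * caller that wants a completion signal sets up its aiocb roughly like this
 * (SIGUSR1 is an arbitrary example choice):
 *
 *	#include <aio.h>
 *	#include <signal.h>
 *
 *	struct aiocb cb = { 0 };
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = buffer;
 *	cb.aio_nbytes = sizeof(buffer);
 *	cb.aio_offset = 0;
 *	cb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
 *	cb.aio_sigevent.sigev_signo = SIGUSR1;	// SIGKILL/SIGSTOP are rejected
 *	aio_read(&cb);
 *
 * An aiocb asking for SIGEV_THREAD falls through to the EINVAL default case
 * in the switch above.
 */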
static int
aio_increment_total_count()
{
	return OSIncrementAtomic(&aio_anchor.aio_total_count);
}

static int
aio_decrement_total_count()
{
	int old = OSDecrementAtomic(&aio_anchor.aio_total_count);
	if (old <= 0) {
		panic("Negative total AIO count!\n");
	}

	return old;
}

static int
aio_get_process_count(proc_t procp)
{
	return procp->p_aio_total_count;

} /* aio_get_process_count */

static int
aio_get_all_queues_count( void )
{
	return aio_anchor.aio_total_count;

} /* aio_get_all_queues_count */
/*
 * do_aio_completion.  Handle async IO completion.
 */
static void
do_aio_completion( aio_workq_entry *entryp )
{
	boolean_t		lastLioCompleted = FALSE;
	aio_lio_context		*lio_context = NULL;
	int			waiter = 0;

	lio_context = (aio_lio_context *)entryp->group_tag;

	if (lio_context != NULL) {

		aio_proc_lock_spin(entryp->procp);

		/* Account for this I/O completing. */
		lio_context->io_completed++;

		/* Are we done with this lio context? */
		if (lio_context->io_issued == lio_context->io_completed) {
			lastLioCompleted = TRUE;
		}

		waiter = lio_context->io_waiter;

		/* explicit wakeup of lio_listio() waiting in LIO_WAIT */
		if ((entryp->flags & AIO_LIO_NOTIFY) && (lastLioCompleted) && (waiter != 0)) {
			/* wake up the waiter */
			wakeup(lio_context);
		}

		aio_proc_unlock(entryp->procp);
	}

	if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
	     (entryp->flags & AIO_DISABLE) == 0 ) {

		boolean_t	performSignal = FALSE;
		if (lio_context == NULL) {
			performSignal = TRUE;
		}
		else {
			/*
			 * If this was the last request in the group and a signal
			 * is desired, send one.
			 */
			performSignal = lastLioCompleted;
		}

		if (performSignal) {

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) | DBG_FUNC_NONE,
				      (int)entryp->procp, (int)entryp->uaiocbp,
				      entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 );

			psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );
		}
	}

	if ((entryp->flags & AIO_EXIT_WAIT) && (entryp->flags & AIO_CLOSE_WAIT)) {
		panic("Close and exit flags set at the same time\n");
	}

	/*
	 * need to handle case where a process is trying to exit, exec, or
	 * close and is currently waiting for active aio requests to complete.
	 * If AIO_CLEANUP_WAIT is set then we need to look to see if there are any
	 * other requests in the active queue for this process.  If there are
	 * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel.
	 * If there are some still active then do nothing - we only want to
	 * wakeup when all active aio requests for the process are complete.
	 *
	 * Don't need to lock the entry or proc to check the cleanup flag.  It can only be
	 * set for cancellation, while the entryp is still on a proc list; now it's
	 * off, so that flag is already set if it's going to be.
	 */
	if ( (entryp->flags & AIO_EXIT_WAIT) != 0 ) {
		int		active_requests;

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
			      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

		aio_proc_lock_spin(entryp->procp);
		active_requests = aio_active_requests_for_process( entryp->procp );
		if ( active_requests < 1 ) {
			/*
			 * no active aio requests for this process, continue exiting.  In this
			 * case, there should be no one else waiting on the proc in AIO...
			 */
			wakeup_one((caddr_t)&entryp->procp->AIO_CLEANUP_SLEEP_CHAN);
			aio_proc_unlock(entryp->procp);

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
				      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
		} else {
			aio_proc_unlock(entryp->procp);
		}
	}

	if ( (entryp->flags & AIO_CLOSE_WAIT) != 0 ) {
		int		active_requests;

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
			      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

		aio_proc_lock_spin(entryp->procp);
		active_requests = aio_proc_active_requests_for_file( entryp->procp, entryp->aiocb.aio_fildes);
		if ( active_requests < 1 ) {
			/* Can't wakeup_one(); multiple closes might be in progress. */
			wakeup(&entryp->procp->AIO_CLEANUP_SLEEP_CHAN);
			aio_proc_unlock(entryp->procp);

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
				      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
		} else {
			aio_proc_unlock(entryp->procp);
		}
	}

	/*
	 * A thread in aio_suspend() wants to know about completed IOs.  If it checked
	 * the done list before we moved our AIO there, then it already asserted its wait,
	 * and we can wake it up without holding the lock.  If it checked the list after
	 * we did our move, then it already has seen the AIO that we moved.  Hence, we
	 * can do our wakeup without holding the lock.
	 */
	wakeup( (caddr_t) &entryp->procp->AIO_SUSPEND_SLEEP_CHAN );
	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE,
		      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

	/*
	 * free the LIO context if the last lio completed and no thread is
	 * waiting for it.
	 */
	if (lastLioCompleted && (waiter == 0))
		free_lio_context(lio_context);

} /* do_aio_completion */
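/*
 * Illustrative sketch (an assumption about the exit/close paths elsewhere in
 * this file, with an invented wmesg string): the wakeup on
 * AIO_CLEANUP_SLEEP_CHAN above pairs with a waiter that sleeps until its
 * active request count drains, roughly:
 *
 *	aio_proc_lock(p);
 *	while (aio_active_requests_for_process(p) > 0) {
 *		// woken by the wakeup_one()/wakeup() calls in do_aio_completion()
 *		msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p),
 *		       PRIBIO, "aio_cleanup_wait", NULL);
 *	}
 *	aio_proc_unlock(p);
 *
 * aio_proc_mutex() here stands in for whatever lock aio_proc_lock() wraps.
 */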
/*
 * do_aio_read
 */
static int
do_aio_read( aio_workq_entry *entryp )
{
	struct fileproc		*fp;
	int			error;
	struct vfs_context	context;

	if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp, 0)) )
		return( error );
	if ( (fp->f_fglob->fg_flag & FREAD) == 0 ) {
		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
		return( EBADF );
	}

	context.vc_thread = entryp->thread;	/* XXX */
	context.vc_ucred = fp->f_fglob->fg_cred;

	error = dofileread(&context, fp,
			   entryp->aiocb.aio_buf,
			   entryp->aiocb.aio_nbytes,
			   entryp->aiocb.aio_offset, FOF_OFFSET,
			   &entryp->returnval);
	fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);

	return( error );

} /* do_aio_read */
/*
 * do_aio_write
 */
static int
do_aio_write( aio_workq_entry *entryp )
{
	struct fileproc		*fp;
	int			error;
	int			flags;
	struct vfs_context	context;

	if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp, 0)) )
		return( error );
	if ( (fp->f_fglob->fg_flag & FWRITE) == 0 ) {
		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
		return( EBADF );
	}

	flags = FOF_PCRED;
	if ( (fp->f_fglob->fg_flag & O_APPEND) == 0 ) {
		flags |= FOF_OFFSET;
	}

	context.vc_thread = entryp->thread;	/* XXX */
	context.vc_ucred = fp->f_fglob->fg_cred;

	/* NB: tell dofilewrite the offset, and to use the proc cred */
	error = dofilewrite(&context,
			    fp,
			    entryp->aiocb.aio_buf,
			    entryp->aiocb.aio_nbytes,
			    entryp->aiocb.aio_offset,
			    flags,
			    &entryp->returnval);

	if (entryp->returnval)
		fp_drop_written(entryp->procp, entryp->aiocb.aio_fildes, fp);
	else
		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);

	return( error );

} /* do_aio_write */
/*
 * aio_active_requests_for_process - return number of active async IO
 * requests for the given process.
 */
static int
aio_active_requests_for_process(proc_t procp)
{
	return( procp->p_aio_active_count );

} /* aio_active_requests_for_process */

/*
 * Called with the proc locked.
 */
static int
aio_proc_active_requests_for_file(proc_t procp, int fd)
{
	int count = 0;
	aio_workq_entry *entryp;

	TAILQ_FOREACH(entryp, &procp->p_aio_activeq, aio_proc_link) {
		if (entryp->aiocb.aio_fildes == fd) {
			count++;
		}
	}

	return count;
} /* aio_proc_active_requests_for_file */
/*
 * do_aio_fsync
 */
static int
do_aio_fsync( aio_workq_entry *entryp )
{
	struct vfs_context	context;
	struct vnode		*vp;
	struct fileproc		*fp;
	int			sync_flag;
	int			error;

	/*
	 * We are never called unless either AIO_FSYNC or AIO_DSYNC are set.
	 *
	 * If AIO_DSYNC is set, we can tell the lower layers that it is OK
	 * to mark for update the metadata not strictly necessary for data
	 * retrieval, rather than forcing it to disk.
	 *
	 * If AIO_FSYNC is set, we have to also wait until metadata not really
	 * necessary for data retrieval (e.g. atime, mtime, ctime, etc.) is
	 * committed to stable storage.
	 *
	 * Metadata necessary for data retrieval must be committed to stable
	 * storage in either case (file length, etc.).
	 */
	if (entryp->flags & AIO_FSYNC)
		sync_flag = MNT_WAIT;
	else
		sync_flag = MNT_DWAIT;

	error = fp_getfvp( entryp->procp, entryp->aiocb.aio_fildes, &fp, &vp);
	if ( error == 0 ) {
		if ( (error = vnode_getwithref(vp)) ) {
			fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
			entryp->returnval = -1;
			return( error );
		}
		context.vc_thread = current_thread();
		context.vc_ucred = fp->f_fglob->fg_cred;

		error = VNOP_FSYNC( vp, sync_flag, &context);

		(void)vnode_put(vp);

		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
	}
	if ( error != 0 )
		entryp->returnval = -1;

	return( error );

} /* do_aio_fsync */
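/*
 * Illustrative sketch (not part of this file): from user space the two sync
 * flavors are selected through aio_fsync()'s op argument; this sketch assumes
 * the syscall entry point earlier in this file maps O_SYNC to AIO_FSYNC and
 * O_DSYNC to AIO_DSYNC, as the flag names suggest.
 *
 *	#include <aio.h>
 *	#include <fcntl.h>
 *
 *	struct aiocb cb = { 0 };
 *	cb.aio_fildes = fd;
 *	aio_fsync(O_SYNC, &cb);		// full sync: the MNT_WAIT path above
 *	aio_fsync(O_DSYNC, &cb);	// data sync: the MNT_DWAIT path above
 */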
/*
 * is_already_queued - runs through our queues to see if the given
 * aiocbp / process is there.  Returns TRUE if there is a match
 * on any of our aio queues.
 *
 * Called with proc aio lock held (can be held spin)
 */
static boolean_t
is_already_queued(proc_t procp,
		  user_addr_t aiocbp)
{
	aio_workq_entry		*entryp;
	boolean_t		result;

	result = FALSE;

	/* look for matches on our queue of async IO requests that have completed */
	TAILQ_FOREACH( entryp, &procp->p_aio_doneq, aio_proc_link ) {
		if ( aiocbp == entryp->uaiocbp ) {
			result = TRUE;
			goto ExitThisRoutine;
		}
	}

	/* look for matches on our queue of active async IO requests */
	TAILQ_FOREACH( entryp, &procp->p_aio_activeq, aio_proc_link ) {
		if ( aiocbp == entryp->uaiocbp ) {
			result = TRUE;
			goto ExitThisRoutine;
		}
	}

ExitThisRoutine:
	return( result );

} /* is_already_queued */
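/*
 * Illustrative sketch (an assumption about the submission paths earlier in
 * this file, including the error value shown): callers consult
 * is_already_queued() under the proc's aio lock so the same user aiocb is
 * never in flight twice, along the lines of:
 *
 *	aio_proc_lock(procp);
 *	if (is_already_queued(procp, aiocbp) == TRUE) {
 *		aio_proc_unlock(procp);
 *		return EAGAIN;		// error value is an assumption here
 *	}
 *	// ... allocate an aio_workq_entry and queue it ...
 */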
static void
free_lio_context(aio_lio_context *context)
{
#if DEBUG
	OSDecrementAtomic(&lio_contexts_alloced);
#endif /* DEBUG */

	FREE( context, M_TEMP );

} /* free_lio_context */
/*
 * aio initialization
 */
__private_extern__
void
aio_init( void )
{
	int			i;

	aio_lock_grp_attr = lck_grp_attr_alloc_init();
	aio_proc_lock_grp = lck_grp_alloc_init("aio_proc", aio_lock_grp_attr);
	aio_entry_lock_grp = lck_grp_alloc_init("aio_entry", aio_lock_grp_attr);
	aio_queue_lock_grp = lck_grp_alloc_init("aio_queue", aio_lock_grp_attr);
	aio_lock_attr = lck_attr_alloc_init();

	lck_mtx_init(&aio_entry_mtx, aio_entry_lock_grp, aio_lock_attr);
	lck_mtx_init(&aio_proc_mtx, aio_proc_lock_grp, aio_lock_attr);

	aio_anchor.aio_inflight_count = 0;
	aio_anchor.aio_done_count = 0;
	aio_anchor.aio_total_count = 0;
	aio_anchor.aio_num_workqs = AIO_NUM_WORK_QUEUES;

	for (i = 0; i < AIO_NUM_WORK_QUEUES; i++) {
		aio_workq_init(&aio_anchor.aio_async_workqs[i]);
	}

	i = sizeof( aio_workq_entry );
	aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );

	_aio_create_worker_threads( aio_worker_threads );

} /* aio_init */
/*
 * aio worker threads created here.
 */
__private_extern__
void
_aio_create_worker_threads( int num )
{
	int			i;

	/* create some worker threads to handle the async IO requests */
	for ( i = 0; i < num; i++ ) {
		thread_t	myThread;

		if ( KERN_SUCCESS != kernel_thread_start((thread_continue_t)aio_work_thread, NULL, &myThread) ) {
			printf( "%s - failed to create a work thread \n", __FUNCTION__ );
		}
		else
			thread_deallocate(myThread);
	}

} /* _aio_create_worker_threads */
/*
 * Return the current activation utask
 */
task_t
get_aiotask(void)
{
	return ((struct uthread *)get_bsdthread_info(current_thread()))->uu_aio_task;
}
/*
 * In the case of an aiocb from a
 * 32-bit process we need to expand some longs and pointers to the correct
 * sizes in order to let downstream code always work on the same type of
 * aiocb (in our case that is a user_aiocb).
 */
static void
do_munge_aiocb_user32_to_user( struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
{
	the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
	the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
	the_user_aiocbp->aio_buf = CAST_USER_ADDR_T(my_aiocbp->aio_buf);
	the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
	the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
	the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;

	/* special case here.  since we do not know if sigev_value is an */
	/* int or a ptr we do NOT cast the ptr to a user_addr_t.  This */
	/* means if we send this info back to user space we need to remember */
	/* sigev_value was not expanded for the 32-bit case. */
	/* NOTE - this does NOT affect us since we don't support sigev_value */
	/* yet in the aio context. */
	the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
	the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
	the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
		my_aiocbp->aio_sigevent.sigev_value.sival_int;
	the_user_aiocbp->aio_sigevent.sigev_notify_function =
		CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function);
	the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
		CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_attributes);

} /* do_munge_aiocb_user32_to_user */
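/*
 * Illustrative sketch (the field widths are assumptions about a typical
 * LP64 kernel handling a 32-bit process): the expansion matters mainly for
 * the pointer-valued fields, which are 32-bit values in user32_aiocb but
 * user_addr_t (64-bit) in user_aiocb:
 *
 *	struct user32_aiocb	cb32;	// aio_buf is a 32-bit value
 *	struct user_aiocb	cb;	// aio_buf is a 64-bit user_addr_t
 *
 *	do_munge_aiocb_user32_to_user(&cb32, &cb);
 *	assert(cb.aio_buf == CAST_USER_ADDR_T(cb32.aio_buf));
 *
 * After munging, downstream code only ever looks at user_aiocb fields.
 */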
/* Similar for 64-bit user process, so that we don't need to satisfy
 * the alignment constraints of the original user64_aiocb.
 */
static void
do_munge_aiocb_user64_to_user( struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
{
	the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
	the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
	the_user_aiocbp->aio_buf = my_aiocbp->aio_buf;
	the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
	the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
	the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;

	the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
	the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
	the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
		my_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int;
	the_user_aiocbp->aio_sigevent.sigev_notify_function =
		my_aiocbp->aio_sigevent.sigev_notify_function;
	the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
		my_aiocbp->aio_sigevent.sigev_notify_attributes;