/*
 * Copyright (c) 2003-2014 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * todo:
 * 1) ramesh is looking into how to replace taking a reference on
 *    the user's map (vm_map_reference()) since it is believed that
 *    would not hold the process for us.
 * 2) david is looking into a way for us to set the priority of the
 *    worker threads to match that of the user's thread when the
 *    async IO was queued.
 */

/*
 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
 */
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file_internal.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/vnode_internal.h>
#include <sys/malloc.h>
#include <sys/mount_internal.h>
#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/sysctl.h>
#include <sys/unistd.h>

#include <sys/aio_kern.h>
#include <sys/sysproto.h>

#include <machine/limits.h>

#include <mach/mach_types.h>
#include <kern/kern_types.h>
#include <kern/zalloc.h>
#include <kern/task.h>
#include <kern/sched_prim.h>

#include <vm/vm_map.h>

#include <libkern/OSAtomic.h>

#include <sys/kdebug.h>
#define AIO_work_queued			1
#define AIO_worker_wake			2
#define AIO_completion_sig		3
#define AIO_completion_cleanup_wait	4
#define AIO_completion_cleanup_wake	5
#define AIO_completion_suspend_wake	6
#define AIO_fsync_delay			7
#define AIO_cancel_async_workq		11
#define AIO_cancel_sync_workq		12
#define AIO_cancel_activeq		13
#define AIO_cancel_doneq		14
#define AIO_error_val			61
#define AIO_error_activeq		62
#define AIO_error_workq			63
#define AIO_return_val			71
#define AIO_return_activeq		72
#define AIO_return_workq		73
#define AIO_exit_sleep			91
#define AIO_close			100
#define AIO_close_sleep			101
#define AIO_suspend			110
#define AIO_suspend_sleep		111
#define AIO_worker_thread		120

#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
/*
 * aio requests queue up on the aio_async_workq or lio_sync_workq (for
 * lio_listio LIO_WAIT).  Requests then move to the per process aio_activeq
 * (proc.aio_activeq) when one of our worker threads starts the IO.
 * And finally, requests move to the per process aio_doneq (proc.aio_doneq)
 * when the IO request completes.  The request remains on aio_doneq until
 * the user process calls aio_return or the process exits, either way that is
 * our trigger to release aio resources.
 */
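/*
 * Summary of the request lifecycle described above (editorial sketch; the
 * queue and function names are the ones used in this file):
 *
 *	aio_async_workq / lio_sync_workq	queued, not yet picked up
 *		|
 *		v  (worker thread dequeues it, see aio_get_some_work())
 *	proc.p_aio_activeq			IO in flight
 *		|
 *		v  (do_aio_completion())
 *	proc.p_aio_doneq			waiting for aio_return() / exit
 */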
typedef struct aio_workq {
	TAILQ_HEAD(, aio_workq_entry)	aioq_entries;
	int				aioq_count;
	lck_mtx_t			aioq_mtx;
	wait_queue_t			aioq_waitq;
} *aio_workq_t;

#define AIO_NUM_WORK_QUEUES 1
struct aio_anchor_cb
{
	volatile int32_t	aio_inflight_count;	/* entries that have been taken from a workq */
	volatile int32_t	aio_done_count;		/* entries on all done queues (proc.aio_doneq) */
	volatile int32_t	aio_total_count;	/* total extant entries */

	/* Hash table of queues here */
	struct aio_workq	aio_async_workqs[AIO_NUM_WORK_QUEUES];
};
typedef struct aio_anchor_cb aio_anchor_cb;

struct aio_lio_context
{
	int		io_waiter;
	int		io_issued;
	int		io_completed;
};
typedef struct aio_lio_context aio_lio_context;
/*
 * Notes on aio sleep / wake channels.
 * We currently pick a couple of fields within the proc structure that give us
 * sleep channels that do not collide with any other kernel routines.
 * At this time, for binary compatibility reasons, we cannot create new proc fields.
 */
#define AIO_SUSPEND_SLEEP_CHAN	p_aio_active_count
#define AIO_CLEANUP_SLEEP_CHAN	p_aio_total_count
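/*
 * Editorial note (not from the original source): these macros name proc
 * fields only so that their *addresses* can serve as unique wait channels;
 * the field values are never interpreted by the sleep/wakeup machinery.
 * A typical pairing in this file looks like:
 *
 *	msleep(&p->AIO_SUSPEND_SLEEP_CHAN, aio_proc_mutex(p), PCATCH | PWAIT, "aio_suspend", 0);
 *	...
 *	wakeup(&p->AIO_SUSPEND_SLEEP_CHAN);	// from the completion path
 */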
#define ASSERT_AIO_FROM_PROC(aiop, theproc)	\
	if ((aiop)->procp != (theproc)) {	\
		panic("AIO on a proc list that does not belong to that proc.\n"); \
	}

static void		aio_proc_lock(proc_t procp);
static void		aio_proc_lock_spin(proc_t procp);
static void		aio_proc_unlock(proc_t procp);
static lck_mtx_t*	aio_proc_mutex(proc_t procp);
static void		aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp);
static void		aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp);
static int		aio_get_process_count(proc_t procp);
static int		aio_active_requests_for_process(proc_t procp);
static int		aio_proc_active_requests_for_file(proc_t procp, int fd);
static boolean_t	is_already_queued(proc_t procp, user_addr_t aiocbp);
static boolean_t	should_cancel(aio_workq_entry *entryp, user_addr_t aiocbp, int fd);

static void		aio_entry_lock(aio_workq_entry *entryp);
static void		aio_entry_lock_spin(aio_workq_entry *entryp);
static aio_workq_t	aio_entry_workq(aio_workq_entry *entryp);
static lck_mtx_t*	aio_entry_mutex(__unused aio_workq_entry *entryp);
static void		aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
static void		aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
static void		aio_entry_ref_locked(aio_workq_entry *entryp);
static void		aio_entry_unref_locked(aio_workq_entry *entryp);
static void		aio_entry_ref(aio_workq_entry *entryp);
static void		aio_entry_unref(aio_workq_entry *entryp);
static void		aio_entry_update_for_cancel(aio_workq_entry *entryp, boolean_t cancelled,
				int wait_for_completion, boolean_t disable_notification);
static int		aio_entry_try_workq_remove(aio_workq_entry *entryp);
static boolean_t	aio_delay_fsync_request( aio_workq_entry *entryp );
static int		aio_free_request(aio_workq_entry *entryp);

static void		aio_workq_init(aio_workq_t wq);
static void		aio_workq_lock_spin(aio_workq_t wq);
static void		aio_workq_unlock(aio_workq_t wq);
static lck_mtx_t*	aio_workq_mutex(aio_workq_t wq);

static void		aio_work_thread( void );
static aio_workq_entry	*aio_get_some_work( void );

static int		aio_get_all_queues_count( void );
static int		aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO );
static int		aio_validate( aio_workq_entry *entryp );
static int		aio_increment_total_count(void);
static int		aio_decrement_total_count(void);

static int		do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp, int wait_for_completion, boolean_t disable_notification);
static void		do_aio_completion( aio_workq_entry *entryp );
static int		do_aio_fsync( aio_workq_entry *entryp );
static int		do_aio_read( aio_workq_entry *entryp );
static int		do_aio_write( aio_workq_entry *entryp );
static void		do_munge_aiocb_user32_to_user( struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
static void		do_munge_aiocb_user64_to_user( struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
static int		lio_create_entry(proc_t procp,
					 user_addr_t aiocbp,
					 void *group_tag,
					 aio_workq_entry **entrypp );
static aio_workq_entry	*aio_create_queue_entry(proc_t procp,
					user_addr_t aiocbp,
					void *group_tag,
					int kindOfIO);
static user_addr_t	*aio_copy_in_list(proc_t procp, user_addr_t aiocblist, int nent);
static void		free_lio_context(aio_lio_context* context);
static void		aio_enqueue_work( proc_t procp, aio_workq_entry *entryp, int proc_locked);

#define ASSERT_AIO_PROC_LOCK_OWNED(p)	lck_mtx_assert(aio_proc_mutex((p)), LCK_MTX_ASSERT_OWNED)
#define ASSERT_AIO_WORKQ_LOCK_OWNED(q)	lck_mtx_assert(aio_workq_mutex((q)), LCK_MTX_ASSERT_OWNED)
#define ASSERT_AIO_ENTRY_LOCK_OWNED(e)	lck_mtx_assert(aio_entry_mutex((e)), LCK_MTX_ASSERT_OWNED)
/*
 * EXTERNAL PROTOTYPES
 */

/* in ...bsd/kern/sys_generic.c */
extern int dofileread(vfs_context_t ctx, struct fileproc *fp,
			user_addr_t bufp, user_size_t nbyte,
			off_t offset, int flags, user_ssize_t *retval );
extern int dofilewrite(vfs_context_t ctx, struct fileproc *fp,
			user_addr_t bufp, user_size_t nbyte, off_t offset,
			int flags, user_ssize_t *retval );

static uint32_t lio_contexts_alloced = 0;

/*
 * aio external global variables.
 */
extern int aio_max_requests;			/* AIO_MAX - configurable */
extern int aio_max_requests_per_process;	/* AIO_PROCESS_MAX - configurable */
extern int aio_worker_threads;			/* AIO_THREAD_COUNT - configurable */


/*
 * aio static variables.
 */
static aio_anchor_cb	aio_anchor;
static lck_grp_t	*aio_proc_lock_grp;
static lck_grp_t	*aio_entry_lock_grp;
static lck_grp_t	*aio_queue_lock_grp;
static lck_attr_t	*aio_lock_attr;
static lck_grp_attr_t	*aio_lock_grp_attr;
static struct zone	*aio_workq_zonep;
static lck_mtx_t	aio_entry_mtx;
static lck_mtx_t	aio_proc_mtx;
static void
aio_entry_lock(__unused aio_workq_entry *entryp)
{
	lck_mtx_lock(&aio_entry_mtx);
}

static void
aio_entry_lock_spin(__unused aio_workq_entry *entryp)
{
	lck_mtx_lock_spin(&aio_entry_mtx);
}

void
aio_entry_unlock(__unused aio_workq_entry *entryp)
{
	lck_mtx_unlock(&aio_entry_mtx);
}

static aio_workq_t
aio_entry_workq(__unused aio_workq_entry *entryp)
{
	return &aio_anchor.aio_async_workqs[0];
}

static lck_mtx_t*
aio_entry_mutex(__unused aio_workq_entry *entryp)
{
	return &aio_entry_mtx;
}

static void
aio_workq_init(aio_workq_t wq)
{
	TAILQ_INIT(&wq->aioq_entries);
	lck_mtx_init(&wq->aioq_mtx, aio_queue_lock_grp, aio_lock_attr);
	wq->aioq_waitq = wait_queue_alloc(SYNC_POLICY_FIFO);
}


/*
 * Can be passed a queue which is locked spin.
 */
static void
aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
{
	ASSERT_AIO_WORKQ_LOCK_OWNED(queue);

	if (entryp->aio_workq_link.tqe_prev == NULL) {
		panic("Trying to remove an entry from a work queue, but it is not on a queue\n");
	}

	TAILQ_REMOVE(&queue->aioq_entries, entryp, aio_workq_link);
	entryp->aio_workq_link.tqe_prev = NULL; /* Not on a workq */

	if (queue->aioq_count < 0) {
		panic("Negative count on a queue.\n");
	}
}

static void
aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
{
	ASSERT_AIO_WORKQ_LOCK_OWNED(queue);

	TAILQ_INSERT_TAIL(&queue->aioq_entries, entryp, aio_workq_link);
	if (queue->aioq_count < 0) {
		panic("Negative count on a queue.\n");
	}
}
static void
aio_proc_lock(proc_t procp)
{
	lck_mtx_lock(aio_proc_mutex(procp));
}

static void
aio_proc_lock_spin(proc_t procp)
{
	lck_mtx_lock_spin(aio_proc_mutex(procp));
}

static void
aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp)
{
	ASSERT_AIO_PROC_LOCK_OWNED(procp);

	TAILQ_REMOVE(&procp->p_aio_activeq, entryp, aio_proc_link );
	TAILQ_INSERT_TAIL( &procp->p_aio_doneq, entryp, aio_proc_link);
	procp->p_aio_active_count--;
	OSIncrementAtomic(&aio_anchor.aio_done_count);
}

static void
aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp)
{
	TAILQ_REMOVE(&procp->p_aio_doneq, entryp, aio_proc_link);
	OSDecrementAtomic(&aio_anchor.aio_done_count);
	aio_decrement_total_count();
	procp->p_aio_total_count--;
}

static void
aio_proc_unlock(proc_t procp)
{
	lck_mtx_unlock(aio_proc_mutex(procp));
}

static lck_mtx_t*
aio_proc_mutex(proc_t procp)
{
	return &procp->p_mlock;
}
static void
aio_entry_ref_locked(aio_workq_entry *entryp)
{
	ASSERT_AIO_ENTRY_LOCK_OWNED(entryp);

	if (entryp->aio_refcount < 0) {
		panic("AIO workq entry with a negative refcount.\n");
	}
	entryp->aio_refcount++;
}


/* Return 1 if you've freed it */
static void
aio_entry_unref_locked(aio_workq_entry *entryp)
{
	ASSERT_AIO_ENTRY_LOCK_OWNED(entryp);

	entryp->aio_refcount--;
	if (entryp->aio_refcount < 0) {
		panic("AIO workq entry with a negative refcount.\n");
	}
}

static void
aio_entry_ref(aio_workq_entry *entryp)
{
	aio_entry_lock_spin(entryp);
	aio_entry_ref_locked(entryp);
	aio_entry_unlock(entryp);
}

static void
aio_entry_unref(aio_workq_entry *entryp)
{
	aio_entry_lock_spin(entryp);
	aio_entry_unref_locked(entryp);

	if ((entryp->aio_refcount == 0) && ((entryp->flags & AIO_DO_FREE) != 0)) {
		aio_entry_unlock(entryp);
		aio_free_request(entryp);
	} else {
		aio_entry_unlock(entryp);
	}

	return;
}

static void
aio_entry_update_for_cancel(aio_workq_entry *entryp, boolean_t cancelled,
		int wait_for_completion, boolean_t disable_notification)
{
	aio_entry_lock_spin(entryp);

	if (cancelled) {
		aio_entry_ref_locked(entryp);
		entryp->errorval = ECANCELED;
		entryp->returnval = -1;
	}

	if ( wait_for_completion ) {
		entryp->flags |= wait_for_completion; /* flag for special completion processing */
	}

	if ( disable_notification ) {
		entryp->flags |= AIO_DISABLE; /* Don't want a signal */
	}

	aio_entry_unlock(entryp);
}
static int
aio_entry_try_workq_remove(aio_workq_entry *entryp)
{
	/* Can only be cancelled if it's still on a work queue */
	if (entryp->aio_workq_link.tqe_prev != NULL) {
		aio_workq_t queue;

		/* Will have to check again under the lock */
		queue = aio_entry_workq(entryp);
		aio_workq_lock_spin(queue);
		if (entryp->aio_workq_link.tqe_prev != NULL) {
			aio_workq_remove_entry_locked(queue, entryp);
			aio_workq_unlock(queue);
			return 1;
		} else {
			aio_workq_unlock(queue);
		}
	}

	return 0;
}

static void
aio_workq_lock_spin(aio_workq_t wq)
{
	lck_mtx_lock_spin(aio_workq_mutex(wq));
}

static void
aio_workq_unlock(aio_workq_t wq)
{
	lck_mtx_unlock(aio_workq_mutex(wq));
}

static lck_mtx_t*
aio_workq_mutex(aio_workq_t wq)
{
	return &wq->aioq_mtx;
}
/*
 * aio_cancel - attempt to cancel one or more async IO requests currently
 * outstanding against file descriptor uap->fd.  If uap->aiocbp is not
 * NULL then only one specific IO is cancelled (if possible).  If uap->aiocbp
 * is NULL then all outstanding async IO requests for the given file
 * descriptor are cancelled (if possible).
 */
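/*
 * A minimal user-space sketch of how this path is typically driven
 * (editorial illustration, not part of this file; "cb" is a previously
 * submitted struct aiocb on descriptor fd):
 *
 *	int r = aio_cancel(fd, &cb);
 *	if (r == AIO_CANCELED)    { ... pulled off the work queue, never ran ... }
 *	if (r == AIO_NOTCANCELED) { ... already in flight; poll aio_error(&cb) ... }
 *	if (r == AIO_ALLDONE)     { ... already complete; reap with aio_return(&cb) ... }
 */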
int
aio_cancel(proc_t p, struct aio_cancel_args *uap, int *retval )
{
	struct user_aiocb	my_aiocb;
	int			result;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START,
		      (int)p, (int)uap->aiocbp, 0, 0, 0 );

	/* quick check to see if there are any async IO requests queued up */
	if (aio_get_all_queues_count() < 1) {
		*retval = AIO_ALLDONE;
		/* ... */
	}

	if ( uap->aiocbp != USER_ADDR_NULL ) {
		if ( proc_is64bit(p) ) {
			struct user64_aiocb aiocb64;

			result = copyin( uap->aiocbp, &aiocb64, sizeof(aiocb64) );
			/* ... */
			do_munge_aiocb_user64_to_user(&aiocb64, &my_aiocb);
		} else {
			struct user32_aiocb aiocb32;

			result = copyin( uap->aiocbp, &aiocb32, sizeof(aiocb32) );
			/* ... */
			do_munge_aiocb_user32_to_user( &aiocb32, &my_aiocb );
		}

		/* NOTE - POSIX standard says a mismatch between the file */
		/* descriptor passed in and the file descriptor embedded in */
		/* the aiocb causes unspecified results.  We return EBADF in */
		/* that situation. */
		if ( uap->fd != my_aiocb.aio_fildes ) {
			/* ... */
		}
	}

	/* ... */
	result = do_aio_cancel_locked( p, uap->fd, uap->aiocbp, 0, FALSE );
	ASSERT_AIO_PROC_LOCK_OWNED(p);
	/* ... */

	if ( result != -1 ) {
		/* ... */
	}

	/* ... */

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END,
		      (int)p, (int)uap->aiocbp, result, 0, 0 );

	/* ... */
}
/*
 * _aio_close - internal function used to clean up async IO requests for
 * a file descriptor that is closing.
 */
__private_extern__ void
_aio_close(proc_t p, int fd )
{
	int		error;

	/* quick check to see if there are any async IO requests queued up */
	if (aio_get_all_queues_count() < 1) {
		return;
	}

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_START,
		      (int)p, fd, 0, 0, 0 );

	/* cancel all async IO requests on our todo queues for this file descriptor */
	aio_proc_lock(p);
	error = do_aio_cancel_locked( p, fd, 0, AIO_CLOSE_WAIT, FALSE );
	ASSERT_AIO_PROC_LOCK_OWNED(p);
	if ( error == AIO_NOTCANCELED ) {
		/*
		 * AIO_NOTCANCELED is returned when we find an aio request for this process
		 * and file descriptor on the active async IO queue.  Active requests cannot
		 * be cancelled so we must wait for them to complete.  We will get a special
		 * wake up call on our channel used to sleep for ALL active requests to
		 * complete.  This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
		 * when we must wait for all active aio requests.
		 */

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) | DBG_FUNC_NONE,
			      (int)p, fd, 0, 0, 0 );

		while (aio_proc_active_requests_for_file(p, fd) > 0) {
			msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_close", 0 );
		}
	}

	aio_proc_unlock(p);

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_END,
		      (int)p, fd, 0, 0, 0 );

	return;
}
/*
 * aio_error - return the error status associated with the async IO
 * request referred to by uap->aiocbp.  The error status is the errno
 * value that would be set by the corresponding IO request (read, write,
 * fdatasync, or sync).
 */
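/*
 * Editorial sketch (not from this file): user space typically polls this
 * after submission until the request leaves the EINPROGRESS state:
 *
 *	int err;
 *	while ((err = aio_error(&cb)) == EINPROGRESS)
 *		usleep(1000);			// or block in aio_suspend()
 *	ssize_t n = aio_return(&cb);		// reap exactly once
 */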
int
aio_error(proc_t p, struct aio_error_args *uap, int *retval )
{
	aio_workq_entry	*entryp;
	int		error;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_START,
		      (int)p, (int)uap->aiocbp, 0, 0, 0 );

	/* see if there are any aios to check */
	if (aio_get_all_queues_count() < 1) {
		/* ... */
	}

	/* ... */

	/* look for a match on our queue of async IO requests that have completed */
	TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
		if ( entryp->uaiocbp == uap->aiocbp ) {
			ASSERT_AIO_FROM_PROC(entryp, p);

			aio_entry_lock_spin(entryp);
			*retval = entryp->errorval;
			/* ... */
			aio_entry_unlock(entryp);
			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) | DBG_FUNC_NONE,
				      (int)p, (int)uap->aiocbp, *retval, 0, 0 );
			/* ... */
		}
	}

	/* look for a match on our queue of active async IO requests */
	TAILQ_FOREACH( entryp, &p->p_aio_activeq, aio_proc_link) {
		if ( entryp->uaiocbp == uap->aiocbp ) {
			ASSERT_AIO_FROM_PROC(entryp, p);
			*retval = EINPROGRESS;
			/* ... */
			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) | DBG_FUNC_NONE,
				      (int)p, (int)uap->aiocbp, *retval, 0, 0 );
			/* ... */
		}
	}

	/* ... */

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_END,
		      (int)p, (int)uap->aiocbp, error, 0, 0 );

	/* ... */
}
/*
 * aio_fsync - asynchronously force all IO operations associated
 * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
 * queued at the time of the call to the synchronized completion state.
 * NOTE - we do not support op O_DSYNC at this point since we do not support the
 * fdatasync() call.
 */
int
aio_fsync(proc_t p, struct aio_fsync_args *uap, int *retval )
{
	int		error;
	int		fsync_kind;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_START,
		      (int)p, (int)uap->aiocbp, uap->op, 0, 0 );

	/* 0 := O_SYNC for binary backward compatibility with Panther */
	if (uap->op == O_SYNC || uap->op == 0)
		fsync_kind = AIO_FSYNC;
	else if ( uap->op == O_DSYNC )
		fsync_kind = AIO_DSYNC;
	else {
		/* ... */
	}

	error = aio_queue_async_request( p, uap->aiocbp, fsync_kind );
	/* ... */

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_END,
		      (int)p, (int)uap->aiocbp, error, 0, 0 );

	return( error );
}
/*
 * aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
 * file descriptor (uap->aiocbp->aio_fildes) into the buffer
 * (uap->aiocbp->aio_buf).
 */
int
aio_read(proc_t p, struct aio_read_args *uap, int *retval )
{
	int		error;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_START,
		      (int)p, (int)uap->aiocbp, 0, 0, 0 );

	error = aio_queue_async_request( p, uap->aiocbp, AIO_READ );
	/* ... */

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_END,
		      (int)p, (int)uap->aiocbp, error, 0, 0 );

	return( error );
}
/*
 * aio_return - return the return status associated with the async IO
 * request referred to by uap->aiocbp.  The return status is the value
 * that would be returned by the corresponding IO request (read, write,
 * fdatasync, or sync).  This is where we release kernel resources
 * held for the async IO call associated with the given aiocb pointer.
 */
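/*
 * Editorial note (illustrative, not from this file): aio_return() is the
 * reaping step; for a completed aiocb it both reports the byte count and
 * frees the kernel entry, so it is valid exactly once per request.
 * Typical user-space ordering:
 *
 *	if (aio_error(&cb) != EINPROGRESS) {
 *		ssize_t n = aio_return(&cb);	// second call for &cb is an error
 *	}
 */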
int
aio_return(proc_t p, struct aio_return_args *uap, user_ssize_t *retval )
{
	aio_workq_entry	*entryp;
	int		error;
	boolean_t	proc_lock_held = FALSE;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_START,
		      (int)p, (int)uap->aiocbp, 0, 0, 0 );

	/* See if there are any entries to check */
	if (aio_get_all_queues_count() < 1) {
		/* ... */
	}

	aio_proc_lock(p);
	proc_lock_held = TRUE;

	/* look for a match on our queue of async IO requests that have completed */
	TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
		ASSERT_AIO_FROM_PROC(entryp, p);
		if ( entryp->uaiocbp == uap->aiocbp ) {
			/* Done and valid for aio_return(), pull it off the list */
			aio_proc_remove_done_locked(p, entryp);

			/* Drop the proc lock, but keep the entry locked */
			aio_entry_lock(entryp);
			aio_proc_unlock(p);
			proc_lock_held = FALSE;

			*retval = entryp->returnval;
			error = 0;

			/* No references and off all lists, safe to free */
			if (entryp->aio_refcount == 0) {
				aio_entry_unlock(entryp);
				aio_free_request(entryp);
			} else {
				/* Whoever has the refcount will have to free it */
				entryp->flags |= AIO_DO_FREE;
				aio_entry_unlock(entryp);
			}

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) | DBG_FUNC_NONE,
				      (int)p, (int)uap->aiocbp, *retval, 0, 0 );
			goto ExitRoutine;
		}
	}

	/* look for a match on our queue of active async IO requests */
	TAILQ_FOREACH( entryp, &p->p_aio_activeq, aio_proc_link) {
		ASSERT_AIO_FROM_PROC(entryp, p);
		if ( entryp->uaiocbp == uap->aiocbp ) {
			error = EINPROGRESS;
			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) | DBG_FUNC_NONE,
				      (int)p, (int)uap->aiocbp, *retval, 0, 0 );
			goto ExitRoutine;
		}
	}

	/* ... */

ExitRoutine:
	if (proc_lock_held)
		aio_proc_unlock(p);

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_END,
		      (int)p, (int)uap->aiocbp, error, 0, 0 );

	return( error );
}
/*
 * _aio_exec - internal function used to clean up async IO requests for
 * a process that is going away due to exec().  We cancel any async IOs
 * we can and wait for those already active.  We also disable signaling
 * for cancelled or active aio requests that complete.
 * This routine MAY block!
 */
__private_extern__ void
_aio_exec(proc_t p )
{
	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_START,
		      (int)p, 0, 0, 0, 0 );

	/* ... */

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_END,
		      (int)p, 0, 0, 0, 0 );

	return;
}
/*
 * _aio_exit - internal function used to clean up async IO requests for
 * a process that is terminating (via exit() or exec()).  We cancel any async IOs
 * we can and wait for those already active.  We also disable signaling
 * for cancelled or active aio requests that complete.  This routine MAY block!
 */
__private_extern__ void
_aio_exit(proc_t p )
{
	int			error;
	aio_workq_entry		*entryp;

	/* quick check to see if there are any async IO requests queued up */
	if (aio_get_all_queues_count() < 1) {
		return;
	}

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_START,
		      (int)p, 0, 0, 0, 0 );

	aio_proc_lock(p);

	/*
	 * cancel async IO requests on the todo work queue and wait for those
	 * already active to complete.
	 */
	error = do_aio_cancel_locked( p, 0, 0, AIO_EXIT_WAIT, TRUE );
	ASSERT_AIO_PROC_LOCK_OWNED(p);
	if ( error == AIO_NOTCANCELED ) {
		/*
		 * AIO_NOTCANCELED is returned when we find an aio request for this process
		 * on the active async IO queue.  Active requests cannot be cancelled so we
		 * must wait for them to complete.  We will get a special wake up call on
		 * our channel used to sleep for ALL active requests to complete.  This sleep
		 * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
		 * active aio requests.
		 */

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) | DBG_FUNC_NONE,
			      (int)p, 0, 0, 0, 0 );

		while (p->p_aio_active_count != 0) {
			msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_exit", 0 );
		}
	}

	if (p->p_aio_active_count != 0) {
		panic("Exiting process has %d active AIOs after cancellation has completed.\n", p->p_aio_active_count);
	}

	/* release all aio resources used by this process */
	entryp = TAILQ_FIRST( &p->p_aio_doneq );
	while ( entryp != NULL ) {
		ASSERT_AIO_FROM_PROC(entryp, p);
		aio_workq_entry		*next_entryp;

		next_entryp = TAILQ_NEXT( entryp, aio_proc_link);
		aio_proc_remove_done_locked(p, entryp);

		/* we cannot free requests that are still completing */
		aio_entry_lock_spin(entryp);
		if (entryp->aio_refcount == 0) {
			aio_proc_unlock(p);
			aio_entry_unlock(entryp);
			aio_free_request(entryp);

			/* need to start over since aio_doneq may have been */
			/* changed while we were away.  */
			aio_proc_lock(p);
			entryp = TAILQ_FIRST( &p->p_aio_doneq );
			continue;
		} else {
			/* whoever has the reference will have to do the free */
			entryp->flags |= AIO_DO_FREE;
		}

		aio_entry_unlock(entryp);
		entryp = next_entryp;
	}

	aio_proc_unlock(p);

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END,
		      (int)p, 0, 0, 0, 0 );

	return;
}
static boolean_t
should_cancel(aio_workq_entry *entryp, user_addr_t aiocbp, int fd)
{
	if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
			(aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
			(aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
		return TRUE;
	}

	return FALSE;
}
/*
 * do_aio_cancel_locked - cancel async IO requests (if possible).  We get called by
 * aio_cancel, close, and at exit.
 * There are three modes of operation: 1) cancel all async IOs for a process -
 * fd is 0 and aiocbp is NULL 2) cancel all async IOs for a file descriptor - fd
 * is > 0 and aiocbp is NULL 3) cancel one async IO associated with the given
 * aiocbp.
 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
 * target async IO requests, and AIO_ALLDONE if all target async IO requests
 * were already complete.
 * WARNING - do not dereference aiocbp in this routine, it may point to user
 * land data that has not been copied in (when called from aio_cancel())
 *
 * Called with proc locked, and returns the same way.
 */
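/*
 * Quick reference for the three modes described above (editorial summary of
 * how the callers in this file invoke it):
 *
 *	caller			fd	aiocbp		meaning
 *	_aio_exit/_aio_exec	0	NULL		cancel everything for the proc
 *	_aio_close		fd	NULL		cancel everything on that fd
 *	aio_cancel		fd	user aiocbp	cancel one specific request
 */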
static int
do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp,
	int wait_for_completion, boolean_t disable_notification )
{
	ASSERT_AIO_PROC_LOCK_OWNED(p);

	aio_workq_entry	*entryp;
	int		result;

	result = -1;

	/* look for a match on our queue of async todo work. */
	entryp = TAILQ_FIRST(&p->p_aio_activeq);
	while ( entryp != NULL ) {
		ASSERT_AIO_FROM_PROC(entryp, p);
		aio_workq_entry	*next_entryp;

		next_entryp = TAILQ_NEXT( entryp, aio_proc_link);
		if (!should_cancel(entryp, aiocbp, fd)) {
			entryp = next_entryp;
			continue;
		}

		/* Can only be cancelled if it's still on a work queue */
		if (aio_entry_try_workq_remove(entryp) != 0) {
			/* Have removed from workq. Update entry state and take a ref */
			aio_entry_update_for_cancel(entryp, TRUE, 0, disable_notification);

			/* Put on the proc done queue and update counts, then unlock the proc */
			aio_proc_move_done_locked(p, entryp);
			aio_proc_unlock(p);

			/* Now it's officially cancelled.  Do the completion */
			result = AIO_CANCELED;
			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) | DBG_FUNC_NONE,
				      (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
			do_aio_completion(entryp);

			/* This will free if the aio_return() has already happened ... */
			aio_entry_unref(entryp);
			aio_proc_lock(p);

			if ( aiocbp != USER_ADDR_NULL ) {
				return( result );
			}

			/*
			 * Restart from the head of the proc active queue since it
			 * may have been changed while we were away doing completion
			 * processing.
			 *
			 * Note that if we found an uncancellable AIO before, we will
			 * either find it again or discover that it's been completed,
			 * so resetting the result will not cause us to return success
			 * despite outstanding AIOs.
			 */
			entryp = TAILQ_FIRST(&p->p_aio_activeq);
			result = -1; /* As if beginning anew */
		} else {
			/*
			 * It's been taken off the active queue already, i.e. is in flight.
			 * All we can do is ask for notification.
			 */
			result = AIO_NOTCANCELED;

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE,
				      (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

			/* Mark for waiting and such; will not take a ref if "cancelled" arg is FALSE */
			aio_entry_update_for_cancel(entryp, FALSE, wait_for_completion, disable_notification);

			if ( aiocbp != USER_ADDR_NULL ) {
				return( result );
			}
			entryp = next_entryp;
		}
	}

	/*
	 * if we didn't find any matches on the todo or active queues then look for a
	 * match on our queue of async IO requests that have completed and if found
	 * return AIO_ALLDONE result.
	 *
	 * Proc AIO lock is still held.
	 */
	if ( result == -1 ) {
		TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
			ASSERT_AIO_FROM_PROC(entryp, p);
			if (should_cancel(entryp, aiocbp, fd)) {
				result = AIO_ALLDONE;
				KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE,
					      (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

				if ( aiocbp != USER_ADDR_NULL ) {
					return( result );
				}
			}
		}
	}

	return( result );

} /* do_aio_cancel_locked */
/*
 * aio_suspend - suspend the calling thread until at least one of the async
 * IO operations referenced by uap->aiocblist has completed, until a signal
 * interrupts the function, or uap->timeoutp time interval (optional) has
 * elapsed.
 * Returns 0 if one or more async IOs have completed else -1 and errno is
 * set appropriately - EAGAIN if timeout elapses or EINTR if an interrupt
 * occurs.
 */
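/*
 * Editorial user-space sketch (not from this file): the typical caller
 * builds a list of outstanding aiocb pointers and blocks with an optional
 * timeout:
 *
 *	const struct aiocb *list[2] = { &cb_a, &cb_b };
 *	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 *	if (aio_suspend(list, 2, &ts) == 0) {
 *		// at least one of cb_a / cb_b is no longer EINPROGRESS
 *	}
 */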
int
aio_suspend(proc_t p, struct aio_suspend_args *uap, int *retval )
{
	__pthread_testcancel(1);
	return(aio_suspend_nocancel(p, (struct aio_suspend_nocancel_args *)uap, retval));
}


int
aio_suspend_nocancel(proc_t p, struct aio_suspend_nocancel_args *uap, int *retval )
{
	int			error;
	int			i, count;
	uint64_t		abstime;
	struct user_timespec	ts;
	aio_workq_entry		*entryp;
	user_addr_t		*aiocbpp;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START,
		      (int)p, uap->nent, 0, 0, 0 );

	*retval = -1;
	abstime = 0;
	aiocbpp = NULL;

	count = aio_get_all_queues_count( );
	if ( count < 1 ) {
		error = EINVAL;
		goto ExitThisRoutine;
	}

	if ( uap->nent < 1 || uap->nent > aio_max_requests_per_process ) {
		error = EINVAL;
		goto ExitThisRoutine;
	}

	if ( uap->timeoutp != USER_ADDR_NULL ) {
		if ( proc_is64bit(p) ) {
			struct user64_timespec temp;
			error = copyin( uap->timeoutp, &temp, sizeof(temp) );
			if ( error == 0 ) {
				ts.tv_sec = temp.tv_sec;
				ts.tv_nsec = temp.tv_nsec;
			}
		}
		else {
			struct user32_timespec temp;
			error = copyin( uap->timeoutp, &temp, sizeof(temp) );
			if ( error == 0 ) {
				ts.tv_sec = temp.tv_sec;
				ts.tv_nsec = temp.tv_nsec;
			}
		}
		if ( error != 0 ) {
			error = EAGAIN;
			goto ExitThisRoutine;
		}

		if ( ts.tv_sec < 0 || ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000 ) {
			error = EINVAL;
			goto ExitThisRoutine;
		}

		nanoseconds_to_absolutetime( (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
					     &abstime );
		clock_absolutetime_interval_to_deadline( abstime, &abstime );
	}

	aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent);
	if ( aiocbpp == NULL ) {
		error = EAGAIN;
		goto ExitThisRoutine;
	}

	/* check list of aio requests to see if any have completed */
check_for_our_aiocbp:
	aio_proc_lock_spin(p);
	for ( i = 0; i < uap->nent; i++ ) {
		user_addr_t	aiocbp;

		/* NULL elements are legal so check for 'em */
		aiocbp = *(aiocbpp + i);
		if ( aiocbp == USER_ADDR_NULL )
			continue;

		/* return immediately if any aio request in the list is done */
		TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
			ASSERT_AIO_FROM_PROC(entryp, p);
			if ( entryp->uaiocbp == aiocbp ) {
				aio_proc_unlock(p);
				*retval = 0;
				error = 0;
				goto ExitThisRoutine;
			}
		}
	} /* for ( ; i < uap->nent; ) */

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE,
		      (int)p, uap->nent, 0, 0, 0 );

	/*
	 * wait for an async IO to complete or a signal fires or timeout expires.
	 * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
	 * interrupts us.  If an async IO completes before a signal fires or our
	 * timeout expires, we get a wakeup call from aio_work_thread().
	 */

	error = msleep1(&p->AIO_SUSPEND_SLEEP_CHAN, aio_proc_mutex(p), PCATCH | PWAIT | PDROP, "aio_suspend", abstime); /* XXX better priority? */
	if ( error == 0 ) {
		/*
		 * got our wakeup call from aio_work_thread().
		 * Since we can get a wakeup on this channel from another thread in the
		 * same process we head back up to make sure this is for the correct aiocbp.
		 * If it is the correct aiocbp we will return from where we do the check
		 * (see entryp->uaiocbp == aiocbp after check_for_our_aiocbp label)
		 * else we will fall out and just sleep again.
		 */
		goto check_for_our_aiocbp;
	}
	else if ( error == EWOULDBLOCK ) {
		/* our timeout expired */
		error = EAGAIN;
	}
	else {
		/* we were interrupted */
		error = EINTR;
	}

ExitThisRoutine:
	if ( aiocbpp != NULL )
		FREE( aiocbpp, M_TEMP );

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_END,
		      (int)p, uap->nent, error, 0, 0 );

	return( error );
}
/*
 * aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
 * file descriptor (uap->aiocbp->aio_fildes) from the buffer
 * (uap->aiocbp->aio_buf).
 */
int
aio_write(proc_t p, struct aio_write_args *uap, int *retval )
{
	int		error;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_START,
		      (int)p, (int)uap->aiocbp, 0, 0, 0 );

	error = aio_queue_async_request( p, uap->aiocbp, AIO_WRITE );
	/* ... */

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_END,
		      (int)p, (int)uap->aiocbp, error, 0, 0 );

	return( error );
}
static user_addr_t *
aio_copy_in_list(proc_t procp, user_addr_t aiocblist, int nent)
{
	user_addr_t	*aiocbpp;
	int		i, result;

	/* we reserve enough space for largest possible pointer size */
	MALLOC( aiocbpp, user_addr_t *, (nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK );
	if ( aiocbpp == NULL )
		goto err;

	/* copyin our aiocb pointers from list */
	result = copyin( aiocblist, aiocbpp,
			proc_is64bit(procp) ? (nent * sizeof(user64_addr_t))
					    : (nent * sizeof(user32_addr_t)) );
	if ( result ) {
		FREE( aiocbpp, M_TEMP );
		aiocbpp = NULL;
		goto err;
	}

	/*
	 * We depend on a list of user_addr_t's so we need to
	 * munge and expand when these pointers came from a
	 * 32-bit process.
	 */
	if ( !proc_is64bit(procp) ) {
		/* copy from last to first to deal with overlap */
		user32_addr_t	*my_ptrp = ((user32_addr_t *)aiocbpp) + (nent - 1);
		user_addr_t	*my_addrp = aiocbpp + (nent - 1);

		for (i = 0; i < nent; i++, my_ptrp--, my_addrp--) {
			*my_addrp = (user_addr_t) (*my_ptrp);
		}
	}

err:
	return (aiocbpp);
}
static int
aio_copy_in_sigev(proc_t procp, user_addr_t sigp, struct user_sigevent *sigev)
{
	int	result = 0;

	if (sigp == USER_ADDR_NULL)
		goto out;

	/*
	 * We need to munge aio_sigevent since it contains pointers.
	 * Since we do not know if sigev_value is an int or a ptr we do
	 * NOT cast the ptr to a user_addr_t.   This means if we send
	 * this info back to user space we need to remember sigev_value
	 * was not expanded for the 32-bit case.
	 *
	 * Notes:	This does NOT affect us since we don't support
	 *		sigev_value yet in the aio context.
	 */
	if ( proc_is64bit(procp) ) {
		struct user64_sigevent sigevent64;

		result = copyin( sigp, &sigevent64, sizeof(sigevent64) );
		if ( result == 0 ) {
			sigev->sigev_notify = sigevent64.sigev_notify;
			sigev->sigev_signo = sigevent64.sigev_signo;
			sigev->sigev_value.size_equivalent.sival_int = sigevent64.sigev_value.size_equivalent.sival_int;
			sigev->sigev_notify_function = sigevent64.sigev_notify_function;
			sigev->sigev_notify_attributes = sigevent64.sigev_notify_attributes;
		}
	} else {
		struct user32_sigevent sigevent32;

		result = copyin( sigp, &sigevent32, sizeof(sigevent32) );
		if ( result == 0 ) {
			sigev->sigev_notify = sigevent32.sigev_notify;
			sigev->sigev_signo = sigevent32.sigev_signo;
			sigev->sigev_value.size_equivalent.sival_int = sigevent32.sigev_value.sival_int;
			sigev->sigev_notify_function = CAST_USER_ADDR_T(sigevent32.sigev_notify_function);
			sigev->sigev_notify_attributes = CAST_USER_ADDR_T(sigevent32.sigev_notify_attributes);
		}
	}

	if ( result != 0 ) {
		result = EAGAIN;
	}

out:
	return (result);
}
/*
 * Queue up the entry on the aio asynchronous work queue in priority order
 * based on the relative priority of the request.  We calculate the relative
 * priority using the nice value of the caller and the value of the request's
 * aio_reqprio.
 *
 * Parameters:	procp			Process queueing the I/O
 *		entryp			The work queue entry being queued
 *
 * Returns:	(void)			No failure modes
 *
 * Notes:	This function is used for both lio_listio and aio
 *
 * XXX:		At some point, we may have to consider thread priority
 *		rather than process priority, but we don't maintain the
 *		adjusted priority for threads the POSIX way.
 *
 * Called with proc locked.
 */
static void
aio_enqueue_work( proc_t procp, aio_workq_entry *entryp, int proc_locked)
{
	aio_workq_entry	*my_entryp;	/* used for insertion sort */
	aio_workq_t queue = aio_entry_workq(entryp);

	if (proc_locked == 0) {
		aio_proc_lock(procp);
	}

	ASSERT_AIO_PROC_LOCK_OWNED(procp);

	/* Onto proc queue */
	TAILQ_INSERT_TAIL(&procp->p_aio_activeq, entryp, aio_proc_link);
	procp->p_aio_active_count++;
	procp->p_aio_total_count++;

	/* And work queue */
	aio_workq_lock_spin(queue);
	aio_workq_add_entry_locked(queue, entryp);
	wait_queue_wakeup_one(queue->aioq_waitq, queue, THREAD_AWAKENED, -1);
	aio_workq_unlock(queue);

	if (proc_locked == 0) {
		aio_proc_unlock(procp);
	}
	/*
	 * (1)	The nice value is in the range PRIO_MIN..PRIO_MAX [-20..20]
	 * (2)	The normalized nice value is in the range 0..((2 * NZERO) - 1)
	 *	which is [0..39], with 0 not being used.  In nice values, the
	 *	lower the nice value, the higher the priority.
	 * (3)	The normalized scheduling priority is the highest nice value
	 *	minus the current nice value.  In I/O scheduling priority, the
	 *	higher the value the lower the priority, so it is the inverse
	 *	of the nice value (the higher the number, the higher the I/O
	 *	priority).
	 * (4)	From the normalized scheduling priority, we subtract the
	 *	request priority to get the request priority value number;
	 *	this means that requests are only capable of depressing their
	 *	priority relative to other requests.
	 */
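	/*
	 * Editorial worked example of the formula below (illustrative numbers
	 * only; NZERO is 20 on this platform):
	 *
	 *	if procp->p_nice == 20:   priority = ((2 * 20) - 1) - 20 = 19
	 *	with aio_reqprio == 5:    priority = 19 - 5 = 14
	 *	with aio_reqprio == 25:   19 - 25 < 0, so it is clamped to 0
	 */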
	entryp->priority = (((2 * NZERO) - 1) - procp->p_nice);

	/* only permit depressing the priority */
	if (entryp->aiocb.aio_reqprio < 0)
		entryp->aiocb.aio_reqprio = 0;
	if (entryp->aiocb.aio_reqprio > 0) {
		entryp->priority -= entryp->aiocb.aio_reqprio;
		if (entryp->priority < 0)
			entryp->priority = 0;
	}

	/* Insertion sort the entry; lowest ->priority to highest */
	TAILQ_FOREACH(my_entryp, &aio_anchor.aio_async_workq, aio_workq_link) {
		if ( entryp->priority <= my_entryp->priority) {
			TAILQ_INSERT_BEFORE(my_entryp, entryp, aio_workq_link);
			break;
		}
	}
	if (my_entryp == NULL)
		TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
}
/*
 * lio_listio - initiate a list of IO requests.  We process the list of
 * aiocbs either synchronously (mode == LIO_WAIT) or asynchronously
 * (mode == LIO_NOWAIT).
 *
 * The caller gets error and return status for each aiocb in the list
 * via aio_error and aio_return.  We must keep completed requests until
 * released by the aio_return call.
 */
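/*
 * Editorial user-space sketch (not from this file): a batch of two reads
 * submitted and waited on synchronously:
 *
 *	struct aiocb *batch[2] = { &cb_a, &cb_b };	// each with aio_lio_opcode = LIO_READ
 *	if (lio_listio(LIO_WAIT, batch, 2, NULL) == 0) {
 *		ssize_t n_a = aio_return(&cb_a);	// collect each result
 *		ssize_t n_b = aio_return(&cb_b);
 *	}
 */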
int
lio_listio(proc_t p, struct lio_listio_args *uap, int *retval )
{
	int			i;
	int			call_result;
	int			result;
	int			old_count;
	aio_workq_entry		**entryp_listp;
	user_addr_t		*aiocbpp;
	struct user_sigevent	aiosigev;
	aio_lio_context		*lio_context;
	boolean_t		free_context = FALSE;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START,
		      (int)p, uap->nent, uap->mode, 0, 0 );

	entryp_listp = NULL;
	lio_context = NULL;
	aiocbpp = NULL;
	call_result = -1;
	*retval = -1;
	if ( !(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT) ) {
		call_result = EINVAL;
		goto ExitRoutine;
	}

	if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
		call_result = EINVAL;
		goto ExitRoutine;
	}

	/*
	 * allocate a list of aio_workq_entry pointers that we will use
	 * to queue up all our requests at once while holding our lock.
	 */
	MALLOC( entryp_listp, void *, (uap->nent * sizeof(aio_workq_entry *)), M_TEMP, M_WAITOK );
	if ( entryp_listp == NULL ) {
		call_result = EAGAIN;
		goto ExitRoutine;
	}

	MALLOC( lio_context, aio_lio_context*, sizeof(aio_lio_context), M_TEMP, M_WAITOK );
	if ( lio_context == NULL ) {
		call_result = EAGAIN;
		goto ExitRoutine;
	}

	OSIncrementAtomic(&lio_contexts_alloced);

	bzero(lio_context, sizeof(aio_lio_context));

	aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent);
	if ( aiocbpp == NULL ) {
		call_result = EAGAIN;
		goto ExitRoutine;
	}

	/*
	 * Use sigevent passed in to lio_listio for each of our calls, but
	 * only do completion notification after the last request completes.
	 */
	bzero(&aiosigev, sizeof(aiosigev));
	/* Only copy in an sigev if the user supplied one */
	if (uap->sigp != USER_ADDR_NULL) {
		call_result = aio_copy_in_sigev(p, uap->sigp, &aiosigev);
		if ( call_result )
			goto ExitRoutine;
	}

	/* process list of aio requests */
	lio_context->io_issued = uap->nent;
	lio_context->io_waiter = uap->mode == LIO_WAIT ? 1 : 0; /* Should it be freed by last AIO */
	for ( i = 0; i < uap->nent; i++ ) {
		user_addr_t	my_aiocbp;
		aio_workq_entry	*entryp;

		*(entryp_listp + i) = NULL;
		my_aiocbp = *(aiocbpp + i);

		/* NULL elements are legal so check for 'em */
		if ( my_aiocbp == USER_ADDR_NULL ) {
			aio_proc_lock_spin(p);
			lio_context->io_issued--;
			aio_proc_unlock(p);
			continue;
		}

		/*
		 * We use lio_context to mark IO requests for delayed completion
		 * processing which means we wait until all IO requests in the
		 * group have completed before we either return to the caller
		 * when mode is LIO_WAIT or signal user when mode is LIO_NOWAIT.
		 *
		 * We use the address of the lio_context for this, since it is
		 * unique in the address space.
		 */
		result = lio_create_entry( p, my_aiocbp, lio_context, (entryp_listp + i) );
		if ( result != 0 && call_result == -1 )
			call_result = result;

		/* NULL elements are legal so check for 'em */
		entryp = *(entryp_listp + i);
		if ( entryp == NULL ) {
			aio_proc_lock_spin(p);
			lio_context->io_issued--;
			aio_proc_unlock(p);
			continue;
		}

		if ( uap->mode == LIO_NOWAIT ) {
			/* Set signal hander, if any */
			entryp->aiocb.aio_sigevent = aiosigev;
		} else {
			/* flag that this thread blocks pending completion */
			entryp->flags |= AIO_LIO_NOTIFY;
		}

		/* check our aio limits to throttle bad or rude user land behavior */
		old_count = aio_increment_total_count();

		aio_proc_lock_spin(p);
		if ( old_count >= aio_max_requests ||
			 aio_get_process_count( entryp->procp ) >= aio_max_requests_per_process ||
			 is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {

			lio_context->io_issued--;
			aio_proc_unlock(p);

			aio_decrement_total_count();

			if ( call_result == -1 )
				call_result = EAGAIN;
			aio_free_request(entryp);
			entryp_listp[i] = NULL;
			continue;
		}

		lck_mtx_convert_spin(aio_proc_mutex(p));
		aio_enqueue_work(p, entryp, 1);
		aio_proc_unlock(p);

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
			      (int)p, (int)entryp->uaiocbp, 0, 0, 0 );
	}

	switch(uap->mode) {
	case LIO_WAIT:
		aio_proc_lock_spin(p);
		while (lio_context->io_completed < lio_context->io_issued) {
			result = msleep(lio_context, aio_proc_mutex(p), PCATCH | PRIBIO | PSPIN, "lio_listio", 0);

			/* If we were interrupted, fail out (even if all finished) */
			if (result != 0) {
				call_result = EINTR;
				lio_context->io_waiter = 0;
				break;
			}
		}

		/* If all IOs have finished must free it */
		if (lio_context->io_completed == lio_context->io_issued) {
			free_context = TRUE;
		}

		aio_proc_unlock(p);
		break;

	case LIO_NOWAIT:
		break;
	}

	/* call_result == -1 means we had no trouble queueing up requests */
	if ( call_result == -1 ) {
		call_result = 0;
		*retval = 0;
	}

ExitRoutine:
	if ( entryp_listp != NULL )
		FREE( entryp_listp, M_TEMP );
	if ( aiocbpp != NULL )
		FREE( aiocbpp, M_TEMP );
	if ((lio_context != NULL) && ((lio_context->io_issued == 0) || (free_context == TRUE))) {
		free_lio_context(lio_context);
	}

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END,
		      (int)p, call_result, 0, 0, 0 );

	return( call_result );

} /* lio_listio */
/*
 * aio worker thread.  this is where all the real work gets done.
 * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
 * after new work is queued up.
 */
static void
aio_work_thread( void )
{
	aio_workq_entry		*entryp;
	int			error;
	vm_map_t		currentmap;
	vm_map_t		oldmap = VM_MAP_NULL;
	task_t			oldaiotask = TASK_NULL;
	struct uthread		*uthreadp = NULL;

	for ( ;; ) {
		/*
		 * returns with the entry ref'ed.
		 * sleeps until work is available.
		 */
		entryp = aio_get_some_work();

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START,
			      (int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0 );

		/*
		 * Assume the target's address space identity for the duration
		 * of the IO.  Note: don't need to have the entryp locked,
		 * because the proc and map don't change until it's freed.
		 */
		currentmap = get_task_map( (current_proc())->task );
		if ( currentmap != entryp->aio_map ) {
			uthreadp = (struct uthread *) get_bsdthread_info(current_thread());
			oldaiotask = uthreadp->uu_aio_task;
			uthreadp->uu_aio_task = entryp->procp->task;
			oldmap = vm_map_switch( entryp->aio_map );
		}

		if ( (entryp->flags & AIO_READ) != 0 ) {
			error = do_aio_read( entryp );
		}
		else if ( (entryp->flags & AIO_WRITE) != 0 ) {
			error = do_aio_write( entryp );
		}
		else if ( (entryp->flags & (AIO_FSYNC | AIO_DSYNC)) != 0 ) {
			error = do_aio_fsync( entryp );
		}
		else {
			printf( "%s - unknown aio request - flags 0x%02X \n",
				__FUNCTION__, entryp->flags );
			error = EINVAL;
		}

		/* Restore old map */
		if ( currentmap != entryp->aio_map ) {
			(void) vm_map_switch( oldmap );
			uthreadp->uu_aio_task = oldaiotask;
		}

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END,
			      (int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval,
			      entryp->returnval, 0 );

		aio_entry_lock_spin(entryp);
		entryp->errorval = error;
		aio_entry_unlock(entryp);

		/* we're done with the IO request so pop it off the active queue and */
		/* push it on the done queue */
		aio_proc_lock(entryp->procp);
		aio_proc_move_done_locked(entryp->procp, entryp);
		aio_proc_unlock(entryp->procp);

		OSDecrementAtomic(&aio_anchor.aio_inflight_count);

		/* remove our reference to the user land map. */
		if ( VM_MAP_NULL != entryp->aio_map ) {
			vm_map_t	my_map;

			my_map = entryp->aio_map;
			entryp->aio_map = VM_MAP_NULL;
			vm_map_deallocate( my_map );
		}

		/* Provide notifications */
		do_aio_completion( entryp );

		/* Will free if needed */
		aio_entry_unref(entryp);
	}

} /* aio_work_thread */
/*
 * aio_get_some_work - get the next async IO request that is ready to be executed.
 * aio_fsync complicates matters a bit since we cannot do the fsync until all async
 * IO requests at the time the aio_fsync call came in have completed.
 * NOTE - AIO_LOCK must be held by caller
 */
static aio_workq_entry *
aio_get_some_work( void )
{
	aio_workq_entry	*entryp = NULL;
	aio_workq_t	queue = NULL;

	/* Just one queue for the moment.  In the future there will be many. */
	queue = &aio_anchor.aio_async_workqs[0];
	aio_workq_lock_spin(queue);
	if (queue->aioq_count == 0) {
		goto nowork;
	}

	/*
	 * Hold the queue lock.
	 *
	 * pop some work off the work queue and add to our active queue
	 * Always start with the queue lock held.
	 */
	for ( ;; ) {
		/*
		 * Pull of of work queue.  Once it's off, it can't be cancelled,
		 * so we can take our ref once we drop the queue lock.
		 */
		entryp = TAILQ_FIRST(&queue->aioq_entries);

		/*
		 * If there's no work or only fsyncs that need delay, go to sleep
		 * and then start anew from aio_work_thread
		 */
		if (entryp == NULL) {
			goto nowork;
		}

		aio_workq_remove_entry_locked(queue, entryp);

		aio_workq_unlock(queue);

		/*
		 * Check if it's an fsync that must be delayed.  No need to lock the entry;
		 * that flag would have been set at initialization.
		 */
		if ( (entryp->flags & AIO_FSYNC) != 0 ) {
			/*
			 * Check for unfinished operations on the same file
			 * in this proc's queue.
			 */
			aio_proc_lock_spin(entryp->procp);
			if ( aio_delay_fsync_request( entryp ) ) {
				/* It needs to be delayed.  Put it back on the end of the work queue */
				KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) | DBG_FUNC_NONE,
					      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

				aio_proc_unlock(entryp->procp);

				aio_workq_lock_spin(queue);
				aio_workq_add_entry_locked(queue, entryp);
				continue;
			}
			aio_proc_unlock(entryp->procp);
		}

		break;
	}

	aio_entry_ref(entryp);

	OSIncrementAtomic(&aio_anchor.aio_inflight_count);
	return( entryp );

nowork:
	/* We will wake up when someone enqueues something */
	wait_queue_assert_wait(queue->aioq_waitq, queue, THREAD_UNINT, 0);
	aio_workq_unlock(queue);
	thread_block( (thread_continue_t)aio_work_thread );

	return NULL;
}


/*
 * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed.
 * A big, simple hammer: only send it off if it's the most recently filed IO which has
 * not been completed.
 */
static boolean_t
aio_delay_fsync_request( aio_workq_entry *entryp )
{
	if (entryp == TAILQ_FIRST(&entryp->procp->p_aio_activeq)) {
		return FALSE;
	}

	return TRUE;
} /* aio_delay_fsync_request */
static aio_workq_entry *
aio_create_queue_entry(proc_t procp, user_addr_t aiocbp, void *group_tag, int kindOfIO)
{
	aio_workq_entry	*entryp;
	int		result = 0;

	entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
	if ( entryp == NULL ) {
		result = EAGAIN;
		goto error_exit;
	}

	bzero( entryp, sizeof(*entryp) );

	/* fill in the rest of the aio_workq_entry */
	entryp->procp = procp;
	entryp->uaiocbp = aiocbp;
	entryp->flags |= kindOfIO;
	entryp->group_tag = group_tag;
	entryp->aio_map = VM_MAP_NULL;
	entryp->aio_refcount = 0;

	if ( proc_is64bit(procp) ) {
		struct user64_aiocb aiocb64;

		result = copyin( aiocbp, &aiocb64, sizeof(aiocb64) );
		if ( result == 0 )
			do_munge_aiocb_user64_to_user(&aiocb64, &entryp->aiocb);
	} else {
		struct user32_aiocb aiocb32;

		result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
		if ( result == 0 )
			do_munge_aiocb_user32_to_user( &aiocb32, &entryp->aiocb );
	}

	if ( result != 0 ) {
		result = EAGAIN;
		goto error_exit;
	}

	/* get a reference to the user land map in order to keep it around */
	entryp->aio_map = get_task_map( procp->task );
	vm_map_reference( entryp->aio_map );

	/* do some more validation on the aiocb and embedded file descriptor */
	result = aio_validate( entryp );
	if ( result != 0 )
		goto error_exit_with_ref;

	/* get a reference on the current_thread, which is passed in vfs_context. */
	entryp->thread = current_thread();
	thread_reference( entryp->thread );
	return ( entryp );

error_exit_with_ref:
	if ( VM_MAP_NULL != entryp->aio_map ) {
		vm_map_deallocate( entryp->aio_map );
	}
error_exit:
	if ( result && entryp != NULL ) {
		zfree( aio_workq_zonep, entryp );
		entryp = NULL;
	}

	return ( entryp );
}
/*
 * aio_queue_async_request - queue up an async IO request on our work queue then
 * wake up one of our worker threads to do the actual work.  We get a reference
 * to our caller's user land map in order to keep it around while we are
 * processing the request.
 */
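/*
 * Editorial summary of the steps below (names are the ones used in this file):
 * bump the global count, build the entry with aio_create_queue_entry(), reject
 * duplicates and per-process overruns under the proc lock, then hand the entry
 * to aio_enqueue_work(), which also wakes a worker through the work queue's
 * wait queue.  On any failure the entry is freed and the global count dropped.
 */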
static int
aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO )
{
	aio_workq_entry	*entryp;
	int		result;
	int		old_count;

	old_count = aio_increment_total_count();
	if (old_count >= aio_max_requests) {
		result = EAGAIN;
		goto error_noalloc;
	}

	entryp = aio_create_queue_entry( procp, aiocbp, 0, kindOfIO);
	if ( entryp == NULL ) {
		result = EAGAIN;
		goto error_noalloc;
	}

	aio_proc_lock_spin(procp);

	if ( is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
		result = EAGAIN;
		goto error_exit;
	}

	/* check our aio limits to throttle bad or rude user land behavior */
	if (aio_get_process_count( procp ) >= aio_max_requests_per_process) {
		printf("aio_queue_async_request(): too many in flight for proc: %d.\n", procp->p_aio_total_count);
		result = EAGAIN;
		goto error_exit;
	}

	/* Add the IO to proc and work queues, wake up threads as appropriate */
	lck_mtx_convert_spin(aio_proc_mutex(procp));
	aio_enqueue_work(procp, entryp, 1);

	aio_proc_unlock(procp);

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
		      (int)procp, (int)aiocbp, 0, 0, 0 );

	return( 0 );

error_exit:
	/*
	 * This entry has not been queued up so no worries about
	 * unlocked state and aio_map
	 */
	aio_proc_unlock(procp);
	aio_free_request(entryp);

error_noalloc:
	aio_decrement_total_count();

	return( result );

} /* aio_queue_async_request */
/*
 * Allocate an aio_workq_entry and fill it in.  If all goes well return 0
 * and pass the aio_workq_entry pointer back to our caller.
 *
 * Parameters:	procp			The process making the request
 *		aiocbp			The aio context buffer pointer
 *		group_tag		The group tag used to indicate a
 *					group of operations has completed
 *		entrypp			Pointer to the pointer to receive the
 *					address of the created aio_workq_entry
 *
 * Returns:	0			Successfully created
 *		EAGAIN			Try again (usually resource shortage)
 *
 * Notes:	We get a reference to our caller's user land map in order
 *		to keep it around while we are processing the request.
 *
 *		lio_listio calls behave differently at completion: they do
 *		completion notification when all async IO requests have
 *		completed.  We use group_tag to tag IO requests that behave
 *		in the delay notification manner.
 *
 *		All synchronous operations are considered to not have a
 *		signal routine associated with them (sigp == USER_ADDR_NULL).
 */
static int
lio_create_entry(proc_t procp, user_addr_t aiocbp, void *group_tag,
		 aio_workq_entry **entrypp )
{
	aio_workq_entry	*entryp;
	int		result;

	entryp = aio_create_queue_entry( procp, aiocbp, group_tag, AIO_LIO);
	if ( entryp == NULL ) {
		result = EAGAIN;
		goto error_exit;
	}

	/*
	 * Look for lio_listio LIO_NOP requests and ignore them; this is
	 * not really an error, but we need to free our aio_workq_entry.
	 */
	if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
		result = 0;
		goto error_exit;
	}

	*entrypp = entryp;
	return( 0 );

error_exit:

	if ( entryp != NULL ) {
		/*
		 * This entry has not been queued up so no worries about
		 * unlocked state and aio_map
		 */
		aio_free_request(entryp);
	}

	return( result );

} /* lio_create_entry */
/*
 * aio_free_request - remove our reference on the user land map and
 * free the work queue entry resources.  The entry is off all lists
 * and has zero refcount, so no one can have a pointer to it.
 */
static int
aio_free_request(aio_workq_entry *entryp)
{
	/* remove our reference to the user land map. */
	if ( VM_MAP_NULL != entryp->aio_map) {
		vm_map_deallocate(entryp->aio_map);
	}

	/* remove our reference to thread which enqueued the request */
	if ( NULL != entryp->thread ) {
		thread_deallocate( entryp->thread );
	}

	entryp->aio_refcount = -1; /* A bit of poisoning in case of bad refcounting. */

	zfree( aio_workq_zonep, entryp );

	return( 0 );

} /* aio_free_request */
/*
 * aio_validate - validate the aiocb passed in by one of the aio syscalls.
 */
static int
aio_validate( aio_workq_entry *entryp )
{
	struct fileproc	*fp;
	int		flag;
	int		result;

	result = 0;

	if ( (entryp->flags & AIO_LIO) != 0 ) {
		if ( entryp->aiocb.aio_lio_opcode == LIO_READ )
			entryp->flags |= AIO_READ;
		else if ( entryp->aiocb.aio_lio_opcode == LIO_WRITE )
			entryp->flags |= AIO_WRITE;
		else if ( entryp->aiocb.aio_lio_opcode == LIO_NOP )
			return( 0 );
		else
			return( EINVAL );
	}

	flag = FREAD;
	if ( (entryp->flags & (AIO_WRITE | AIO_FSYNC | AIO_DSYNC)) != 0 ) {
		flag = FWRITE;
	}

	if ( (entryp->flags & (AIO_READ | AIO_WRITE)) != 0 ) {
		if ( entryp->aiocb.aio_nbytes > INT_MAX	||
		     entryp->aiocb.aio_buf == USER_ADDR_NULL ||
		     entryp->aiocb.aio_offset < 0 )
			return( EINVAL );
	}

	/*
	 * validate aiocb.aio_sigevent.  at this point we only support
	 * sigev_notify equal to SIGEV_SIGNAL or SIGEV_NONE.  this means
	 * sigev_value, sigev_notify_function, and sigev_notify_attributes
	 * are ignored, since SIGEV_THREAD is unsupported.  This is consistent
	 * with no [RTS] (Realtime Signal) option group support.
	 */
	switch ( entryp->aiocb.aio_sigevent.sigev_notify ) {
	case SIGEV_SIGNAL:
	    {
		int	signum;

		/* make sure we have a valid signal number */
		signum = entryp->aiocb.aio_sigevent.sigev_signo;
		if ( signum <= 0 || signum >= NSIG ||
		     signum == SIGKILL || signum == SIGSTOP )
			return( EINVAL );
	    }
	    break;

	case SIGEV_NONE:
		break;

	case SIGEV_THREAD:
		/* Unsupported [RTS] */

	default:
		return( EINVAL );
	}

	/* validate the file descriptor and that the file was opened
	 * for the appropriate read / write access.
	 */
	proc_fdlock(entryp->procp);

	result = fp_lookup( entryp->procp, entryp->aiocb.aio_fildes, &fp, 1);
	if ( result == 0 ) {
		if ( (fp->f_fglob->fg_flag & flag) == 0 ) {
			/* we don't have read or write access */
			result = EBADF;
		}
		else if ( FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_VNODE ) {
			/* this is not a file */
			result = ESPIPE;
		} else
			fp->f_flags |= FP_AIOISSUED;

		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 1);
	}
	else {
		result = EBADF;
	}

	proc_fdunlock(entryp->procp);

	return( result );

} /* aio_validate */
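
/*
 * Illustrative user-space sketch (not compiled here): an aiocb that
 * satisfies the checks above - nbytes no larger than INT_MAX, a non-NULL
 * buffer, a non-negative offset, and sigev_notify limited to SIGEV_SIGNAL
 * or SIGEV_NONE (SIGEV_THREAD is rejected).  The descriptor and signal
 * choice are hypothetical.
 */
#if 0	/* example only - user-space code, never built as part of this file */
#include <aio.h>
#include <errno.h>
#include <limits.h>
#include <signal.h>
#include <string.h>
#include <sys/types.h>

static int
queue_read(int fd, void *buf, size_t len, off_t off, struct aiocb *cb)
{
	/* Mirror the kernel-side validation so failures surface early. */
	if (len > INT_MAX || buf == NULL || off < 0)
		return EINVAL;

	memset(cb, 0, sizeof(*cb));
	cb->aio_fildes = fd;			/* must be open for reading */
	cb->aio_buf = buf;
	cb->aio_nbytes = len;
	cb->aio_offset = off;
	cb->aio_sigevent.sigev_notify = SIGEV_SIGNAL;
	cb->aio_sigevent.sigev_signo = SIGUSR1;	/* valid; not SIGKILL/SIGSTOP */

	return (aio_read(cb) == -1) ? errno : 0;
}
#endif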
static int
aio_increment_total_count()
{
	return OSIncrementAtomic(&aio_anchor.aio_total_count);
}

static int
aio_decrement_total_count()
{
	int old = OSDecrementAtomic(&aio_anchor.aio_total_count);
	if (old <= 0) {
		panic("Negative total AIO count!\n");
	}

	return old;
}

static int
aio_get_process_count(proc_t procp )
{
	return procp->p_aio_total_count;

} /* aio_get_process_count */

static int
aio_get_all_queues_count( void )
{
	return aio_anchor.aio_total_count;

} /* aio_get_all_queues_count */
/*
 * do_aio_completion.  Handle async IO completion.
 */
static void
do_aio_completion( aio_workq_entry *entryp )
{
	boolean_t	lastLioCompleted = FALSE;
	aio_lio_context	*lio_context = NULL;
	int		waiter = 0;

	lio_context = (aio_lio_context *)entryp->group_tag;

	if (lio_context != NULL) {

		aio_proc_lock_spin(entryp->procp);

		/* Account for this I/O completing. */
		lio_context->io_completed++;

		/* Are we done with this lio context? */
		if (lio_context->io_issued == lio_context->io_completed) {
			lastLioCompleted = TRUE;
		}

		waiter = lio_context->io_waiter;

		/* explicit wakeup of lio_listio() waiting in LIO_WAIT */
		if ((entryp->flags & AIO_LIO_NOTIFY) && (lastLioCompleted) && (waiter != 0)) {
			/* wake up the waiter */
			wakeup(lio_context);
		}

		aio_proc_unlock(entryp->procp);
	}

	if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
	     (entryp->flags & AIO_DISABLE) == 0 ) {

		boolean_t	performSignal = FALSE;

		if (lio_context == NULL) {
			performSignal = TRUE;
		}
		else {
			/*
			 * If this was the last request in the group and a signal
			 * is desired, send one.
			 */
			performSignal = lastLioCompleted;
		}

		if (performSignal) {

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) | DBG_FUNC_NONE,
				      (int)entryp->procp, (int)entryp->uaiocbp,
				      entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 );

			psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );
		}
	}

	if ((entryp->flags & AIO_EXIT_WAIT) && (entryp->flags & AIO_CLOSE_WAIT)) {
		panic("Close and exit flags set at the same time\n");
	}

	/*
	 * need to handle case where a process is trying to exit, exec, or
	 * close and is currently waiting for active aio requests to complete.
	 * If AIO_CLEANUP_WAIT is set then we need to look to see if there are any
	 * other requests in the active queue for this process.  If there are
	 * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel.
	 * If there are some still active then do nothing - we only want to
	 * wakeup when all active aio requests for the process are complete.
	 *
	 * Don't need to lock the entry or proc to check the cleanup flag.  It can only be
	 * set for cancellation, while the entryp is still on a proc list; now it's
	 * off, so that flag is already set if it's going to be.
	 */
	if ( (entryp->flags & AIO_EXIT_WAIT) != 0 ) {
		int		active_requests;

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
			      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

		aio_proc_lock_spin(entryp->procp);
		active_requests = aio_active_requests_for_process( entryp->procp );
		if ( active_requests < 1 ) {
			/*
			 * no active aio requests for this process, continue exiting.  In this
			 * case, there should be no one else waiting on the proc in AIO...
			 */
			wakeup_one((caddr_t)&entryp->procp->AIO_CLEANUP_SLEEP_CHAN);
			aio_proc_unlock(entryp->procp);

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
				      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
		} else {
			aio_proc_unlock(entryp->procp);
		}
	}

	if ( (entryp->flags & AIO_CLOSE_WAIT) != 0 ) {
		int		active_requests;

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
			      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

		aio_proc_lock_spin(entryp->procp);
		active_requests = aio_proc_active_requests_for_file( entryp->procp, entryp->aiocb.aio_fildes);
		if ( active_requests < 1 ) {
			/* Can't wakeup_one(); multiple closes might be in progress. */
			wakeup(&entryp->procp->AIO_CLEANUP_SLEEP_CHAN);
			aio_proc_unlock(entryp->procp);

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
				      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
		} else {
			aio_proc_unlock(entryp->procp);
		}
	}

	/*
	 * A thread in aio_suspend() wants to know about completed IOs.  If it checked
	 * the done list before we moved our AIO there, then it already asserted its wait,
	 * and we can wake it up without holding the lock.  If it checked the list after
	 * we did our move, then it already has seen the AIO that we moved.  Hence, we
	 * can do our wakeup without holding the lock.
	 */
	wakeup( (caddr_t) &entryp->procp->AIO_SUSPEND_SLEEP_CHAN );
	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE,
		      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

	/*
	 * free the LIO context if the last lio completed and no thread is
	 * waiting on it.
	 */
	if (lastLioCompleted && (waiter == 0))
		free_lio_context(lio_context);

} /* do_aio_completion */
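
/*
 * Illustrative user-space sketch (not compiled here): the caller-side view
 * of the wakeups above.  A thread parked in aio_suspend() is released once
 * the request reaches the done queue, then collects status with
 * aio_error() and reaps the request with aio_return().
 */
#if 0	/* example only - user-space code, never built as part of this file */
#include <aio.h>
#include <errno.h>
#include <stddef.h>
#include <sys/types.h>

static ssize_t
wait_for_aio(struct aiocb *cb)
{
	const struct aiocb *list[1] = { cb };
	int err;

	while ((err = aio_error(cb)) == EINPROGRESS) {
		if (aio_suspend(list, 1, NULL) == -1 && errno != EINTR)
			return -1;
	}
	if (err != 0) {
		errno = err;
		return -1;
	}
	return aio_return(cb);	/* reaps the request; the kernel entry is freed */
}
#endif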
/*
 * do_aio_read
 */
static int
do_aio_read( aio_workq_entry *entryp )
{
	struct fileproc		*fp;
	int			error;
	struct vfs_context	context;

	if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp, 0)) )
		return(error);
	if ( (fp->f_fglob->fg_flag & FREAD) == 0 ) {
		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
		return(EBADF);
	}

	context.vc_thread = entryp->thread;	/* XXX */
	context.vc_ucred = fp->f_fglob->fg_cred;

	error = dofileread(&context, fp,
			   entryp->aiocb.aio_buf,
			   entryp->aiocb.aio_nbytes,
			   entryp->aiocb.aio_offset, FOF_OFFSET,
			   &entryp->returnval);
	fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);

	return( error );

} /* do_aio_read */
/*
 * do_aio_write
 */
static int
do_aio_write( aio_workq_entry *entryp )
{
	struct fileproc		*fp;
	int			error;
	int			flags;
	struct vfs_context	context;

	if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp, 0)) )
		return(error);
	if ( (fp->f_fglob->fg_flag & FWRITE) == 0 ) {
		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
		return(EBADF);
	}

	flags = FOF_PCRED;
	if ( (fp->f_fglob->fg_flag & O_APPEND) == 0 ) {
		flags |= FOF_OFFSET;
	}

	context.vc_thread = entryp->thread;	/* XXX */
	context.vc_ucred = fp->f_fglob->fg_cred;

	/* NB: tell dofilewrite the offset, and to use the proc cred */
	error = dofilewrite(&context,
			    fp,
			    entryp->aiocb.aio_buf,
			    entryp->aiocb.aio_nbytes,
			    entryp->aiocb.aio_offset,
			    flags,
			    &entryp->returnval);

	if (entryp->returnval)
		fp_drop_written(entryp->procp, entryp->aiocb.aio_fildes, fp);
	else
		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);

	return( error );

} /* do_aio_write */
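
/*
 * Illustrative user-space sketch (not compiled here): the offset behavior
 * implemented above.  When the descriptor was opened with O_APPEND the
 * kernel drops FOF_OFFSET, so aio_offset is ignored and the data is
 * appended; otherwise the write is positioned at aio_offset.  The path
 * and sizes are hypothetical.
 */
#if 0	/* example only - user-space code, never built as part of this file */
#include <aio.h>
#include <fcntl.h>
#include <string.h>
#include <sys/types.h>

static int
queue_append(const char *path, void *buf, size_t len, struct aiocb *cb)
{
	int fd = open(path, O_WRONLY | O_APPEND);

	if (fd == -1)
		return -1;

	memset(cb, 0, sizeof(*cb));
	cb->aio_fildes = fd;
	cb->aio_buf = buf;
	cb->aio_nbytes = len;
	cb->aio_offset = 0;	/* ignored here because of O_APPEND */

	return aio_write(cb);
}
#endif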
/*
 * aio_active_requests_for_process - return number of active async IO
 * requests for the given process.
 */
static int
aio_active_requests_for_process(proc_t procp )
{
	return( procp->p_aio_active_count );

} /* aio_active_requests_for_process */

/*
 * Called with the proc locked.
 */
static int
aio_proc_active_requests_for_file(proc_t procp, int fd)
{
	int			count = 0;
	aio_workq_entry		*entryp;

	TAILQ_FOREACH(entryp, &procp->p_aio_activeq, aio_proc_link) {
		if (entryp->aiocb.aio_fildes == fd) {
			count++;
		}
	}

	return count;
} /* aio_proc_active_requests_for_file */
/*
 * do_aio_fsync
 */
static int
do_aio_fsync( aio_workq_entry *entryp )
{
	struct vfs_context	context;
	struct vnode		*vp;
	struct fileproc		*fp;
	int			sync_flag;
	int			error;

	/*
	 * We are never called unless either AIO_FSYNC or AIO_DSYNC are set.
	 *
	 * If AIO_DSYNC is set, we can tell the lower layers that it is OK
	 * to mark for update the metadata not strictly necessary for data
	 * retrieval, rather than forcing it to disk.
	 *
	 * If AIO_FSYNC is set, we have to also wait until metadata not
	 * strictly necessary for data retrieval is committed to stable
	 * storage (e.g. atime, mtime, ctime, etc.).
	 *
	 * Metadata necessary for data retrieval must be committed to stable
	 * storage in either case (file length, etc.).
	 */
	if (entryp->flags & AIO_FSYNC)
		sync_flag = MNT_WAIT;
	else
		sync_flag = MNT_DWAIT;

	error = fp_getfvp( entryp->procp, entryp->aiocb.aio_fildes, &fp, &vp);
	if ( error == 0 ) {
		if ( (error = vnode_getwithref(vp)) ) {
			fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
			entryp->returnval = -1;
			return(error);
		}
		context.vc_thread = current_thread();
		context.vc_ucred = fp->f_fglob->fg_cred;

		error = VNOP_FSYNC( vp, sync_flag, &context);

		(void)vnode_put(vp);

		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
	}
	if ( error != 0 )
		entryp->returnval = -1;

	return( error );

} /* do_aio_fsync */
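
/*
 * Illustrative user-space sketch (not compiled here): the two sync levels
 * handled above.  aio_fsync(O_SYNC, ...) corresponds to the MNT_WAIT path
 * (all metadata, e.g. timestamps, pushed to stable storage), while
 * aio_fsync(O_DSYNC, ...) corresponds to MNT_DWAIT (only metadata needed
 * to retrieve the data).  O_DSYNC availability is platform-dependent.
 */
#if 0	/* example only - user-space code, never built as part of this file */
#include <aio.h>
#include <fcntl.h>
#include <string.h>

static int
queue_fsync(int fd, int want_full_sync, struct aiocb *cb)
{
	memset(cb, 0, sizeof(*cb));
	cb->aio_fildes = fd;

#ifdef O_DSYNC
	return aio_fsync(want_full_sync ? O_SYNC : O_DSYNC, cb);
#else
	return aio_fsync(O_SYNC, cb);
#endif
}
#endif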
/*
 * is_already_queued - runs through our queues to see if the given
 * aiocbp / process is there.  Returns TRUE if there is a match
 * on any of our aio queues.
 *
 * Called with proc aio lock held (can be held spin)
 */
static boolean_t
is_already_queued(proc_t procp,
		  user_addr_t aiocbp )
{
	aio_workq_entry		*entryp;
	boolean_t		result;

	result = FALSE;

	/* look for matches on our queue of async IO requests that have completed */
	TAILQ_FOREACH( entryp, &procp->p_aio_doneq, aio_proc_link ) {
		if ( aiocbp == entryp->uaiocbp ) {
			result = TRUE;
			goto ExitThisRoutine;
		}
	}

	/* look for matches on our queue of active async IO requests */
	TAILQ_FOREACH( entryp, &procp->p_aio_activeq, aio_proc_link ) {
		if ( aiocbp == entryp->uaiocbp ) {
			result = TRUE;
			goto ExitThisRoutine;
		}
	}

ExitThisRoutine:
	return( result );

} /* is_already_queued */
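
/*
 * Illustrative user-space sketch (not compiled here): the practical
 * consequence of the duplicate check above.  An aiocb that is still on
 * the active or done queue cannot be submitted again; it has to be reaped
 * with aio_return() first.  The resubmission helper shown is hypothetical.
 */
#if 0	/* example only - user-space code, never built as part of this file */
#include <aio.h>
#include <errno.h>

static int
resubmit_read(struct aiocb *cb)
{
	if (aio_error(cb) == EINPROGRESS)
		return EAGAIN;		/* still active; do not resubmit */

	(void)aio_return(cb);		/* reap the completed request first */

	return (aio_read(cb) == -1) ? errno : 0;
}
#endif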
static void
free_lio_context(aio_lio_context *context)
{
#if DEBUG
	OSDecrementAtomic(&lio_contexts_alloced);
#endif /* DEBUG */

	FREE( context, M_TEMP );

} /* free_lio_context */
/*
 * aio initialization
 */
__private_extern__ void
aio_init( void )
{
	int	i;

	aio_lock_grp_attr = lck_grp_attr_alloc_init();
	aio_proc_lock_grp = lck_grp_alloc_init("aio_proc", aio_lock_grp_attr);
	aio_entry_lock_grp = lck_grp_alloc_init("aio_entry", aio_lock_grp_attr);
	aio_queue_lock_grp = lck_grp_alloc_init("aio_queue", aio_lock_grp_attr);
	aio_lock_attr = lck_attr_alloc_init();

	lck_mtx_init(&aio_entry_mtx, aio_entry_lock_grp, aio_lock_attr);
	lck_mtx_init(&aio_proc_mtx, aio_proc_lock_grp, aio_lock_attr);

	aio_anchor.aio_inflight_count = 0;
	aio_anchor.aio_done_count = 0;
	aio_anchor.aio_total_count = 0;
	aio_anchor.aio_num_workqs = AIO_NUM_WORK_QUEUES;

	for (i = 0; i < AIO_NUM_WORK_QUEUES; i++) {
		aio_workq_init(&aio_anchor.aio_async_workqs[i]);
	}

	i = sizeof( aio_workq_entry );
	aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );

	_aio_create_worker_threads( aio_worker_threads );

} /* aio_init */
/*
 * aio worker threads created here.
 */
__private_extern__ void
_aio_create_worker_threads( int num )
{
	int	i;

	/* create some worker threads to handle the async IO requests */
	for ( i = 0; i < num; i++ ) {
		thread_t	myThread;

		if ( KERN_SUCCESS != kernel_thread_start((thread_continue_t)aio_work_thread, NULL, &myThread) ) {
			printf( "%s - failed to create a work thread \n", __FUNCTION__ );
		}
		else
			thread_deallocate(myThread);
	}

	return;

} /* _aio_create_worker_threads */
/*
 * Return the current activation utask
 */
task_t
get_aiotask(void)
{
	return ((struct uthread *)get_bsdthread_info(current_thread()))->uu_aio_task;
}
/*
 * In the case of an aiocb from a
 * 32-bit process we need to expand some longs and pointers to the correct
 * sizes in order to let downstream code always work on the same type of
 * aiocb (in our case that is a user_aiocb)
 */
static void
do_munge_aiocb_user32_to_user( struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
{
	the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
	the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
	the_user_aiocbp->aio_buf = CAST_USER_ADDR_T(my_aiocbp->aio_buf);
	the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
	the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
	the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;

	/* special case here.  since we do not know if sigev_value is an */
	/* int or a ptr we do NOT cast the ptr to a user_addr_t.  This */
	/* means if we send this info back to user space we need to remember */
	/* sigev_value was not expanded for the 32-bit case. */
	/* NOTE - this does NOT affect us since we don't support sigev_value */
	/* yet in the aio context. */
	the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
	the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
	the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
		my_aiocbp->aio_sigevent.sigev_value.sival_int;
	the_user_aiocbp->aio_sigevent.sigev_notify_function =
		CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function);
	the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
		CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_attributes);
}
/* Similar for 64-bit user process, so that we don't need to satisfy
 * the alignment constraints of the original user64_aiocb.
 */
static void
do_munge_aiocb_user64_to_user( struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
{
	the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
	the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
	the_user_aiocbp->aio_buf = my_aiocbp->aio_buf;
	the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
	the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
	the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;

	the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
	the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
	the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
		my_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int;
	the_user_aiocbp->aio_sigevent.sigev_notify_function =
		my_aiocbp->aio_sigevent.sigev_notify_function;
	the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
		my_aiocbp->aio_sigevent.sigev_notify_attributes;
}