/*
 * Copyright (c) 2003-2008 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * 1) ramesh is looking into how to replace taking a reference on
 *    the user's map (vm_map_reference()) since it is believed that
 *    would not hold the process for us.
 * 2) david is looking into a way for us to set the priority of the
 *    worker threads to match that of the user's thread when the
 *    async IO was queued.
 */

/*
 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
 */
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file_internal.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/vnode_internal.h>
#include <sys/malloc.h>
#include <sys/mount_internal.h>
#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/sysctl.h>
#include <sys/unistd.h>

#include <sys/aio_kern.h>
#include <sys/sysproto.h>

#include <machine/limits.h>

#include <mach/mach_types.h>
#include <kern/kern_types.h>
#include <kern/zalloc.h>
#include <kern/task.h>
#include <kern/sched_prim.h>

#include <vm/vm_map.h>

#include <libkern/OSAtomic.h>

#include <sys/kdebug.h>
#define AIO_work_queued                 1
#define AIO_worker_wake                 2
#define AIO_completion_sig              3
#define AIO_completion_cleanup_wait     4
#define AIO_completion_cleanup_wake     5
#define AIO_completion_suspend_wake     6
#define AIO_fsync_delay                 7

#define AIO_cancel_async_workq          11
#define AIO_cancel_sync_workq           12
#define AIO_cancel_activeq              13
#define AIO_cancel_doneq                14

#define AIO_error_val                   61
#define AIO_error_activeq               62
#define AIO_error_workq                 63

#define AIO_return_val                  71
#define AIO_return_activeq              72
#define AIO_return_workq                73

#define AIO_exit_sleep                  91
#define AIO_close                       100
#define AIO_close_sleep                 101
#define AIO_suspend                     110
#define AIO_suspend_sleep               111
#define AIO_worker_thread               120

#define KERNEL_DEBUG                    KERNEL_DEBUG_CONSTANT
/*
 * aio requests queue up on the aio_async_workq or lio_sync_workq (for
 * lio_listio LIO_WAIT).  Requests then move to the per process aio_activeq
 * (proc.aio_activeq) when one of our worker threads starts the IO.
 * And finally, requests move to the per process aio_doneq (proc.aio_doneq)
 * when the IO request completes.  The request remains on aio_doneq until
 * the user process calls aio_return or the process exits; either way, that
 * is our trigger to release aio resources.
 */
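/*
 * Illustrative summary (a sketch, not compiled; it restates the flow described
 * above in terms of the enqueue/worker/return paths later in this file):
 *
 *     aio_read()/aio_write()/aio_fsync()/lio_listio()
 *         -> entry created and placed on an aio_async_workq and the
 *            per-process active list (proc.aio_activeq)
 *     aio_work_thread() / aio_get_some_work()
 *         -> entry pulled off the work queue and the IO is performed
 *     IO completes
 *         -> entry moved to proc.aio_doneq and notifications delivered
 *     aio_return() or process exit/exec/close
 *         -> entry freed and the counts dropped
 */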
typedef struct aio_workq {
    TAILQ_HEAD(, aio_workq_entry)   aioq_entries;
    int                             aioq_count;
    lck_mtx_t                       aioq_mtx;
    wait_queue_t                    aioq_waitq;
} *aio_workq_t;

#define AIO_NUM_WORK_QUEUES 1
struct aio_anchor_cb
{
    volatile int32_t    aio_inflight_count;     /* entries that have been taken from a workq */
    volatile int32_t    aio_done_count;         /* entries on all done queues (proc.aio_doneq) */
    volatile int32_t    aio_total_count;        /* total extant entries */

    /* Hash table of queues here */
    struct aio_workq    aio_async_workqs[AIO_NUM_WORK_QUEUES];
};
typedef struct aio_anchor_cb aio_anchor_cb;

struct aio_lio_context
{
    int     io_waiter;
    int     io_issued;
    int     io_completed;
};
typedef struct aio_lio_context aio_lio_context;
/*
 * Notes on aio sleep / wake channels.
 * We currently pick a couple of fields within the proc structure that will
 * give us sleep channels that do not collide with any other kernel routines.
 * At this time, for binary compatibility reasons, we cannot create new proc fields.
 */
#define AIO_SUSPEND_SLEEP_CHAN  p_aio_active_count
#define AIO_CLEANUP_SLEEP_CHAN  p_aio_total_count

#define ASSERT_AIO_FROM_PROC(aiop, theproc)     \
    if ((aiop)->procp != (theproc)) {           \
        panic("AIO on a proc list that does not belong to that proc.\n"); \
    }
/*
 * LOCAL PROTOTYPES
 */
static void             aio_proc_lock(proc_t procp);
static void             aio_proc_lock_spin(proc_t procp);
static void             aio_proc_unlock(proc_t procp);
static lck_mtx_t*       aio_proc_mutex(proc_t procp);
static void             aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp);
static void             aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp);
static int              aio_get_process_count(proc_t procp);
static int              aio_active_requests_for_process(proc_t procp);
static int              aio_proc_active_requests_for_file(proc_t procp, int fd);
static boolean_t        is_already_queued(proc_t procp, user_addr_t aiocbp);
static boolean_t        should_cancel(aio_workq_entry *entryp, user_addr_t aiocbp, int fd);

static void             aio_entry_lock(aio_workq_entry *entryp);
static void             aio_entry_lock_spin(aio_workq_entry *entryp);
static aio_workq_t      aio_entry_workq(aio_workq_entry *entryp);
static lck_mtx_t*       aio_entry_mutex(__unused aio_workq_entry *entryp);
static void             aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
static void             aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
static void             aio_entry_ref_locked(aio_workq_entry *entryp);
static void             aio_entry_unref_locked(aio_workq_entry *entryp);
static void             aio_entry_ref(aio_workq_entry *entryp);
static void             aio_entry_unref(aio_workq_entry *entryp);
static void             aio_entry_update_for_cancel(aio_workq_entry *entryp, boolean_t cancelled,
                                int wait_for_completion, boolean_t disable_notification);
static int              aio_entry_try_workq_remove(aio_workq_entry *entryp);
static boolean_t        aio_delay_fsync_request( aio_workq_entry *entryp );
static int              aio_free_request(aio_workq_entry *entryp);

static void             aio_workq_init(aio_workq_t wq);
static void             aio_workq_lock_spin(aio_workq_t wq);
static void             aio_workq_unlock(aio_workq_t wq);
static lck_mtx_t*       aio_workq_mutex(aio_workq_t wq);

static void             aio_work_thread( void );
static aio_workq_entry  *aio_get_some_work( void );

static int              aio_get_all_queues_count( void );
static int              aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO);
static int              aio_validate( aio_workq_entry *entryp );
static int              aio_increment_total_count(void);
static int              aio_decrement_total_count(void);

static int              do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp, int wait_for_completion, boolean_t disable_notification);
static void             do_aio_completion( aio_workq_entry *entryp );
static int              do_aio_fsync( aio_workq_entry *entryp );
static int              do_aio_read( aio_workq_entry *entryp );
static int              do_aio_write( aio_workq_entry *entryp );
static void             do_munge_aiocb_user32_to_user( struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
static void             do_munge_aiocb_user64_to_user( struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
static int              lio_create_entry(proc_t procp,
                                         user_addr_t aiocbp,
                                         void *group_tag,
                                         aio_workq_entry **entrypp );
static aio_workq_entry  *aio_create_queue_entry(proc_t procp,
                                         user_addr_t aiocbp,
                                         void *group_tag,
                                         int kindOfIO);
static user_addr_t      *aio_copy_in_list(proc_t procp, user_addr_t aiocblist, int nent);
static void             free_lio_context(aio_lio_context *context);
static void             aio_enqueue_work( proc_t procp, aio_workq_entry *entryp, int proc_locked);

#define ASSERT_AIO_PROC_LOCK_OWNED(p)   lck_mtx_assert(aio_proc_mutex((p)), LCK_MTX_ASSERT_OWNED)
#define ASSERT_AIO_WORKQ_LOCK_OWNED(q)  lck_mtx_assert(aio_workq_mutex((q)), LCK_MTX_ASSERT_OWNED)
#define ASSERT_AIO_ENTRY_LOCK_OWNED(e)  lck_mtx_assert(aio_entry_mutex((e)), LCK_MTX_ASSERT_OWNED)
/*
 * EXTERNAL PROTOTYPES
 */

/* in ...bsd/kern/sys_generic.c */
extern int dofileread(vfs_context_t ctx, struct fileproc *fp,
                      user_addr_t bufp, user_size_t nbyte,
                      off_t offset, int flags, user_ssize_t *retval);
extern int dofilewrite(vfs_context_t ctx, struct fileproc *fp,
                       user_addr_t bufp, user_size_t nbyte, off_t offset,
                       int flags, user_ssize_t *retval);
static uint32_t lio_contexts_alloced = 0;
/*
 * aio external global variables.
 */
extern int aio_max_requests;                /* AIO_MAX - configurable */
extern int aio_max_requests_per_process;    /* AIO_PROCESS_MAX - configurable */
extern int aio_worker_threads;              /* AIO_THREAD_COUNT - configurable */


/*
 * aio static variables.
 */
static aio_anchor_cb    aio_anchor;
static lck_grp_t        *aio_proc_lock_grp;
static lck_grp_t        *aio_entry_lock_grp;
static lck_grp_t        *aio_queue_lock_grp;
static lck_attr_t       *aio_lock_attr;
static lck_grp_attr_t   *aio_lock_grp_attr;
static struct zone      *aio_workq_zonep;
static lck_mtx_t        aio_entry_mtx;
static lck_mtx_t        aio_proc_mtx;
static void
aio_entry_lock(__unused aio_workq_entry *entryp)
{
    lck_mtx_lock(&aio_entry_mtx);
}

static void
aio_entry_lock_spin(__unused aio_workq_entry *entryp)
{
    lck_mtx_lock_spin(&aio_entry_mtx);
}

static void
aio_entry_unlock(__unused aio_workq_entry *entryp)
{
    lck_mtx_unlock(&aio_entry_mtx);
}

/* Hash */
static aio_workq_t
aio_entry_workq(__unused aio_workq_entry *entryp)
{
    return &aio_anchor.aio_async_workqs[0];
}

static lck_mtx_t*
aio_entry_mutex(__unused aio_workq_entry *entryp)
{
    return &aio_entry_mtx;
}
static void
aio_workq_init(aio_workq_t wq)
{
    TAILQ_INIT(&wq->aioq_entries);
    wq->aioq_count = 0;
    lck_mtx_init(&wq->aioq_mtx, aio_queue_lock_grp, aio_lock_attr);
    wq->aioq_waitq = wait_queue_alloc(SYNC_POLICY_FIFO);
}
/*
 * Can be passed a queue which is locked spin.
 */
static void
aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
{
    ASSERT_AIO_WORKQ_LOCK_OWNED(queue);

    if (entryp->aio_workq_link.tqe_prev == NULL) {
        panic("Trying to remove an entry from a work queue, but it is not on a queue\n");
    }

    TAILQ_REMOVE(&queue->aioq_entries, entryp, aio_workq_link);
    queue->aioq_count--;
    entryp->aio_workq_link.tqe_prev = NULL; /* Not on a workq */

    if (queue->aioq_count < 0) {
        panic("Negative count on a queue.\n");
    }
}
static void
aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
{
    ASSERT_AIO_WORKQ_LOCK_OWNED(queue);

    TAILQ_INSERT_TAIL(&queue->aioq_entries, entryp, aio_workq_link);
    queue->aioq_count++;
    if (queue->aioq_count < 0) {
        panic("Negative count on a queue.\n");
    }
}
static void
aio_proc_lock(proc_t procp)
{
    lck_mtx_lock(aio_proc_mutex(procp));
}

static void
aio_proc_lock_spin(proc_t procp)
{
    lck_mtx_lock_spin(aio_proc_mutex(procp));
}
static void
aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp)
{
    ASSERT_AIO_PROC_LOCK_OWNED(procp);

    TAILQ_REMOVE(&procp->p_aio_activeq, entryp, aio_proc_link);
    TAILQ_INSERT_TAIL(&procp->p_aio_doneq, entryp, aio_proc_link);
    procp->p_aio_active_count--;
    OSIncrementAtomic(&aio_anchor.aio_done_count);
}
static void
aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp)
{
    TAILQ_REMOVE(&procp->p_aio_doneq, entryp, aio_proc_link);
    OSDecrementAtomic(&aio_anchor.aio_done_count);
    aio_decrement_total_count();
    procp->p_aio_total_count--;
}
static void
aio_proc_unlock(proc_t procp)
{
    lck_mtx_unlock(aio_proc_mutex(procp));
}

static lck_mtx_t*
aio_proc_mutex(proc_t procp)
{
    return &procp->p_mlock;
}
static void
aio_entry_ref_locked(aio_workq_entry *entryp)
{
    ASSERT_AIO_ENTRY_LOCK_OWNED(entryp);

    if (entryp->aio_refcount < 0) {
        panic("AIO workq entry with a negative refcount.\n");
    }
    entryp->aio_refcount++;
}
/* Return 1 if you've freed it */
static void
aio_entry_unref_locked(aio_workq_entry *entryp)
{
    ASSERT_AIO_ENTRY_LOCK_OWNED(entryp);

    entryp->aio_refcount--;
    if (entryp->aio_refcount < 0) {
        panic("AIO workq entry with a negative refcount.\n");
    }
}
static void
aio_entry_ref(aio_workq_entry *entryp)
{
    aio_entry_lock_spin(entryp);
    aio_entry_ref_locked(entryp);
    aio_entry_unlock(entryp);
}
static void
aio_entry_unref(aio_workq_entry *entryp)
{
    aio_entry_lock_spin(entryp);
    aio_entry_unref_locked(entryp);

    if ((entryp->aio_refcount == 0) && ((entryp->flags & AIO_DO_FREE) != 0)) {
        aio_entry_unlock(entryp);
        aio_free_request(entryp);
    } else {
        aio_entry_unlock(entryp);
    }

    return;
}
static void
aio_entry_update_for_cancel(aio_workq_entry *entryp, boolean_t cancelled, int wait_for_completion, boolean_t disable_notification)
{
    aio_entry_lock_spin(entryp);

    if (cancelled) {
        aio_entry_ref_locked(entryp);
        entryp->errorval = ECANCELED;
        entryp->returnval = -1;
    }

    if (wait_for_completion) {
        entryp->flags |= wait_for_completion; /* flag for special completion processing */
    }

    if (disable_notification) {
        entryp->flags |= AIO_DISABLE; /* Don't want a signal */
    }

    aio_entry_unlock(entryp);
}
static int
aio_entry_try_workq_remove(aio_workq_entry *entryp)
{
    /* Can only be cancelled if it's still on a work queue */
    if (entryp->aio_workq_link.tqe_prev != NULL) {
        aio_workq_t queue;

        /* Will have to check again under the lock */
        queue = aio_entry_workq(entryp);
        aio_workq_lock_spin(queue);
        if (entryp->aio_workq_link.tqe_prev != NULL) {
            aio_workq_remove_entry_locked(queue, entryp);
            aio_workq_unlock(queue);
            return 1;
        } else {
            aio_workq_unlock(queue);
        }
    }

    return 0;
}
static void
aio_workq_lock_spin(aio_workq_t wq)
{
    lck_mtx_lock_spin(aio_workq_mutex(wq));
}

static void
aio_workq_unlock(aio_workq_t wq)
{
    lck_mtx_unlock(aio_workq_mutex(wq));
}

static lck_mtx_t*
aio_workq_mutex(aio_workq_t wq)
{
    return &wq->aioq_mtx;
}
/*
 * aio_cancel - attempt to cancel one or more async IO requests currently
 * outstanding against file descriptor uap->fd.  If uap->aiocbp is not
 * NULL then only one specific IO is cancelled (if possible).  If uap->aiocbp
 * is NULL then all outstanding async IO requests for the given file
 * descriptor are cancelled (if possible).
 */
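/*
 * Illustrative user-space usage (a sketch, not part of this file; assumes
 * <aio.h>, an already-submitted request in "cb", and elided error handling):
 *
 *     int r = aio_cancel(cb.aio_fildes, &cb);
 *     if (r == AIO_CANCELED)            // pulled off the work queue in time
 *         (void) aio_return(&cb);       // still needed to release resources
 *     else if (r == AIO_NOTCANCELED)    // already in flight; wait for it
 *         while (aio_error(&cb) == EINPROGRESS)
 *             ;
 *     else if (r == AIO_ALLDONE)        // already completed
 *         (void) aio_return(&cb);
 */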
int
aio_cancel(proc_t p, struct aio_cancel_args *uap, int *retval)
{
    struct user_aiocb   my_aiocb;
    int                 result;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START,
                  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    /* quick check to see if there are any async IO requests queued up */
    if (aio_get_all_queues_count() < 1) {
        result = 0;
        *retval = AIO_ALLDONE;
        goto ExitRoutine;
    }

    *retval = -1;
    if ( uap->aiocbp != USER_ADDR_NULL ) {
        if ( proc_is64bit(p) ) {
            struct user64_aiocb aiocb64;

            result = copyin( uap->aiocbp, &aiocb64, sizeof(aiocb64) );
            if ( result == 0 )
                do_munge_aiocb_user64_to_user(&aiocb64, &my_aiocb);
        } else {
            struct user32_aiocb aiocb32;

            result = copyin( uap->aiocbp, &aiocb32, sizeof(aiocb32) );
            if ( result == 0 )
                do_munge_aiocb_user32_to_user( &aiocb32, &my_aiocb );
        }

        if ( result != 0 ) {
            result = EAGAIN;
            goto ExitRoutine;
        }

        /* NOTE - POSIX standard says a mismatch between the file */
        /* descriptor passed in and the file descriptor embedded in */
        /* the aiocb causes unspecified results.  We return EBADF in */
        /* that situation. */
        if ( uap->fd != my_aiocb.aio_fildes ) {
            result = EBADF;
            goto ExitRoutine;
        }
    }

    aio_proc_lock(p);
    result = do_aio_cancel_locked( p, uap->fd, uap->aiocbp, 0, FALSE );
    ASSERT_AIO_PROC_LOCK_OWNED(p);
    aio_proc_unlock(p);

    if ( result != -1 ) {
        *retval = result;
        result = 0;
        goto ExitRoutine;
    }

    result = EBADF;

ExitRoutine:
    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END,
                  (int)p, (int)uap->aiocbp, result, 0, 0 );

    return( result );

} /* aio_cancel */
/*
 * _aio_close - internal function used to clean up async IO requests for
 * a file descriptor that is closing.
 */
__private_extern__ void
_aio_close(proc_t p, int fd)
{
    int     error;

    /* quick check to see if there are any async IO requests queued up */
    if (aio_get_all_queues_count() < 1) {
        return;
    }

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_START,
                  (int)p, fd, 0, 0, 0 );

    /* cancel all async IO requests on our todo queues for this file descriptor */
    aio_proc_lock(p);
    error = do_aio_cancel_locked( p, fd, 0, AIO_CLOSE_WAIT, FALSE );
    ASSERT_AIO_PROC_LOCK_OWNED(p);
    if ( error == AIO_NOTCANCELED ) {
        /*
         * AIO_NOTCANCELED is returned when we find an aio request for this process
         * and file descriptor on the active async IO queue.  Active requests cannot
         * be cancelled so we must wait for them to complete.  We will get a special
         * wake up call on our channel used to sleep for ALL active requests to
         * complete.  This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
         * when we must wait for all active aio requests.
         */

        KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) | DBG_FUNC_NONE,
                      (int)p, fd, 0, 0, 0 );

        while (aio_proc_active_requests_for_file(p, fd) > 0) {
            msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO | PDROP, "aio_close", 0 );
        }
    } else {
        aio_proc_unlock(p);
    }

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_END,
                  (int)p, fd, 0, 0, 0 );

    return;

} /* _aio_close */
/*
 * aio_error - return the error status associated with the async IO
 * request referred to by uap->aiocbp.  The error status is the errno
 * value that would be set by the corresponding IO request (read, write,
 * fdatasync, or sync).
 */
int
aio_error(proc_t p, struct aio_error_args *uap, int *retval)
{
    aio_workq_entry     *entryp;
    int                 error;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_START,
                  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    /* see if there are any aios to check */
    if (aio_get_all_queues_count() < 1) {
        return( EINVAL );
    }

    aio_proc_lock(p);

    /* look for a match on our queue of async IO requests that have completed */
    TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link ) {
        if ( entryp->uaiocbp == uap->aiocbp ) {
            ASSERT_AIO_FROM_PROC(entryp, p);

            aio_entry_lock_spin(entryp);
            *retval = entryp->errorval;
            error = 0;
            aio_entry_unlock(entryp);
            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) | DBG_FUNC_NONE,
                          (int)p, (int)uap->aiocbp, *retval, 0, 0 );
            goto ExitRoutine;
        }
    }

    /* look for a match on our queue of active async IO requests */
    TAILQ_FOREACH( entryp, &p->p_aio_activeq, aio_proc_link ) {
        if ( entryp->uaiocbp == uap->aiocbp ) {
            ASSERT_AIO_FROM_PROC(entryp, p);
            *retval = EINPROGRESS;
            error = 0;
            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) | DBG_FUNC_NONE,
                          (int)p, (int)uap->aiocbp, *retval, 0, 0 );
            goto ExitRoutine;
        }
    }

    error = EINVAL;

ExitRoutine:
    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_END,
                  (int)p, (int)uap->aiocbp, error, 0, 0 );
    aio_proc_unlock(p);

    return( error );

} /* aio_error */
/*
 * aio_fsync - asynchronously force all IO operations associated
 * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
 * queued at the time of the call to the synchronized completion state.
 * NOTE - we do not support op O_DSYNC at this point since we do not support the
 * fdatasync() call.
 */
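/*
 * Illustrative user-space usage (a sketch, not part of this file; assumes
 * <aio.h>, an open descriptor "fd", and elided error handling):
 *
 *     struct aiocb cb;
 *     memset(&cb, 0, sizeof(cb));
 *     cb.aio_fildes = fd;
 *     if (aio_fsync(O_SYNC, &cb) == 0) {   // 0 or O_SYNC requests a full fsync here
 *         while (aio_error(&cb) == EINPROGRESS)
 *             ;                            // or wait with aio_suspend()
 *         (void) aio_return(&cb);
 *     }
 */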
int
aio_fsync(proc_t p, struct aio_fsync_args *uap, int *retval)
{
    int     error;
    int     fsync_kind;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_START,
                  (int)p, (int)uap->aiocbp, uap->op, 0, 0 );

    *retval = 0;
    /* 0 := O_SYNC for binary backward compatibility with Panther */
    if (uap->op == O_SYNC || uap->op == 0)
        fsync_kind = AIO_FSYNC;
    else if ( uap->op == O_DSYNC )
        fsync_kind = AIO_DSYNC;
    else {
        *retval = -1;
        error = EINVAL;
        goto ExitRoutine;
    }

    error = aio_queue_async_request( p, uap->aiocbp, fsync_kind );
    if ( error != 0 )
        *retval = -1;

ExitRoutine:
    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_END,
                  (int)p, (int)uap->aiocbp, error, 0, 0 );

    return( error );

} /* aio_fsync */
/* aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
 * file descriptor (uap->aiocbp->aio_fildes) into the buffer
 * (uap->aiocbp->aio_buf).
 */
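/*
 * Illustrative user-space usage (a sketch, not part of this file; assumes
 * <aio.h>, an open descriptor "fd", a buffer "buf", and elided error handling):
 *
 *     struct aiocb cb;
 *     memset(&cb, 0, sizeof(cb));
 *     cb.aio_fildes = fd;
 *     cb.aio_buf    = buf;
 *     cb.aio_nbytes = sizeof(buf);
 *     cb.aio_offset = 0;
 *     if (aio_read(&cb) == 0) {
 *         while (aio_error(&cb) == EINPROGRESS)
 *             ;                                  // polling; aio_suspend() avoids spinning
 *         ssize_t nread = aio_return(&cb);       // also releases kernel resources
 *     }
 */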
int
aio_read(proc_t p, struct aio_read_args *uap, int *retval)
{
    int     error;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_START,
                  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    *retval = 0;

    error = aio_queue_async_request( p, uap->aiocbp, AIO_READ );
    if ( error != 0 )
        *retval = -1;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_END,
                  (int)p, (int)uap->aiocbp, error, 0, 0 );

    return( error );

} /* aio_read */
/*
 * aio_return - return the return status associated with the async IO
 * request referred to by uap->aiocbp.  The return status is the value
 * that would be returned by the corresponding IO request (read, write,
 * fdatasync, or sync).  This is where we release kernel resources
 * held for the async IO call associated with the given aiocb pointer.
 */
int
aio_return(proc_t p, struct aio_return_args *uap, user_ssize_t *retval)
{
    aio_workq_entry     *entryp;
    int                 error;
    boolean_t           proc_lock_held = FALSE;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_START,
                  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    /* See if there are any entries to check */
    if (aio_get_all_queues_count() < 1) {
        error = EINVAL;
        goto ExitRoutine;
    }

    aio_proc_lock(p);
    proc_lock_held = TRUE;
    *retval = 0;

    /* look for a match on our queue of async IO requests that have completed */
    TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link ) {
        ASSERT_AIO_FROM_PROC(entryp, p);
        if ( entryp->uaiocbp == uap->aiocbp ) {
            /* Done and valid for aio_return(), pull it off the list */
            aio_proc_remove_done_locked(p, entryp);

            /* Drop the proc lock, but keep the entry locked */
            aio_entry_lock(entryp);
            aio_proc_unlock(p);
            proc_lock_held = FALSE;

            *retval = entryp->returnval;
            error = 0;

            /* No references and off all lists, safe to free */
            if (entryp->aio_refcount == 0) {
                aio_entry_unlock(entryp);
                aio_free_request(entryp);
            } else {
                /* Whoever has the refcount will have to free it */
                entryp->flags |= AIO_DO_FREE;
                aio_entry_unlock(entryp);
            }

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) | DBG_FUNC_NONE,
                          (int)p, (int)uap->aiocbp, *retval, 0, 0 );
            goto ExitRoutine;
        }
    }

    /* look for a match on our queue of active async IO requests */
    TAILQ_FOREACH( entryp, &p->p_aio_activeq, aio_proc_link ) {
        ASSERT_AIO_FROM_PROC(entryp, p);
        if ( entryp->uaiocbp == uap->aiocbp ) {
            error = EINPROGRESS;
            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) | DBG_FUNC_NONE,
                          (int)p, (int)uap->aiocbp, *retval, 0, 0 );
            goto ExitRoutine;
        }
    }

    error = EINVAL;

ExitRoutine:
    if (proc_lock_held)
        aio_proc_unlock(p);
    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_END,
                  (int)p, (int)uap->aiocbp, error, 0, 0 );

    return( error );

} /* aio_return */
/*
 * _aio_exec - internal function used to clean up async IO requests for
 * a process that is going away due to exec().  We cancel any async IOs
 * we can and wait for those already active.  We also disable signaling
 * for cancelled or active aio requests that complete.
 * This routine MAY block!
 */
__private_extern__ void
_aio_exec(proc_t p)
{
    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_START,
                  (int)p, 0, 0, 0, 0 );

    _aio_exit( p );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_END,
                  (int)p, 0, 0, 0, 0 );

    return;

} /* _aio_exec */
/*
 * _aio_exit - internal function used to clean up async IO requests for
 * a process that is terminating (via exit() or exec() ).  We cancel any async IOs
 * we can and wait for those already active.  We also disable signaling
 * for cancelled or active aio requests that complete.  This routine MAY block!
 */
__private_extern__ void
_aio_exit(proc_t p)
{
    int                 error;
    aio_workq_entry     *entryp;

    /* quick check to see if there are any async IO requests queued up */
    if (aio_get_all_queues_count() < 1) {
        return;
    }

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_START,
                  (int)p, 0, 0, 0, 0 );

    aio_proc_lock(p);

    /*
     * cancel async IO requests on the todo work queue and wait for those
     * already active to complete.
     */
    error = do_aio_cancel_locked( p, 0, 0, AIO_EXIT_WAIT, TRUE );
    ASSERT_AIO_PROC_LOCK_OWNED(p);
    if ( error == AIO_NOTCANCELED ) {
        /*
         * AIO_NOTCANCELED is returned when we find an aio request for this process
         * on the active async IO queue.  Active requests cannot be cancelled so we
         * must wait for them to complete.  We will get a special wake up call on
         * our channel used to sleep for ALL active requests to complete.  This sleep
         * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
         * active aio requests.
         */

        KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) | DBG_FUNC_NONE,
                      (int)p, 0, 0, 0, 0 );

        while (p->p_aio_active_count != 0) {
            msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_exit", 0 );
        }
    }

    if (p->p_aio_active_count != 0) {
        panic("Exiting process has %d active AIOs after cancellation has completed.\n", p->p_aio_active_count);
    }

    /* release all aio resources used by this process */
    entryp = TAILQ_FIRST( &p->p_aio_doneq );
    while ( entryp != NULL ) {
        ASSERT_AIO_FROM_PROC(entryp, p);
        aio_workq_entry     *next_entryp;

        next_entryp = TAILQ_NEXT( entryp, aio_proc_link );
        aio_proc_remove_done_locked(p, entryp);

        /* we cannot free requests that are still completing */
        aio_entry_lock_spin(entryp);
        if (entryp->aio_refcount == 0) {
            aio_proc_unlock(p);
            aio_entry_unlock(entryp);
            aio_free_request(entryp);

            /* need to start over since aio_doneq may have been */
            /* changed while we were away.  */
            aio_proc_lock(p);
            entryp = TAILQ_FIRST( &p->p_aio_doneq );
            continue;
        } else {
            /* whoever has the reference will have to do the free */
            entryp->flags |= AIO_DO_FREE;
        }

        aio_entry_unlock(entryp);
        entryp = next_entryp;
    }

    aio_proc_unlock(p);

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END,
                  (int)p, 0, 0, 0, 0 );
    return;

} /* _aio_exit */
static boolean_t
should_cancel(aio_workq_entry *entryp, user_addr_t aiocbp, int fd)
{
    if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
         (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
         (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
        return TRUE;
    }

    return FALSE;
}
/*
 * do_aio_cancel_locked - cancel async IO requests (if possible).  We get called by
 * aio_cancel, close, and at exit.
 * There are three modes of operation: 1) cancel all async IOs for a process -
 * fd is 0 and aiocbp is NULL 2) cancel all async IOs for a file descriptor - fd
 * is > 0 and aiocbp is NULL 3) cancel one async IO associated with the given
 * aiocbp.
 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
 * target async IO requests, and AIO_ALLDONE if all target async IO requests
 * were already complete.
 * WARNING - do not dereference aiocbp in this routine, it may point to user
 * land data that has not been copied in (when called from aio_cancel() )
 *
 * Called with proc locked, and returns the same way.
 */
static int
do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp,
    int wait_for_completion, boolean_t disable_notification)
{
    ASSERT_AIO_PROC_LOCK_OWNED(p);

    aio_workq_entry     *entryp;
    int                 result;

    result = -1;

    /* look for a match on our queue of async todo work. */
    entryp = TAILQ_FIRST(&p->p_aio_activeq);
    while ( entryp != NULL ) {
        ASSERT_AIO_FROM_PROC(entryp, p);
        aio_workq_entry     *next_entryp;

        next_entryp = TAILQ_NEXT( entryp, aio_proc_link );
        if (!should_cancel(entryp, aiocbp, fd)) {
            entryp = next_entryp;
            continue;
        }

        /* Can only be cancelled if it's still on a work queue */
        if (aio_entry_try_workq_remove(entryp) != 0) {
            /* Have removed from workq. Update entry state and take a ref */
            aio_entry_update_for_cancel(entryp, TRUE, 0, disable_notification);

            /* Put on the proc done queue and update counts, then unlock the proc */
            aio_proc_move_done_locked(p, entryp);
            aio_proc_unlock(p);

            /* Now it's officially cancelled.  Do the completion */
            result = AIO_CANCELED;
            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) | DBG_FUNC_NONE,
                          (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
            do_aio_completion(entryp);

            /* This will free if the aio_return() has already happened ... */
            aio_entry_unref(entryp);
            aio_proc_lock(p);

            if ( aiocbp != USER_ADDR_NULL ) {
                return( result );
            }

            /*
             * Restart from the head of the proc active queue since it
             * may have been changed while we were away doing completion
             * processing.
             *
             * Note that if we found an uncancellable AIO before, we will
             * either find it again or discover that it's been completed,
             * so resetting the result will not cause us to return success
             * despite outstanding AIOs.
             */
            entryp = TAILQ_FIRST(&p->p_aio_activeq);
            result = -1; /* As if beginning anew */
        } else {
            /*
             * It's been taken off the active queue already, i.e. is in flight.
             * All we can do is ask for notification.
             */
            result = AIO_NOTCANCELED;

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE,
                          (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

            /* Mark for waiting and such; will not take a ref if "cancelled" arg is FALSE */
            aio_entry_update_for_cancel(entryp, FALSE, wait_for_completion, disable_notification);

            if ( aiocbp != USER_ADDR_NULL ) {
                return( result );
            }
            entryp = next_entryp;
        }
    } /* while... */

    /*
     * if we didn't find any matches on the todo or active queues then look for a
     * match on our queue of async IO requests that have completed and if found
     * return AIO_ALLDONE result.
     *
     * Proc AIO lock is still held.
     */
    if ( result == -1 ) {
        TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
            ASSERT_AIO_FROM_PROC(entryp, p);
            if (should_cancel(entryp, aiocbp, fd)) {
                result = AIO_ALLDONE;
                KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE,
                              (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

                if ( aiocbp != USER_ADDR_NULL ) {
                    return( result );
                }
            }
        }
    }

    return( result );

} /* do_aio_cancel_locked */
/*
 * aio_suspend - suspend the calling thread until at least one of the async
 * IO operations referenced by uap->aiocblist has completed, until a signal
 * interrupts the function, or uap->timeoutp time interval (optional) has
 * passed.
 * Returns 0 if one or more async IOs have completed else -1 and errno is
 * set appropriately - EAGAIN if the timeout elapses or EINTR if an interrupt
 * occurs.
 */
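/*
 * Illustrative user-space usage (a sketch, not part of this file; assumes
 * <aio.h>, <errno.h>, and an already-submitted request in "cb"):
 *
 *     const struct aiocb *list[1] = { &cb };
 *     struct timespec ts = { 5, 0 };             // wait at most 5 seconds
 *     if (aio_suspend(list, 1, &ts) == 0)
 *         (void) aio_return(&cb);                // at least one request completed
 *     else if (errno == EAGAIN)
 *         ;                                      // timed out
 *     else if (errno == EINTR)
 *         ;                                      // interrupted by a signal
 */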
int
aio_suspend(proc_t p, struct aio_suspend_args *uap, int *retval)
{
    __pthread_testcancel(1);
    return(aio_suspend_nocancel(p, (struct aio_suspend_nocancel_args *)uap, retval));
}
int
aio_suspend_nocancel(proc_t p, struct aio_suspend_nocancel_args *uap, int *retval)
{
    int                     error;
    int                     i, count;
    uint64_t                abstime;
    struct user_timespec    ts;
    aio_workq_entry         *entryp;
    user_addr_t             *aiocbpp;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START,
                  (int)p, uap->nent, 0, 0, 0 );

    *retval = -1;
    abstime = 0;
    aiocbpp = NULL;

    count = aio_get_all_queues_count( );
    if ( count < 1 ) {
        error = EINVAL;
        goto ExitThisRoutine;
    }

    if ( uap->nent < 1 || uap->nent > aio_max_requests_per_process ) {
        error = EINVAL;
        goto ExitThisRoutine;
    }

    if ( uap->timeoutp != USER_ADDR_NULL ) {
        if ( proc_is64bit(p) ) {
            struct user64_timespec temp;
            error = copyin( uap->timeoutp, &temp, sizeof(temp) );
            if ( error == 0 ) {
                ts.tv_sec = temp.tv_sec;
                ts.tv_nsec = temp.tv_nsec;
            }
        } else {
            struct user32_timespec temp;
            error = copyin( uap->timeoutp, &temp, sizeof(temp) );
            if ( error == 0 ) {
                ts.tv_sec = temp.tv_sec;
                ts.tv_nsec = temp.tv_nsec;
            }
        }
        if ( error != 0 ) {
            error = EAGAIN;
            goto ExitThisRoutine;
        }

        if ( ts.tv_sec < 0 || ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000 ) {
            error = EINVAL;
            goto ExitThisRoutine;
        }

        nanoseconds_to_absolutetime( (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
                                     &abstime );
        clock_absolutetime_interval_to_deadline( abstime, &abstime );
    }

    aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent);
    if ( aiocbpp == NULL ) {
        error = EAGAIN;
        goto ExitThisRoutine;
    }

    /* check list of aio requests to see if any have completed */
check_for_our_aiocbp:
    aio_proc_lock_spin(p);
    for ( i = 0; i < uap->nent; i++ ) {
        user_addr_t     aiocbp;

        /* NULL elements are legal so check for 'em */
        aiocbp = *(aiocbpp + i);
        if ( aiocbp == USER_ADDR_NULL )
            continue;

        /* return immediately if any aio request in the list is done */
        TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link ) {
            ASSERT_AIO_FROM_PROC(entryp, p);
            if ( entryp->uaiocbp == aiocbp ) {
                aio_proc_unlock(p);
                *retval = 0;
                error = 0;
                goto ExitThisRoutine;
            }
        }
    } /* for ( ; i < uap->nent; ) */

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE,
                  (int)p, uap->nent, 0, 0, 0 );

    /*
     * wait for an async IO to complete or a signal fires or timeout expires.
     * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
     * interrupts us.  If an async IO completes before a signal fires or our
     * timeout expires, we get a wakeup call from aio_work_thread().
     */

    error = msleep1(&p->AIO_SUSPEND_SLEEP_CHAN, aio_proc_mutex(p), PCATCH | PWAIT | PDROP, "aio_suspend", abstime); /* XXX better priority? */
    if ( error == 0 ) {
        /*
         * got our wakeup call from aio_work_thread().
         * Since we can get a wakeup on this channel from another thread in the
         * same process we head back up to make sure this is for the correct aiocbp.
         * If it is the correct aiocbp we will return from where we do the check
         * (see entryp->uaiocbp == aiocbp after check_for_our_aiocbp label)
         * else we will fall out and just sleep again.
         */
        goto check_for_our_aiocbp;
    }
    else if ( error == EWOULDBLOCK ) {
        /* our timeout expired */
        error = EAGAIN;
    }
    else {
        /* we were interrupted */
        error = EINTR;
    }

ExitThisRoutine:
    if ( aiocbpp != NULL )
        FREE( aiocbpp, M_TEMP );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_END,
                  (int)p, uap->nent, error, 0, 0 );

    return( error );

} /* aio_suspend */
/* aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
 * file descriptor (uap->aiocbp->aio_fildes) from the buffer
 * (uap->aiocbp->aio_buf).
 */
int
aio_write(proc_t p, struct aio_write_args *uap, int *retval)
{
    int     error;

    *retval = 0;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_START,
                  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    error = aio_queue_async_request( p, uap->aiocbp, AIO_WRITE );
    if ( error != 0 )
        *retval = -1;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_END,
                  (int)p, (int)uap->aiocbp, error, 0, 0 );

    return( error );

} /* aio_write */
static user_addr_t *
aio_copy_in_list(proc_t procp, user_addr_t aiocblist, int nent)
{
    user_addr_t     *aiocbpp;
    int             i, result;

    /* we reserve enough space for largest possible pointer size */
    MALLOC( aiocbpp, user_addr_t *, (nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK );
    if ( aiocbpp == NULL )
        goto err;

    /* copyin our aiocb pointers from list */
    result = copyin( aiocblist, aiocbpp,
                     proc_is64bit(procp) ? (nent * sizeof(user64_addr_t))
                                         : (nent * sizeof(user32_addr_t)) );
    if ( result ) {
        FREE( aiocbpp, M_TEMP );
        aiocbpp = NULL;
        goto err;
    }

    /*
     * We depend on a list of user_addr_t's so we need to
     * munge and expand when these pointers came from a
     * 32-bit process.
     */
    if ( !proc_is64bit(procp) ) {
        /* copy from last to first to deal with overlap */
        user32_addr_t   *my_ptrp = ((user32_addr_t *)aiocbpp) + (nent - 1);
        user_addr_t     *my_addrp = aiocbpp + (nent - 1);

        for (i = 0; i < nent; i++, my_ptrp--, my_addrp--) {
            *my_addrp = (user_addr_t) (*my_ptrp);
        }
    }

err:
    return (aiocbpp);
}
static int
aio_copy_in_sigev(proc_t procp, user_addr_t sigp, struct user_sigevent *sigev)
{
    int     result = 0;

    if (sigp == USER_ADDR_NULL)
        goto out;

    /*
     * We need to munge aio_sigevent since it contains pointers.
     * Since we do not know if sigev_value is an int or a ptr we do
     * NOT cast the ptr to a user_addr_t.  This means if we send
     * this info back to user space we need to remember sigev_value
     * was not expanded for the 32-bit case.
     *
     * Notes:  This does NOT affect us since we don't support
     *         sigev_value yet in the aio context.
     */
    if ( proc_is64bit(procp) ) {
        struct user64_sigevent sigevent64;

        result = copyin( sigp, &sigevent64, sizeof(sigevent64) );
        if ( result == 0 ) {
            sigev->sigev_notify = sigevent64.sigev_notify;
            sigev->sigev_signo = sigevent64.sigev_signo;
            sigev->sigev_value.size_equivalent.sival_int = sigevent64.sigev_value.size_equivalent.sival_int;
            sigev->sigev_notify_function = sigevent64.sigev_notify_function;
            sigev->sigev_notify_attributes = sigevent64.sigev_notify_attributes;
        }
    } else {
        struct user32_sigevent sigevent32;

        result = copyin( sigp, &sigevent32, sizeof(sigevent32) );
        if ( result == 0 ) {
            sigev->sigev_notify = sigevent32.sigev_notify;
            sigev->sigev_signo = sigevent32.sigev_signo;
            sigev->sigev_value.size_equivalent.sival_int = sigevent32.sigev_value.sival_int;
            sigev->sigev_notify_function = CAST_USER_ADDR_T(sigevent32.sigev_notify_function);
            sigev->sigev_notify_attributes = CAST_USER_ADDR_T(sigevent32.sigev_notify_attributes);
        }
    }

    if ( result != 0 ) {
        result = EAGAIN;
    }

out:
    return (result);
}
/*
 * Queue up the entry on the aio asynchronous work queue in priority order
 * based on the relative priority of the request.  We calculate the relative
 * priority using the nice value of the caller and the value of the request
 * (aio_reqprio).
 *
 * Parameters:  procp           Process queueing the I/O
 *              entryp          The work queue entry being queued
 *
 * Returns:     (void)          No failure modes
 *
 * Notes:       This function is used for both lio_listio and aio
 *
 *      XXX:    At some point, we may have to consider thread priority
 *              rather than process priority, but we don't maintain the
 *              adjusted priority for threads the POSIX way.
 *
 * Called with proc locked.
 */
static void
aio_enqueue_work( proc_t procp, aio_workq_entry *entryp, int proc_locked)
{
#if 0
    aio_workq_entry     *my_entryp; /* used for insertion sort */
#endif /* 0 */
    aio_workq_t queue = aio_entry_workq(entryp);

    if (proc_locked == 0) {
        aio_proc_lock(procp);
    }

    ASSERT_AIO_PROC_LOCK_OWNED(procp);

    /* Onto proc queue */
    TAILQ_INSERT_TAIL(&procp->p_aio_activeq, entryp, aio_proc_link);
    procp->p_aio_active_count++;
    procp->p_aio_total_count++;

    /* And work queue */
    aio_workq_lock_spin(queue);
    aio_workq_add_entry_locked(queue, entryp);
    wait_queue_wakeup_one(queue->aioq_waitq, queue, THREAD_AWAKENED);
    aio_workq_unlock(queue);

    if (proc_locked == 0) {
        aio_proc_unlock(procp);
    }

#if 0
    /*
     * (1)  The nice value is in the range PRIO_MIN..PRIO_MAX [-20..20]
     * (2)  The normalized nice value is in the range 0..((2 * NZERO) - 1)
     *      which is [0..39], with 0 not being used.  In nice values, the
     *      lower the nice value, the higher the priority.
     * (3)  The normalized scheduling priority is the highest nice value
     *      minus the current nice value.  In I/O scheduling priority, the
     *      higher the value the lower the priority, so it is the inverse
     *      of the nice value (the higher the number, the higher the I/O
     *      priority).
     * (4)  From the normalized scheduling priority, we subtract the
     *      request priority to get the request priority value number;
     *      this means that requests are only capable of depressing their
     *      priority relative to other requests.
     */
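    /*
     * Worked example (illustrative; it just applies the formula below with
     * NZERO == 20): a caller at the default nice value (p_nice == 0) starts
     * at ((2 * 20) - 1) - 0 == 39; an aiocb asking for aio_reqprio == 5 is
     * then depressed to 39 - 5 == 34.  A request can only lower, never
     * raise, its effective priority relative to other requests.
     */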
    entryp->priority = (((2 * NZERO) - 1) - procp->p_nice);

    /* only permit depressing the priority */
    if (entryp->aiocb.aio_reqprio < 0)
        entryp->aiocb.aio_reqprio = 0;
    if (entryp->aiocb.aio_reqprio > 0) {
        entryp->priority -= entryp->aiocb.aio_reqprio;
        if (entryp->priority < 0)
            entryp->priority = 0;
    }

    /* Insertion sort the entry; lowest ->priority to highest */
    TAILQ_FOREACH(my_entryp, &aio_anchor.aio_async_workq, aio_workq_link) {
        if ( entryp->priority <= my_entryp->priority) {
            TAILQ_INSERT_BEFORE(my_entryp, entryp, aio_workq_link);
            break;
        }
    }
    if (my_entryp == NULL)
        TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
#endif /* 0 */
} /* aio_enqueue_work */
/*
 * lio_listio - initiate a list of IO requests.  We process the list of
 * aiocbs either synchronously (mode == LIO_WAIT) or asynchronously
 * (mode == LIO_NOWAIT).
 *
 * The caller gets error and return status for each aiocb in the list
 * via aio_error and aio_return.  We must keep completed requests until
 * released by the aio_return call.
 */
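/*
 * Illustrative user-space usage (a sketch, not part of this file; assumes
 * <aio.h>, two prepared aiocbs with file descriptor, buffer, and offset set,
 * and elided error handling):
 *
 *     cb_read.aio_lio_opcode  = LIO_READ;
 *     cb_write.aio_lio_opcode = LIO_WRITE;
 *     struct aiocb *list[2] = { &cb_read, &cb_write };
 *     if (lio_listio(LIO_WAIT, list, 2, NULL) == 0) {
 *         // both requests have completed; harvest the results
 *         (void) aio_return(&cb_read);
 *         (void) aio_return(&cb_write);
 *     }
 */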
int
lio_listio(proc_t p, struct lio_listio_args *uap, int *retval)
{
    int                     i;
    int                     call_result;
    int                     result;
    int                     old_count;
    aio_workq_entry         **entryp_listp;
    user_addr_t             *aiocbpp;
    struct user_sigevent    aiosigev;
    aio_lio_context         *lio_context;
    boolean_t               free_context = FALSE;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START,
                  (int)p, uap->nent, uap->mode, 0, 0 );

    entryp_listp = NULL;
    lio_context = NULL;
    aiocbpp = NULL;
    call_result = -1;
    *retval = -1;
    if ( !(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT) ) {
        call_result = EINVAL;
        goto ExitRoutine;
    }

    if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
        call_result = EINVAL;
        goto ExitRoutine;
    }

    /*
     * allocate a list of aio_workq_entry pointers that we will use
     * to queue up all our requests at once while holding our lock.
     */
    MALLOC( entryp_listp, void *, (uap->nent * sizeof(aio_workq_entry *)), M_TEMP, M_WAITOK );
    if ( entryp_listp == NULL ) {
        call_result = EAGAIN;
        goto ExitRoutine;
    }

    MALLOC( lio_context, aio_lio_context*, sizeof(aio_lio_context), M_TEMP, M_WAITOK );
    if ( lio_context == NULL ) {
        call_result = EAGAIN;
        goto ExitRoutine;
    }

    OSIncrementAtomic(&lio_contexts_alloced);

    bzero(lio_context, sizeof(aio_lio_context));

    aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent);
    if ( aiocbpp == NULL ) {
        call_result = EAGAIN;
        goto ExitRoutine;
    }

    /*
     * Use sigevent passed in to lio_listio for each of our calls, but
     * only do completion notification after the last request completes.
     */
    bzero(&aiosigev, sizeof(aiosigev));
    /* Only copy in a sigev if the user supplied one */
    if (uap->sigp != USER_ADDR_NULL) {
        call_result = aio_copy_in_sigev(p, uap->sigp, &aiosigev);
        if ( call_result )
            goto ExitRoutine;
    }

    /* process list of aio requests */
    lio_context->io_issued = uap->nent;
    lio_context->io_waiter = uap->mode == LIO_WAIT ? 1 : 0; /* Should it be freed by last AIO */
    for ( i = 0; i < uap->nent; i++ ) {
        user_addr_t my_aiocbp;
        aio_workq_entry     *entryp;

        *(entryp_listp + i) = NULL;
        my_aiocbp = *(aiocbpp + i);

        /* NULL elements are legal so check for 'em */
        if ( my_aiocbp == USER_ADDR_NULL ) {
            aio_proc_lock_spin(p);
            lio_context->io_issued--;
            aio_proc_unlock(p);
            continue;
        }

        /*
         * We use lio_context to mark IO requests for delayed completion
         * processing which means we wait until all IO requests in the
         * group have completed before we either return to the caller
         * when mode is LIO_WAIT or signal user when mode is LIO_NOWAIT.
         *
         * We use the address of the lio_context for this, since it is
         * unique in the address space.
         */
        result = lio_create_entry( p, my_aiocbp, lio_context, (entryp_listp + i) );
        if ( result != 0 && call_result == -1 )
            call_result = result;

        /* NULL elements are legal so check for 'em */
        entryp = *(entryp_listp + i);
        if ( entryp == NULL ) {
            aio_proc_lock_spin(p);
            lio_context->io_issued--;
            aio_proc_unlock(p);
            continue;
        }

        if ( uap->mode == LIO_NOWAIT ) {
            /* Set signal handler, if any */
            entryp->aiocb.aio_sigevent = aiosigev;
        } else {
            /* flag that this thread blocks pending completion */
            entryp->flags |= AIO_LIO_NOTIFY;
        }

        /* check our aio limits to throttle bad or rude user land behavior */
        old_count = aio_increment_total_count();

        aio_proc_lock_spin(p);
        if ( old_count >= aio_max_requests ||
             aio_get_process_count( entryp->procp ) >= aio_max_requests_per_process ||
             is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {

            lio_context->io_issued--;
            aio_proc_unlock(p);

            aio_decrement_total_count();

            if ( call_result == -1 )
                call_result = EAGAIN;
            aio_free_request(entryp);
            entryp_listp[i] = NULL;
            continue;
        }

        lck_mtx_convert_spin(aio_proc_mutex(p));
        aio_enqueue_work(p, entryp, 1);
        aio_proc_unlock(p);

        KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
                      (int)p, (int)entryp->uaiocbp, 0, 0, 0 );
    }

    if ( uap->mode == LIO_WAIT ) {
        aio_proc_lock_spin(p);
        while (lio_context->io_completed < lio_context->io_issued) {
            result = msleep(lio_context, aio_proc_mutex(p), PCATCH | PRIBIO | PSPIN, "lio_listio", 0);

            /* If we were interrupted, fail out (even if all finished) */
            if (result != 0) {
                call_result = EINTR;
                lio_context->io_waiter = 0;
                break;
            }
        }

        /* If all IOs have finished must free it */
        if (lio_context->io_completed == lio_context->io_issued) {
            free_context = TRUE;
        }

        aio_proc_unlock(p);
    }

    /* call_result == -1 means we had no trouble queueing up requests */
    if ( call_result == -1 ) {
        call_result = 0;
        *retval = 0;
    }

ExitRoutine:
    if ( entryp_listp != NULL )
        FREE( entryp_listp, M_TEMP );
    if ( aiocbpp != NULL )
        FREE( aiocbpp, M_TEMP );
    if ((lio_context != NULL) && ((lio_context->io_issued == 0) || (free_context == TRUE))) {
        free_lio_context(lio_context);
    }

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END,
                  (int)p, call_result, 0, 0, 0 );

    return( call_result );

} /* lio_listio */
/*
 * aio worker thread.  this is where all the real work gets done.
 * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
 * after new work is queued up.
 */
static void
aio_work_thread( void )
{
    aio_workq_entry     *entryp;
    int                 error;
    vm_map_t            currentmap;
    vm_map_t            oldmap = VM_MAP_NULL;
    task_t              oldaiotask = TASK_NULL;
    struct uthread      *uthreadp = NULL;

    for ( ;; ) {
        /*
         * returns with the entry ref'ed.
         * sleeps until work is available.
         */
        entryp = aio_get_some_work();

        KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START,
                      (int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0 );

        /*
         * Assume the target's address space identity for the duration
         * of the IO.  Note: don't need to have the entryp locked,
         * because the proc and map don't change until it's freed.
         */
        currentmap = get_task_map( (current_proc())->task );
        if ( currentmap != entryp->aio_map ) {
            uthreadp = (struct uthread *) get_bsdthread_info(current_thread());
            oldaiotask = uthreadp->uu_aio_task;
            uthreadp->uu_aio_task = entryp->procp->task;
            oldmap = vm_map_switch( entryp->aio_map );
        }

        if ( (entryp->flags & AIO_READ) != 0 ) {
            error = do_aio_read( entryp );
        }
        else if ( (entryp->flags & AIO_WRITE) != 0 ) {
            error = do_aio_write( entryp );
        }
        else if ( (entryp->flags & (AIO_FSYNC | AIO_DSYNC)) != 0 ) {
            error = do_aio_fsync( entryp );
        }
        else {
            printf( "%s - unknown aio request - flags 0x%02X \n",
                    __FUNCTION__, entryp->flags );
            error = EINVAL;
        }

        /* Restore old map */
        if ( currentmap != entryp->aio_map ) {
            (void) vm_map_switch( oldmap );
            uthreadp->uu_aio_task = oldaiotask;
        }

        KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END,
                      (int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval,
                      entryp->returnval, 0 );

        /* Set the result of the operation */
        aio_entry_lock_spin(entryp);
        entryp->errorval = error;
        aio_entry_unlock(entryp);

        /* we're done with the IO request so pop it off the active queue and */
        /* push it on the done queue */
        aio_proc_lock(entryp->procp);
        aio_proc_move_done_locked(entryp->procp, entryp);
        aio_proc_unlock(entryp->procp);

        OSDecrementAtomic(&aio_anchor.aio_inflight_count);

        /* remove our reference to the user land map. */
        if ( VM_MAP_NULL != entryp->aio_map ) {
            vm_map_t    my_map;

            my_map = entryp->aio_map;
            entryp->aio_map = VM_MAP_NULL;
            vm_map_deallocate( my_map );
        }

        /* Provide notifications */
        do_aio_completion( entryp );

        /* Will free if needed */
        aio_entry_unref(entryp);

    } /* for ( ;; ) */

    /* NOT REACHED */

} /* aio_work_thread */
/*
 * aio_get_some_work - get the next async IO request that is ready to be executed.
 * aio_fsync complicates matters a bit since we cannot do the fsync until all async
 * IO requests at the time the aio_fsync call came in have completed.
 * NOTE - AIO_LOCK must be held by caller
 */
static aio_workq_entry *
aio_get_some_work( void )
{
    aio_workq_entry     *entryp = NULL;
    aio_workq_t         queue = NULL;

    /* Just one queue for the moment.  In the future there will be many. */
    queue = &aio_anchor.aio_async_workqs[0];
    aio_workq_lock_spin(queue);
    if (queue->aioq_count == 0) {
        goto nowork;
    }

    /*
     * Hold the queue lock.
     *
     * pop some work off the work queue and add to our active queue
     * Always start with the queue lock held.
     */
    for ( ;; ) {
        /*
         * Pull off of work queue.  Once it's off, it can't be cancelled,
         * so we can take our ref once we drop the queue lock.
         */
        entryp = TAILQ_FIRST(&queue->aioq_entries);

        /*
         * If there's no work or only fsyncs that need delay, go to sleep
         * and then start anew from aio_work_thread
         */
        if (entryp == NULL) {
            goto nowork;
        }

        aio_workq_remove_entry_locked(queue, entryp);

        aio_workq_unlock(queue);

        /*
         * Check if it's an fsync that must be delayed.  No need to lock the entry;
         * that flag would have been set at initialization.
         */
        if ( (entryp->flags & AIO_FSYNC) != 0 ) {
            /*
             * Check for unfinished operations on the same file
             * in this proc's queue.
             */
            aio_proc_lock_spin(entryp->procp);
            if ( aio_delay_fsync_request( entryp ) ) {
                /* It needs to be delayed.  Put it back on the end of the work queue */
                KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) | DBG_FUNC_NONE,
                              (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

                aio_proc_unlock(entryp->procp);

                aio_workq_lock_spin(queue);
                aio_workq_add_entry_locked(queue, entryp);
                continue;
            }
            aio_proc_unlock(entryp->procp);
        }

        break;
    }

    aio_entry_ref(entryp);

    OSIncrementAtomic(&aio_anchor.aio_inflight_count);
    return( entryp );

nowork:
    /* We will wake up when someone enqueues something */
    wait_queue_assert_wait(queue->aioq_waitq, queue, THREAD_UNINT, 0);
    aio_workq_unlock(queue);
    thread_block( (thread_continue_t)aio_work_thread );

    /* NOT REACHED */
    return NULL;
}
/*
 * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed.
 * A big, simple hammer: only send it off if it's the most recently filed IO which has
 * not been completed.
 */
static boolean_t
aio_delay_fsync_request( aio_workq_entry *entryp )
{
    if (entryp == TAILQ_FIRST(&entryp->procp->p_aio_activeq)) {
        return FALSE;
    }

    return TRUE;
} /* aio_delay_fsync_request */
static aio_workq_entry *
aio_create_queue_entry(proc_t procp, user_addr_t aiocbp, void *group_tag, int kindOfIO)
{
    aio_workq_entry     *entryp;
    int                 result = 0;

    entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
    if ( entryp == NULL ) {
        result = EAGAIN;
        goto error_exit;
    }

    bzero( entryp, sizeof(*entryp) );

    /* fill in the rest of the aio_workq_entry */
    entryp->procp = procp;
    entryp->uaiocbp = aiocbp;
    entryp->flags |= kindOfIO;
    entryp->group_tag = group_tag;
    entryp->aio_map = VM_MAP_NULL;
    entryp->aio_refcount = 0;

    if ( proc_is64bit(procp) ) {
        struct user64_aiocb aiocb64;

        result = copyin( aiocbp, &aiocb64, sizeof(aiocb64) );
        if ( result == 0 )
            do_munge_aiocb_user64_to_user(&aiocb64, &entryp->aiocb);
    } else {
        struct user32_aiocb aiocb32;

        result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
        if ( result == 0 )
            do_munge_aiocb_user32_to_user( &aiocb32, &entryp->aiocb );
    }

    if ( result != 0 ) {
        result = EAGAIN;
        goto error_exit;
    }

    /* get a reference to the user land map in order to keep it around */
    entryp->aio_map = get_task_map( procp->task );
    vm_map_reference( entryp->aio_map );

    /* do some more validation on the aiocb and embedded file descriptor */
    result = aio_validate( entryp );

error_exit:
    if ( result && entryp != NULL ) {
        zfree( aio_workq_zonep, entryp );
        entryp = NULL;
    }

    return ( entryp );
}
/*
 * aio_queue_async_request - queue up an async IO request on our work queue then
 * wake up one of our worker threads to do the actual work.  We get a reference
 * to our caller's user land map in order to keep it around while we are
 * processing the request.
 */
static int
aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO)
{
    aio_workq_entry     *entryp;
    int                 result;
    int                 old_count;

    old_count = aio_increment_total_count();
    if (old_count >= aio_max_requests) {
        result = EAGAIN;
        goto error_noalloc;
    }

    entryp = aio_create_queue_entry( procp, aiocbp, 0, kindOfIO );
    if ( entryp == NULL ) {
        result = EAGAIN;
        goto error_noalloc;
    }

    aio_proc_lock_spin(procp);

    if ( is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
        result = EAGAIN;
        goto error_exit;
    }

    /* check our aio limits to throttle bad or rude user land behavior */
    if (aio_get_process_count( procp ) >= aio_max_requests_per_process) {
        printf("aio_queue_async_request(): too many in flight for proc: %d.\n", procp->p_aio_total_count);
        result = EAGAIN;
        goto error_exit;
    }

    /* Add the IO to proc and work queues, wake up threads as appropriate */
    lck_mtx_convert_spin(aio_proc_mutex(procp));
    aio_enqueue_work(procp, entryp, 1);

    aio_proc_unlock(procp);

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
                  (int)procp, (int)aiocbp, 0, 0, 0 );

    return( 0 );

error_exit:
    /*
     * This entry has not been queued up so no worries about
     * unlocked state and aio_map
     */
    aio_proc_unlock(procp);
    aio_free_request(entryp);

error_noalloc:
    aio_decrement_total_count();

    return( result );

} /* aio_queue_async_request */
/*
 * lio_create_entry
 *
 * Allocate an aio_workq_entry and fill it in.  If all goes well return 0
 * and pass the aio_workq_entry pointer back to our caller.
 *
 * Parameters:  procp           The process making the request
 *              aiocbp          The aio context buffer pointer
 *              group_tag       The group tag used to indicate a
 *                              group of operations has completed
 *              entrypp         Pointer to the pointer to receive the
 *                              address of the created aio_workq_entry
 *
 * Returns:     0               Successfully created
 *              EAGAIN          Try again (usually resource shortage)
 *
 * Notes:       We get a reference to our caller's user land map in order
 *              to keep it around while we are processing the request.
 *
 *              lio_listio calls behave differently at completion: they do
 *              completion notification when all async IO requests have
 *              completed.  We use group_tag to tag IO requests that behave
 *              in the delay notification manner.
 *
 *              All synchronous operations are considered to not have a
 *              signal routine associated with them (sigp == USER_ADDR_NULL).
 */
static int
lio_create_entry(proc_t procp, user_addr_t aiocbp, void *group_tag,
                 aio_workq_entry **entrypp)
{
    aio_workq_entry     *entryp;
    int                 result;

    entryp = aio_create_queue_entry( procp, aiocbp, group_tag, AIO_LIO );
    if ( entryp == NULL ) {
        result = EAGAIN;
        goto error_exit;
    }

    /*
     * Look for lio_listio LIO_NOP requests and ignore them; this is
     * not really an error, but we need to free our aio_workq_entry.
     */
    if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
        result = 0;
        goto error_exit;
    }

    *entrypp = entryp;
    return( 0 );

error_exit:
    if ( entryp != NULL ) {
        /*
         * This entry has not been queued up so no worries about
         * unlocked state and aio_map
         */
        aio_free_request(entryp);
    }

    return( result );

} /* lio_create_entry */
/*
 * aio_free_request - remove our reference on the user land map and
 * free the work queue entry resources.  The entry is off all lists
 * and has zero refcount, so no one can have a pointer to it.
 */
static int
aio_free_request(aio_workq_entry *entryp)
{
    /* remove our reference to the user land map. */
    if ( VM_MAP_NULL != entryp->aio_map ) {
        vm_map_deallocate(entryp->aio_map);
    }

    entryp->aio_refcount = -1; /* A bit of poisoning in case of bad refcounting. */

    zfree( aio_workq_zonep, entryp );

    return( 0 );

} /* aio_free_request */
/*
 * aio_validate - validate the aiocb passed in by one of the aio syscalls.
 */
static int
aio_validate( aio_workq_entry *entryp )
{
    struct fileproc     *fp;
    int                 flag;
    int                 result;

    result = 0;

    if ( (entryp->flags & AIO_LIO) != 0 ) {
        if ( entryp->aiocb.aio_lio_opcode == LIO_READ )
            entryp->flags |= AIO_READ;
        else if ( entryp->aiocb.aio_lio_opcode == LIO_WRITE )
            entryp->flags |= AIO_WRITE;
        else if ( entryp->aiocb.aio_lio_opcode == LIO_NOP )
            return( 0 );
        else
            return( EINVAL );
    }

    flag = FREAD;
    if ( (entryp->flags & (AIO_WRITE | AIO_FSYNC | AIO_DSYNC)) != 0 ) {
        flag = FWRITE;
    }

    if ( (entryp->flags & (AIO_READ | AIO_WRITE)) != 0 ) {
        if ( entryp->aiocb.aio_nbytes > INT_MAX       ||
             entryp->aiocb.aio_buf == USER_ADDR_NULL  ||
             entryp->aiocb.aio_offset < 0 )
            return( EINVAL );
    }

    /*
     * validate aiocb.aio_sigevent.  at this point we only support
     * sigev_notify equal to SIGEV_SIGNAL or SIGEV_NONE.  this means
     * sigev_value, sigev_notify_function, and sigev_notify_attributes
     * are ignored, since SIGEV_THREAD is unsupported.  This is consistent
     * with no [RTS] (RealTime Signal) option group support.
     */
    switch ( entryp->aiocb.aio_sigevent.sigev_notify ) {
    case SIGEV_SIGNAL:
        {
        int     signum;

        /* make sure we have a valid signal number */
        signum = entryp->aiocb.aio_sigevent.sigev_signo;
        if ( signum <= 0 || signum >= NSIG ||
             signum == SIGKILL || signum == SIGSTOP )
            return (EINVAL);
        }
        break;

    case SIGEV_NONE:
        break;

    case SIGEV_THREAD:
        /* Unsupported [RTS] */

    default:
        return (EINVAL);
    }

    /* validate the file descriptor and that the file was opened
     * for the appropriate read / write access.
     */
    proc_fdlock(entryp->procp);

    result = fp_lookup( entryp->procp, entryp->aiocb.aio_fildes, &fp, 1 );
    if ( result == 0 ) {
        if ( (fp->f_fglob->fg_flag & flag) == 0 ) {
            /* we don't have read or write access */
            result = EBADF;
        }
        else if ( fp->f_fglob->fg_type != DTYPE_VNODE ) {
            /* this is not a file */
            result = ESPIPE;
        } else
            fp->f_flags |= FP_AIOISSUED;

        fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 1);
    }
    else {
        result = EBADF;
    }

    proc_fdunlock(entryp->procp);

    return( result );

} /* aio_validate */
static int
aio_increment_total_count()
{
    return OSIncrementAtomic(&aio_anchor.aio_total_count);
}

static int
aio_decrement_total_count()
{
    int old = OSDecrementAtomic(&aio_anchor.aio_total_count);
    if (old <= 0) {
        panic("Negative total AIO count!\n");
    }

    return old;
}

static int
aio_get_process_count(proc_t procp)
{
    return procp->p_aio_total_count;

} /* aio_get_process_count */

static int
aio_get_all_queues_count( void )
{
    return aio_anchor.aio_total_count;

} /* aio_get_all_queues_count */
/*
 * do_aio_completion.  Handle async IO completion.
 */
static void
do_aio_completion( aio_workq_entry *entryp )
{
	boolean_t		lastLioCompleted = FALSE;
	aio_lio_context		*lio_context = NULL;
	int			waiter = 0;

	lio_context = (aio_lio_context *)entryp->group_tag;

	if (lio_context != NULL) {

		aio_proc_lock_spin(entryp->procp);

		/* Account for this I/O completing. */
		lio_context->io_completed++;

		/* Are we done with this lio context? */
		if (lio_context->io_issued == lio_context->io_completed) {
			lastLioCompleted = TRUE;
		}

		waiter = lio_context->io_waiter;

		/* explicit wakeup of lio_listio() waiting in LIO_WAIT */
		if ((entryp->flags & AIO_LIO_NOTIFY) && (lastLioCompleted) && (waiter != 0)) {
			/* wake up the waiter */
			wakeup(lio_context);
		}

		aio_proc_unlock(entryp->procp);
	}

	if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
	     (entryp->flags & AIO_DISABLE) == 0 ) {

		boolean_t	performSignal = FALSE;

		if (lio_context == NULL) {
			performSignal = TRUE;
		}
		else {
			/*
			 * If this was the last request in the group and a signal
			 * is desired, send one.
			 */
			performSignal = lastLioCompleted;
		}

		if (performSignal) {

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) | DBG_FUNC_NONE,
				(int)entryp->procp, (int)entryp->uaiocbp,
				entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 );

			psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );
		}
	}

	if ((entryp->flags & AIO_EXIT_WAIT) && (entryp->flags & AIO_CLOSE_WAIT)) {
		panic("Close and exit flags set at the same time\n");
	}

	/*
	 * need to handle case where a process is trying to exit, exec, or
	 * close and is currently waiting for active aio requests to complete.
	 * If AIO_CLEANUP_WAIT is set then we need to look to see if there are any
	 * other requests in the active queue for this process.  If there are
	 * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel.
	 * If there are some still active then do nothing - we only want to
	 * wakeup when all active aio requests for the process are complete.
	 *
	 * Don't need to lock the entry or proc to check the cleanup flag.  It can
	 * only be set for cancellation while the entryp is still on a proc list;
	 * now it's off, so that flag is already set if it's going to be.
	 */
	if ( (entryp->flags & AIO_EXIT_WAIT) != 0 ) {
		int		active_requests;

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
			(int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

		aio_proc_lock_spin(entryp->procp);
		active_requests = aio_active_requests_for_process( entryp->procp );
		if ( active_requests < 1 ) {
			/*
			 * no active aio requests for this process, continue exiting.  In this
			 * case, there should be no one else waiting on the proc in AIO...
			 */
			wakeup_one((caddr_t)&entryp->procp->AIO_CLEANUP_SLEEP_CHAN);
			aio_proc_unlock(entryp->procp);

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
				(int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
		} else {
			aio_proc_unlock(entryp->procp);
		}
	}

	if ( (entryp->flags & AIO_CLOSE_WAIT) != 0 ) {
		int		active_requests;

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
			(int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

		aio_proc_lock_spin(entryp->procp);
		active_requests = aio_proc_active_requests_for_file( entryp->procp, entryp->aiocb.aio_fildes );
		if ( active_requests < 1 ) {
			/* Can't wakeup_one(); multiple closes might be in progress. */
			wakeup(&entryp->procp->AIO_CLEANUP_SLEEP_CHAN);
			aio_proc_unlock(entryp->procp);

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
				(int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
		} else {
			aio_proc_unlock(entryp->procp);
		}
	}

	/*
	 * A thread in aio_suspend() wants to know about completed IOs.  If it checked
	 * the done list before we moved our AIO there, then it already asserted its
	 * wait, and we can wake it up without holding the lock.  If it checked the
	 * list after we did our move, then it has already seen the AIO that we moved.
	 * Either way, we can do our wakeup without holding the lock.
	 */
	wakeup( (caddr_t) &entryp->procp->AIO_SUSPEND_SLEEP_CHAN );
	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE,
		(int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

	/*
	 * free the LIO context if the last lio completed and no thread is
	 * waiting for it.
	 */
	if (lastLioCompleted && (waiter == 0))
		free_lio_context(lio_context);

} /* do_aio_completion */
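
/*
 * Example (illustrative sketch only): the user-space waiter that the
 * AIO_SUSPEND_SLEEP_CHAN wakeup above serves -- a thread blocked in
 * aio_suspend() until a request completes, which then collects the result
 * with aio_error()/aio_return().  Standard POSIX <aio.h>; error handling
 * omitted.
 */
#if 0	/* user-space illustration; not compiled as part of the kernel */
#include <aio.h>
#include <errno.h>

static ssize_t
wait_for_one(struct aiocb *cbp)
{
	const struct aiocb	*list[1] = { cbp };

	/* blocks until do_aio_completion() wakes the suspend channel */
	while (aio_error(cbp) == EINPROGRESS)
		(void)aio_suspend(list, 1, NULL);

	/* reap the request; after this the kernel entry is released */
	return (aio_return(cbp));
}
#endif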
static int
do_aio_read( aio_workq_entry *entryp )
{
	struct fileproc		*fp;
	int			error;
	struct vfs_context	context;

	if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp, 0)) )
		return(error);
	if ( (fp->f_fglob->fg_flag & FREAD) == 0 ) {
		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
		return(EBADF);
	}

	/*
	 * Needs vfs_context_t from vfs_context_create() in entryp!
	 */
	context.vc_thread = proc_thread(entryp->procp);	/* XXX */
	context.vc_ucred = fp->f_fglob->fg_cred;

	error = dofileread(&context, fp,
			entryp->aiocb.aio_buf,
			entryp->aiocb.aio_nbytes,
			entryp->aiocb.aio_offset, FOF_OFFSET,
			&entryp->returnval);
	fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);

	return( error );

} /* do_aio_read */
static int
do_aio_write( aio_workq_entry *entryp )
{
	struct fileproc		*fp;
	int			error;
	int			flags;
	struct vfs_context	context;

	if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp, 0)) )
		return(error);
	if ( (fp->f_fglob->fg_flag & FWRITE) == 0 ) {
		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
		return(EBADF);
	}

	flags = FOF_PCRED;
	if ( (fp->f_fglob->fg_flag & O_APPEND) == 0 ) {
		flags |= FOF_OFFSET;
	}

	/*
	 * Needs vfs_context_t from vfs_context_create() in entryp!
	 */
	context.vc_thread = proc_thread(entryp->procp);	/* XXX */
	context.vc_ucred = fp->f_fglob->fg_cred;

	/* NB: tell dofilewrite the offset, and to use the proc cred */
	error = dofilewrite(&context,
			fp,
			entryp->aiocb.aio_buf,
			entryp->aiocb.aio_nbytes,
			entryp->aiocb.aio_offset,
			flags,
			&entryp->returnval);

	fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);

	return( error );

} /* do_aio_write */
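
/*
 * Example (illustrative sketch only): the user-visible side of the O_APPEND
 * handling above -- when the descriptor was opened with O_APPEND, FOF_OFFSET
 * is not passed down, so the async write appends to end-of-file and
 * aio_offset is effectively ignored; otherwise the write lands at aio_offset.
 * Standard POSIX <aio.h>; error handling omitted.
 */
#if 0	/* user-space illustration; not compiled as part of the kernel */
#include <aio.h>
#include <string.h>

static int
queue_append_write(int append_fd, struct aiocb *cbp, char *msg, size_t len)
{
	memset(cbp, 0, sizeof(*cbp));
	cbp->aio_fildes = append_fd;		/* opened with O_WRONLY | O_APPEND */
	cbp->aio_buf = msg;
	cbp->aio_nbytes = len;
	cbp->aio_offset = 0;			/* ignored for O_APPEND descriptors */

	return (aio_write(cbp));
}
#endif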
/*
 * aio_active_requests_for_process - return number of active async IO
 * requests for the given process.
 */
static int
aio_active_requests_for_process(proc_t procp)
{
	return( procp->p_aio_active_count );

} /* aio_active_requests_for_process */
/*
 * Called with the proc locked.
 */
static int
aio_proc_active_requests_for_file(proc_t procp, int fd)
{
	int			count = 0;
	aio_workq_entry		*entryp;

	TAILQ_FOREACH(entryp, &procp->p_aio_activeq, aio_proc_link) {
		if (entryp->aiocb.aio_fildes == fd) {
			count++;
		}
	}

	return count;

} /* aio_proc_active_requests_for_file */
static int
do_aio_fsync( aio_workq_entry *entryp )
{
	struct vfs_context	context;
	struct vnode		*vp;
	struct fileproc		*fp;
	int			sync_flag;
	int			error;

	/*
	 * We are never called unless either AIO_FSYNC or AIO_DSYNC are set.
	 *
	 * If AIO_DSYNC is set, we can tell the lower layers that it is OK
	 * to mark for update the metadata not strictly necessary for data
	 * retrieval, rather than forcing it to disk.
	 *
	 * If AIO_FSYNC is set, we also have to wait until metadata not strictly
	 * necessary for data retrieval (e.g. atime, mtime, ctime) has been
	 * committed to stable storage.
	 *
	 * Metadata necessary for data retrieval must be committed to stable
	 * storage in either case (file length, etc.).
	 */
	if (entryp->flags & AIO_FSYNC)
		sync_flag = MNT_WAIT;
	else
		sync_flag = MNT_DWAIT;

	error = fp_getfvp( entryp->procp, entryp->aiocb.aio_fildes, &fp, &vp);
	if ( error == 0 ) {
		if ( (error = vnode_getwithref(vp)) ) {
			fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
			entryp->returnval = -1;
			return(error);
		}
		context.vc_thread = current_thread();
		context.vc_ucred = fp->f_fglob->fg_cred;

		error = VNOP_FSYNC( vp, sync_flag, &context);

		(void)vnode_put(vp);

		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
	}
	if ( error != 0 )
		entryp->returnval = -1;

	return( error );

} /* do_aio_fsync */
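
/*
 * Example (illustrative sketch only): the user-space requests that reach the
 * code above.  aio_fsync(O_SYNC, ...) arrives with AIO_FSYNC set and maps to
 * MNT_WAIT (all metadata flushed); O_DSYNC, where the aio_fsync() entry point
 * accepts it (POSIX specifies both, support varies by release), maps to
 * AIO_DSYNC / MNT_DWAIT.  Standard POSIX <aio.h>; error handling omitted.
 */
#if 0	/* user-space illustration; not compiled as part of the kernel */
#include <aio.h>
#include <fcntl.h>
#include <string.h>

static int
sync_file(int fd, struct aiocb *cbp, int data_only)
{
	memset(cbp, 0, sizeof(*cbp));
	cbp->aio_fildes = fd;

	/* O_SYNC -> AIO_FSYNC/MNT_WAIT; O_DSYNC (if accepted) -> AIO_DSYNC/MNT_DWAIT */
	return (aio_fsync(data_only ? O_DSYNC : O_SYNC, cbp));
}
#endif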
/*
 * is_already_queued - runs through our queues to see if the given
 * aiocbp / process is there.  Returns TRUE if there is a match
 * on any of our aio queues.
 *
 * Called with proc aio lock held (can be held spin)
 */
static boolean_t
is_already_queued(proc_t procp,
		  user_addr_t aiocbp)
{
	aio_workq_entry		*entryp;
	boolean_t		result;

	result = FALSE;

	/* look for matches on our queue of async IO requests that have completed */
	TAILQ_FOREACH( entryp, &procp->p_aio_doneq, aio_proc_link ) {
		if ( aiocbp == entryp->uaiocbp ) {
			result = TRUE;
			goto ExitThisRoutine;
		}
	}

	/* look for matches on our queue of active async IO requests */
	TAILQ_FOREACH( entryp, &procp->p_aio_activeq, aio_proc_link ) {
		if ( aiocbp == entryp->uaiocbp ) {
			result = TRUE;
			goto ExitThisRoutine;
		}
	}

ExitThisRoutine:
	return( result );

} /* is_already_queued */
static void
free_lio_context(aio_lio_context *context)
{
	OSDecrementAtomic(&lio_contexts_alloced);

	FREE( context, M_TEMP );

} /* free_lio_context */
/*
 * aio initialization
 */
__private_extern__ void
aio_init( void )
{
	int	i;

	aio_lock_grp_attr = lck_grp_attr_alloc_init();
	aio_proc_lock_grp = lck_grp_alloc_init("aio_proc", aio_lock_grp_attr);
	aio_entry_lock_grp = lck_grp_alloc_init("aio_entry", aio_lock_grp_attr);
	aio_queue_lock_grp = lck_grp_alloc_init("aio_queue", aio_lock_grp_attr);
	aio_lock_attr = lck_attr_alloc_init();

	lck_mtx_init(&aio_entry_mtx, aio_entry_lock_grp, aio_lock_attr);
	lck_mtx_init(&aio_proc_mtx, aio_proc_lock_grp, aio_lock_attr);

	aio_anchor.aio_inflight_count = 0;
	aio_anchor.aio_done_count = 0;
	aio_anchor.aio_total_count = 0;
	aio_anchor.aio_num_workqs = AIO_NUM_WORK_QUEUES;

	for (i = 0; i < AIO_NUM_WORK_QUEUES; i++) {
		aio_workq_init(&aio_anchor.aio_async_workqs[i]);
	}

	i = sizeof( aio_workq_entry );
	aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );

	_aio_create_worker_threads( aio_worker_threads );

} /* aio_init */
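
/*
 * Example (illustrative sketch only): inspecting the tunables consumed above
 * (aio_max_requests, the per-process limit, and aio_worker_threads) from user
 * space, assuming they are exported as the kern.aiomax, kern.aioprocmax and
 * kern.aiothreads sysctls as on typical releases.
 */
#if 0	/* user-space illustration; not compiled as part of the kernel */
#include <sys/sysctl.h>
#include <stdio.h>

static void
show_aio_limits(void)
{
	int	val;
	size_t	len = sizeof(val);

	if (sysctlbyname("kern.aiomax", &val, &len, NULL, 0) == 0)
		printf("system-wide aio limit: %d\n", val);
	len = sizeof(val);
	if (sysctlbyname("kern.aioprocmax", &val, &len, NULL, 0) == 0)
		printf("per-process aio limit: %d\n", val);
	len = sizeof(val);
	if (sysctlbyname("kern.aiothreads", &val, &len, NULL, 0) == 0)
		printf("aio worker threads:    %d\n", val);
}
#endif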
/*
 * aio worker threads created here.
 */
__private_extern__ void
_aio_create_worker_threads( int num )
{
	int	i;

	/* create some worker threads to handle the async IO requests */
	for ( i = 0; i < num; i++ ) {
		thread_t	myThread;

		if ( KERN_SUCCESS != kernel_thread_start((thread_continue_t)aio_work_thread, NULL, &myThread) ) {
			printf( "%s - failed to create a work thread \n", __FUNCTION__ );
		}
		else
			thread_deallocate(myThread);
	}

	return;

} /* _aio_create_worker_threads */
/*
 * Return the current activation utask
 */
task_t
get_aiotask(void)
{
	return ((struct uthread *)get_bsdthread_info(current_thread()))->uu_aio_task;
}
/*
 * In the case of an aiocb from a 32-bit process we need to expand some
 * longs and pointers to the correct sizes in order to let downstream code
 * always work on the same type of aiocb (in our case that is a user_aiocb).
 */
static void
do_munge_aiocb_user32_to_user( struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
{
	the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
	the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
	the_user_aiocbp->aio_buf = CAST_USER_ADDR_T(my_aiocbp->aio_buf);
	the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
	the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
	the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;

	/* special case here.  since we do not know if sigev_value is an */
	/* int or a ptr we do NOT cast the ptr to a user_addr_t.  This */
	/* means if we send this info back to user space we need to remember */
	/* sigev_value was not expanded for the 32-bit case. */
	/* NOTE - this does NOT affect us since we don't support sigev_value */
	/* yet in the aio context. */

	the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
	the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
	the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
		my_aiocbp->aio_sigevent.sigev_value.sival_int;
	the_user_aiocbp->aio_sigevent.sigev_notify_function =
		CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function);
	the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
		CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_attributes);
}
/*
 * Similar for a 64-bit user process, so that we don't need to satisfy
 * the alignment constraints of the original user64_aiocb.
 */
static void
do_munge_aiocb_user64_to_user( struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
{
	the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
	the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
	the_user_aiocbp->aio_buf = my_aiocbp->aio_buf;
	the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
	the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
	the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;

	the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
	the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
	the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
		my_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int;
	the_user_aiocbp->aio_sigevent.sigev_notify_function =
		my_aiocbp->aio_sigevent.sigev_notify_function;
	the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
		my_aiocbp->aio_sigevent.sigev_notify_attributes;
}