/*
 * Copyright (c) 2003-2016 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * todo:
 *	1) ramesh is looking into how to replace taking a reference on
 *	   the user's map (vm_map_reference()) since it is believed that
 *	   would not hold the process for us.
 *	2) david is looking into a way for us to set the priority of the
 *	   worker threads to match that of the user's thread when the
 *	   async IO was queued.
 */

/*
 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
 */
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file_internal.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/vnode_internal.h>
#include <sys/malloc.h>
#include <sys/mount_internal.h>
#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/sysctl.h>
#include <sys/unistd.h>

#include <sys/aio_kern.h>
#include <sys/sysproto.h>

#include <machine/limits.h>

#include <mach/mach_types.h>
#include <kern/kern_types.h>
#include <kern/waitq.h>
#include <kern/zalloc.h>
#include <kern/task.h>
#include <kern/sched_prim.h>

#include <vm/vm_map.h>

#include <libkern/OSAtomic.h>

#include <sys/kdebug.h>

#define AIO_work_queued			1
#define AIO_worker_wake			2
#define AIO_completion_sig		3
#define AIO_completion_cleanup_wait	4
#define AIO_completion_cleanup_wake	5
#define AIO_completion_suspend_wake	6
#define AIO_fsync_delay			7

#define AIO_cancel_async_workq		11
#define AIO_cancel_sync_workq		12
#define AIO_cancel_activeq		13
#define AIO_cancel_doneq		14

#define AIO_error_val			61
#define AIO_error_activeq		62
#define AIO_error_workq			63

#define AIO_return_val			71
#define AIO_return_activeq		72
#define AIO_return_workq		73

#define AIO_exit_sleep			91

#define AIO_close			100
#define AIO_close_sleep			101

#define AIO_suspend			110
#define AIO_suspend_sleep		111

#define AIO_worker_thread		120

#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
/*
 * aio requests queue up on the aio_async_workq or lio_sync_workq (for
 * lio_listio LIO_WAIT).  Requests then move to the per-process aio_activeq
 * (proc.aio_activeq) when one of our worker threads starts the IO.
 * Finally, requests move to the per-process aio_doneq (proc.aio_doneq)
 * when the IO request completes.  The request remains on aio_doneq until
 * the user process calls aio_return or the process exits; either way, that
 * is our trigger to release aio resources.
 */
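/*
 * Illustrative summary of the lifecycle described above (editorial aid, not
 * additional code):
 *
 *	aio_async_workq / lio_sync_workq     queued, still cancellable
 *	        |    a worker thread picks the request up
 *	        v
 *	proc.aio_activeq                     IO in flight, cannot be cancelled
 *	        |    the IO completes
 *	        v
 *	proc.aio_doneq                       held until aio_return() or exit/exec
 */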
typedef struct aio_workq {
    TAILQ_HEAD(, aio_workq_entry)	aioq_entries;
    int					aioq_count;
    lck_mtx_t				aioq_mtx;
    struct waitq			aioq_waitq;
} *aio_workq_t;

#define AIO_NUM_WORK_QUEUES 1
struct aio_anchor_cb
{
    volatile int32_t	aio_inflight_count;	/* entries that have been taken from a workq */
    volatile int32_t	aio_done_count;		/* entries on all done queues (proc.aio_doneq) */
    volatile int32_t	aio_total_count;	/* total extant entries */

    /* Hash table of queues here */
    struct aio_workq	aio_async_workqs[AIO_NUM_WORK_QUEUES];
};
typedef struct aio_anchor_cb aio_anchor_cb;

struct aio_lio_context
{
    int		io_waiter;
    int		io_issued;
    int		io_completed;
};
typedef struct aio_lio_context aio_lio_context;
/*
 * Notes on aio sleep / wake channels.
 * We currently pick a couple of fields within the proc structure that give us
 * sleep channels that do not collide with any other kernel routines.
 * At this time, for binary compatibility reasons, we cannot create new proc fields.
 */
#define AIO_SUSPEND_SLEEP_CHAN  p_aio_active_count
#define AIO_CLEANUP_SLEEP_CHAN  p_aio_total_count
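/*
 * Hedged sketch (an assumption, mirroring names used later in this file) of
 * how the cleanup channel is used: a thread that must wait for every active
 * AIO sleeps on AIO_CLEANUP_SLEEP_CHAN, and the completion path wakes it once
 * the active count drains.  The wait-message string is hypothetical.
 *
 *	aio_proc_lock(p);
 *	while (p->p_aio_active_count != 0) {
 *		msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p),
 *		       PRIBIO, "aio_wait_all", 0);
 *	}
 *	aio_proc_unlock(p);
 */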
#define ASSERT_AIO_FROM_PROC(aiop, theproc) \
    if ((aiop)->procp != (theproc)) { \
	panic("AIO on a proc list that does not belong to that proc.\n"); \
    }
static void		aio_proc_lock(proc_t procp);
static void		aio_proc_lock_spin(proc_t procp);
static void		aio_proc_unlock(proc_t procp);
static lck_mtx_t*	aio_proc_mutex(proc_t procp);
static void		aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp);
static void		aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp);
static int		aio_get_process_count(proc_t procp);
static int		aio_active_requests_for_process(proc_t procp);
static int		aio_proc_active_requests_for_file(proc_t procp, int fd);
static boolean_t	is_already_queued(proc_t procp, user_addr_t aiocbp);
static boolean_t	should_cancel(aio_workq_entry *entryp, user_addr_t aiocbp, int fd);

static void		aio_entry_lock(aio_workq_entry *entryp);
static void		aio_entry_lock_spin(aio_workq_entry *entryp);
static aio_workq_t	aio_entry_workq(aio_workq_entry *entryp);
static lck_mtx_t*	aio_entry_mutex(__unused aio_workq_entry *entryp);
static void		aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
static void		aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
static void		aio_entry_ref_locked(aio_workq_entry *entryp);
static void		aio_entry_unref_locked(aio_workq_entry *entryp);
static void		aio_entry_ref(aio_workq_entry *entryp);
static void		aio_entry_unref(aio_workq_entry *entryp);
static void		aio_entry_update_for_cancel(aio_workq_entry *entryp, boolean_t cancelled,
				int wait_for_completion, boolean_t disable_notification);
static int		aio_entry_try_workq_remove(aio_workq_entry *entryp);
static boolean_t	aio_delay_fsync_request( aio_workq_entry *entryp );
static int		aio_free_request(aio_workq_entry *entryp);

static void		aio_workq_init(aio_workq_t wq);
static void		aio_workq_lock_spin(aio_workq_t wq);
static void		aio_workq_unlock(aio_workq_t wq);
static lck_mtx_t*	aio_workq_mutex(aio_workq_t wq);

static void		aio_work_thread( void );
static aio_workq_entry	*aio_get_some_work( void );

static int		aio_get_all_queues_count( void );
static int		aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO );
static int		aio_validate( aio_workq_entry *entryp );
static int		aio_increment_total_count(void);
static int		aio_decrement_total_count(void);

static int		do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp, int wait_for_completion, boolean_t disable_notification);
static void		do_aio_completion( aio_workq_entry *entryp );
static int		do_aio_fsync( aio_workq_entry *entryp );
static int		do_aio_read( aio_workq_entry *entryp );
static int		do_aio_write( aio_workq_entry *entryp );
static void		do_munge_aiocb_user32_to_user( struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
static void		do_munge_aiocb_user64_to_user( struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
static int		lio_create_entry(proc_t procp,
					 user_addr_t aiocbp,
					 void *group_tag,
					 aio_workq_entry **entrypp );
static aio_workq_entry	*aio_create_queue_entry(proc_t procp,
					user_addr_t aiocbp,
					void *group_tag,
					int kindOfIO);
static user_addr_t	*aio_copy_in_list(proc_t procp, user_addr_t aiocblist, int nent);
static void		free_lio_context(aio_lio_context *context);
static void		aio_enqueue_work( proc_t procp, aio_workq_entry *entryp, int proc_locked);

#define ASSERT_AIO_PROC_LOCK_OWNED(p)	lck_mtx_assert(aio_proc_mutex((p)), LCK_MTX_ASSERT_OWNED)
#define ASSERT_AIO_WORKQ_LOCK_OWNED(q)	lck_mtx_assert(aio_workq_mutex((q)), LCK_MTX_ASSERT_OWNED)
#define ASSERT_AIO_ENTRY_LOCK_OWNED(e)	lck_mtx_assert(aio_entry_mutex((e)), LCK_MTX_ASSERT_OWNED)

/*
 * EXTERNAL PROTOTYPES
 */

/* in ...bsd/kern/sys_generic.c */
extern int dofileread(vfs_context_t ctx, struct fileproc *fp,
		      user_addr_t bufp, user_size_t nbyte,
		      off_t offset, int flags, user_ssize_t *retval);
extern int dofilewrite(vfs_context_t ctx, struct fileproc *fp,
		       user_addr_t bufp, user_size_t nbyte, off_t offset,
		       int flags, user_ssize_t *retval);

static uint32_t lio_contexts_alloced = 0;

/*
 * aio external global variables.
 */
extern int aio_max_requests;			/* AIO_MAX - configurable */
extern int aio_max_requests_per_process;	/* AIO_PROCESS_MAX - configurable */
extern int aio_worker_threads;			/* AIO_THREAD_COUNT - configurable */

/*
 * aio static variables.
 */
static aio_anchor_cb	aio_anchor;
static lck_grp_t	*aio_proc_lock_grp;
static lck_grp_t	*aio_entry_lock_grp;
static lck_grp_t	*aio_queue_lock_grp;
static lck_attr_t	*aio_lock_attr;
static lck_grp_attr_t	*aio_lock_grp_attr;
static struct zone	*aio_workq_zonep;
static lck_mtx_t	aio_entry_mtx;
static lck_mtx_t	aio_proc_mtx;
static void
aio_entry_lock(__unused aio_workq_entry *entryp)
{
    lck_mtx_lock(&aio_entry_mtx);
}

static void
aio_entry_lock_spin(__unused aio_workq_entry *entryp)
{
    lck_mtx_lock_spin(&aio_entry_mtx);
}

static void
aio_entry_unlock(__unused aio_workq_entry *entryp)
{
    lck_mtx_unlock(&aio_entry_mtx);
}

static aio_workq_t
aio_entry_workq(__unused aio_workq_entry *entryp)
{
    return &aio_anchor.aio_async_workqs[0];
}

static lck_mtx_t*
aio_entry_mutex(__unused aio_workq_entry *entryp)
{
    return &aio_entry_mtx;
}

static void
aio_workq_init(aio_workq_t wq)
{
    TAILQ_INIT(&wq->aioq_entries);
    lck_mtx_init(&wq->aioq_mtx, aio_queue_lock_grp, aio_lock_attr);
    waitq_init(&wq->aioq_waitq, SYNC_POLICY_FIFO);
}

/*
 * Can be passed a queue which is locked spin.
 */
static void
aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
{
    ASSERT_AIO_WORKQ_LOCK_OWNED(queue);

    if (entryp->aio_workq_link.tqe_prev == NULL) {
	panic("Trying to remove an entry from a work queue, but it is not on a queue\n");
    }

    TAILQ_REMOVE(&queue->aioq_entries, entryp, aio_workq_link);
    entryp->aio_workq_link.tqe_prev = NULL; /* Not on a workq */

    if (queue->aioq_count < 0) {
	panic("Negative count on a queue.\n");
    }
}

static void
aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
{
    ASSERT_AIO_WORKQ_LOCK_OWNED(queue);

    TAILQ_INSERT_TAIL(&queue->aioq_entries, entryp, aio_workq_link);
    if (queue->aioq_count < 0) {
	panic("Negative count on a queue.\n");
    }
}

static void
aio_proc_lock(proc_t procp)
{
    lck_mtx_lock(aio_proc_mutex(procp));
}

static void
aio_proc_lock_spin(proc_t procp)
{
    lck_mtx_lock_spin(aio_proc_mutex(procp));
}
static void
aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp)
{
    ASSERT_AIO_PROC_LOCK_OWNED(procp);

    TAILQ_REMOVE(&procp->p_aio_activeq, entryp, aio_proc_link);
    TAILQ_INSERT_TAIL(&procp->p_aio_doneq, entryp, aio_proc_link);
    procp->p_aio_active_count--;
    OSIncrementAtomic(&aio_anchor.aio_done_count);
}

static void
aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp)
{
    TAILQ_REMOVE(&procp->p_aio_doneq, entryp, aio_proc_link);
    OSDecrementAtomic(&aio_anchor.aio_done_count);
    aio_decrement_total_count();
    procp->p_aio_total_count--;
}

static void
aio_proc_unlock(proc_t procp)
{
    lck_mtx_unlock(aio_proc_mutex(procp));
}

static lck_mtx_t*
aio_proc_mutex(proc_t procp)
{
    return &procp->p_mlock;
}

static void
aio_entry_ref_locked(aio_workq_entry *entryp)
{
    ASSERT_AIO_ENTRY_LOCK_OWNED(entryp);

    if (entryp->aio_refcount < 0) {
	panic("AIO workq entry with a negative refcount.\n");
    }
    entryp->aio_refcount++;
}

/* Return 1 if you've freed it */
static void
aio_entry_unref_locked(aio_workq_entry *entryp)
{
    ASSERT_AIO_ENTRY_LOCK_OWNED(entryp);

    entryp->aio_refcount--;
    if (entryp->aio_refcount < 0) {
	panic("AIO workq entry with a negative refcount.\n");
    }
}

static void
aio_entry_ref(aio_workq_entry *entryp)
{
    aio_entry_lock_spin(entryp);
    aio_entry_ref_locked(entryp);
    aio_entry_unlock(entryp);
}
static void
aio_entry_unref(aio_workq_entry *entryp)
{
    aio_entry_lock_spin(entryp);
    aio_entry_unref_locked(entryp);

    if ((entryp->aio_refcount == 0) && ((entryp->flags & AIO_DO_FREE) != 0)) {
	aio_entry_unlock(entryp);
	aio_free_request(entryp);
    } else {
	aio_entry_unlock(entryp);
    }
}

static void
aio_entry_update_for_cancel(aio_workq_entry *entryp, boolean_t cancelled, int wait_for_completion, boolean_t disable_notification)
{
    aio_entry_lock_spin(entryp);

    if (cancelled) {
	aio_entry_ref_locked(entryp);
	entryp->errorval = ECANCELED;
	entryp->returnval = -1;
    }

    if ( wait_for_completion ) {
	entryp->flags |= wait_for_completion; /* flag for special completion processing */
    }

    if ( disable_notification ) {
	entryp->flags |= AIO_DISABLE; /* Don't want a signal */
    }

    aio_entry_unlock(entryp);
}

static int
aio_entry_try_workq_remove(aio_workq_entry *entryp)
{
    /* Can only be cancelled if it's still on a work queue */
    if (entryp->aio_workq_link.tqe_prev != NULL) {
	aio_workq_t queue;

	/* Will have to check again under the lock */
	queue = aio_entry_workq(entryp);
	aio_workq_lock_spin(queue);
	if (entryp->aio_workq_link.tqe_prev != NULL) {
	    aio_workq_remove_entry_locked(queue, entryp);
	    aio_workq_unlock(queue);
	    return 1;
	} else {
	    aio_workq_unlock(queue);
	}
    }

    return 0;
}

static void
aio_workq_lock_spin(aio_workq_t wq)
{
    lck_mtx_lock_spin(aio_workq_mutex(wq));
}

static void
aio_workq_unlock(aio_workq_t wq)
{
    lck_mtx_unlock(aio_workq_mutex(wq));
}

static lck_mtx_t*
aio_workq_mutex(aio_workq_t wq)
{
    return &wq->aioq_mtx;
}
/*
 * aio_cancel - attempt to cancel one or more async IO requests currently
 * outstanding against file descriptor uap->fd.  If uap->aiocbp is not
 * NULL then only one specific IO is cancelled (if possible).  If uap->aiocbp
 * is NULL then all outstanding async IO requests for the given file
 * descriptor are cancelled (if possible).
 */
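/*
 * Illustrative user-space caller (not part of this file), showing the three
 * result values this syscall can hand back:
 *
 *	switch (aio_cancel(fd, NULL)) {
 *	case AIO_CANCELED:	// every outstanding request on fd was cancelled
 *	case AIO_NOTCANCELED:	// at least one is still in flight; poll with aio_error()
 *	case AIO_ALLDONE:	// nothing was outstanding
 *	}
 */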
aio_cancel(proc_t p, struct aio_cancel_args *uap, int *retval )
    struct user_aiocb		my_aiocb;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START,
		  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    /* quick check to see if there are any async IO requests queued up */
    if (aio_get_all_queues_count() < 1) {
	*retval = AIO_ALLDONE;

    if ( uap->aiocbp != USER_ADDR_NULL ) {
	if ( proc_is64bit(p) ) {
	    struct user64_aiocb aiocb64;

	    result = copyin( uap->aiocbp, &aiocb64, sizeof(aiocb64) );
	    do_munge_aiocb_user64_to_user(&aiocb64, &my_aiocb);

	    struct user32_aiocb aiocb32;

	    result = copyin( uap->aiocbp, &aiocb32, sizeof(aiocb32) );
	    do_munge_aiocb_user32_to_user( &aiocb32, &my_aiocb );

	/* NOTE - POSIX standard says a mismatch between the file */
	/* descriptor passed in and the file descriptor embedded in */
	/* the aiocb causes unspecified results.  We return EBADF in */
	/* that situation. */
	if ( uap->fd != my_aiocb.aio_fildes ) {

    result = do_aio_cancel_locked( p, uap->fd, uap->aiocbp, 0, FALSE );
    ASSERT_AIO_PROC_LOCK_OWNED(p);

    if ( result != -1 ) {

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END,
		  (int)p, (int)uap->aiocbp, result, 0, 0 );

/*
 * _aio_close - internal function used to clean up async IO requests for
 * a file descriptor that is closing.
 */
__private_extern__ void
_aio_close(proc_t p, int fd )
    /* quick check to see if there are any async IO requests queued up */
    if (aio_get_all_queues_count() < 1) {

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_START,
		  (int)p, fd, 0, 0, 0 );

    /* cancel all async IO requests on our todo queues for this file descriptor */
    error = do_aio_cancel_locked( p, fd, 0, AIO_CLOSE_WAIT, FALSE );
    ASSERT_AIO_PROC_LOCK_OWNED(p);
    if ( error == AIO_NOTCANCELED ) {
	/*
	 * AIO_NOTCANCELED is returned when we find an aio request for this process
	 * and file descriptor on the active async IO queue.  Active requests cannot
	 * be cancelled so we must wait for them to complete.  We will get a special
	 * wake up call on our channel used to sleep for ALL active requests to
	 * complete.  This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
	 * when we must wait for all active aio requests.
	 */
	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) | DBG_FUNC_NONE,
		      (int)p, fd, 0, 0, 0 );

	while (aio_proc_active_requests_for_file(p, fd) > 0) {
	    msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_close", 0 );
	}
    }

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_END,
		  (int)p, fd, 0, 0, 0 );
/*
 * aio_error - return the error status associated with the async IO
 * request referred to by uap->aiocbp.  The error status is the errno
 * value that would be set by the corresponding IO request (read, write,
 * fdatasync, or sync).
 */
aio_error(proc_t p, struct aio_error_args *uap, int *retval )
    aio_workq_entry		*entryp;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_START,
		  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    /* see if there are any aios to check */
    if (aio_get_all_queues_count() < 1) {

    /* look for a match on our queue of async IO requests that have completed */
    TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
	if ( entryp->uaiocbp == uap->aiocbp ) {
	    ASSERT_AIO_FROM_PROC(entryp, p);

	    aio_entry_lock_spin(entryp);
	    *retval = entryp->errorval;
	    aio_entry_unlock(entryp);

	    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) | DBG_FUNC_NONE,
			  (int)p, (int)uap->aiocbp, *retval, 0, 0 );

    /* look for a match on our queue of active async IO requests */
    TAILQ_FOREACH( entryp, &p->p_aio_activeq, aio_proc_link) {
	if ( entryp->uaiocbp == uap->aiocbp ) {
	    ASSERT_AIO_FROM_PROC(entryp, p);
	    *retval = EINPROGRESS;

	    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) | DBG_FUNC_NONE,
			  (int)p, (int)uap->aiocbp, *retval, 0, 0 );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_END,
		  (int)p, (int)uap->aiocbp, error, 0, 0 );
/*
 * aio_fsync - asynchronously force all IO operations associated
 * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
 * queued at the time of the call to the synchronized completion state.
 * NOTE - we do not support op O_DSYNC at this point since we do not support the
 * fdatasync() call.
 */
aio_fsync(proc_t p, struct aio_fsync_args *uap, int *retval )
    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_START,
		  (int)p, (int)uap->aiocbp, uap->op, 0, 0 );

    /* 0 := O_SYNC for binary backward compatibility with Panther */
    if (uap->op == O_SYNC || uap->op == 0)
	fsync_kind = AIO_FSYNC;
    else if ( uap->op == O_DSYNC )
	fsync_kind = AIO_DSYNC;

    error = aio_queue_async_request( p, uap->aiocbp, fsync_kind );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_END,
		  (int)p, (int)uap->aiocbp, error, 0, 0 );
/*
 * aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
 * file descriptor (uap->aiocbp->aio_fildes) into the buffer
 * (uap->aiocbp->aio_buf).
 */
aio_read(proc_t p, struct aio_read_args *uap, int *retval )
    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_START,
		  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    error = aio_queue_async_request( p, uap->aiocbp, AIO_READ );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_END,
		  (int)p, (int)uap->aiocbp, error, 0, 0 );
/*
 * aio_return - return the return status associated with the async IO
 * request referred to by uap->aiocbp.  The return status is the value
 * that would be returned by the corresponding IO request (read, write,
 * fdatasync, or sync).  This is where we release kernel resources
 * held for the async IO call associated with the given aiocb pointer.
 */
aio_return(proc_t p, struct aio_return_args *uap, user_ssize_t *retval )
    aio_workq_entry		*entryp;
    boolean_t			proc_lock_held = FALSE;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_START,
		  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    /* See if there are any entries to check */
    if (aio_get_all_queues_count() < 1) {

    proc_lock_held = TRUE;

    /* look for a match on our queue of async IO requests that have completed */
    TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
	ASSERT_AIO_FROM_PROC(entryp, p);
	if ( entryp->uaiocbp == uap->aiocbp ) {
	    /* Done and valid for aio_return(), pull it off the list */
	    aio_proc_remove_done_locked(p, entryp);

	    /* Drop the proc lock, but keep the entry locked */
	    aio_entry_lock(entryp);
	    proc_lock_held = FALSE;

	    *retval = entryp->returnval;

	    /* No references and off all lists, safe to free */
	    if (entryp->aio_refcount == 0) {
		aio_entry_unlock(entryp);
		aio_free_request(entryp);

	    /* Whoever has the refcount will have to free it */
	    entryp->flags |= AIO_DO_FREE;
	    aio_entry_unlock(entryp);

	    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) | DBG_FUNC_NONE,
			  (int)p, (int)uap->aiocbp, *retval, 0, 0 );

    /* look for a match on our queue of active async IO requests */
    TAILQ_FOREACH( entryp, &p->p_aio_activeq, aio_proc_link) {
	ASSERT_AIO_FROM_PROC(entryp, p);
	if ( entryp->uaiocbp == uap->aiocbp ) {
	    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) | DBG_FUNC_NONE,
			  (int)p, (int)uap->aiocbp, *retval, 0, 0 );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_END,
		  (int)p, (int)uap->aiocbp, error, 0, 0 );

/*
 * _aio_exec - internal function used to clean up async IO requests for
 * a process that is going away due to exec().  We cancel any async IOs
 * we can and wait for those already active.  We also disable signaling
 * for cancelled or active aio requests that complete.
 * This routine MAY block!
 */
__private_extern__ void
_aio_exec(proc_t p )
    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_START,
		  (int)p, 0, 0, 0, 0 );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_END,
		  (int)p, 0, 0, 0, 0 );
/*
 * _aio_exit - internal function used to clean up async IO requests for
 * a process that is terminating (via exit() or exec()).  We cancel any async IOs
 * we can and wait for those already active.  We also disable signaling
 * for cancelled or active aio requests that complete.  This routine MAY block!
 */
__private_extern__ void
_aio_exit(proc_t p )
    aio_workq_entry		*entryp;

    /* quick check to see if there are any async IO requests queued up */
    if (aio_get_all_queues_count() < 1) {

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_START,
		  (int)p, 0, 0, 0, 0 );

    /*
     * cancel async IO requests on the todo work queue and wait for those
     * already active to complete.
     */
    error = do_aio_cancel_locked( p, 0, 0, AIO_EXIT_WAIT, TRUE );
    ASSERT_AIO_PROC_LOCK_OWNED(p);
    if ( error == AIO_NOTCANCELED ) {
	/*
	 * AIO_NOTCANCELED is returned when we find an aio request for this process
	 * on the active async IO queue.  Active requests cannot be cancelled so we
	 * must wait for them to complete.  We will get a special wake up call on
	 * our channel used to sleep for ALL active requests to complete.  This sleep
	 * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
	 * active aio requests.
	 */
	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) | DBG_FUNC_NONE,
		      (int)p, 0, 0, 0, 0 );

	while (p->p_aio_active_count != 0) {
	    msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_exit", 0 );
	}
    }

    if (p->p_aio_active_count != 0) {
	panic("Exiting process has %d active AIOs after cancellation has completed.\n", p->p_aio_active_count);
    }

    /* release all aio resources used by this process */
    entryp = TAILQ_FIRST( &p->p_aio_doneq );
    while ( entryp != NULL ) {
	ASSERT_AIO_FROM_PROC(entryp, p);
	aio_workq_entry		*next_entryp;

	next_entryp = TAILQ_NEXT( entryp, aio_proc_link);
	aio_proc_remove_done_locked(p, entryp);

	/* we cannot free requests that are still completing */
	aio_entry_lock_spin(entryp);
	if (entryp->aio_refcount == 0) {
	    aio_entry_unlock(entryp);
	    aio_free_request(entryp);

	    /* need to start over since aio_doneq may have been */
	    /* changed while we were away.  */
	    entryp = TAILQ_FIRST( &p->p_aio_doneq );

	/* whoever has the reference will have to do the free */
	entryp->flags |= AIO_DO_FREE;

	aio_entry_unlock(entryp);
	entryp = next_entryp;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END,
		  (int)p, 0, 0, 0, 0 );
should_cancel(aio_workq_entry *entryp, user_addr_t aiocbp, int fd)
    if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
	 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
	 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
/*
 * do_aio_cancel_locked - cancel async IO requests (if possible).  We get called by
 * aio_cancel, close, and at exit.
 * There are three modes of operation: 1) cancel all async IOs for a process -
 * fd is 0 and aiocbp is NULL 2) cancel all async IOs for file descriptor - fd
 * is > 0 and aiocbp is NULL 3) cancel one async IO associated with the given
 * aiocbp.
 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
 * target async IO requests, and AIO_ALLDONE if all target async IO requests
 * were already complete.
 * WARNING - do not dereference aiocbp in this routine, it may point to user
 * land data that has not been copied in (when called from aio_cancel()).
 *
 * Called with proc locked, and returns the same way.
 */
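/*
 * Mode summary (restating the comment above):
 *
 *	fd == 0  && aiocbp == NULL	cancel every AIO for the process
 *	fd  > 0  && aiocbp == NULL	cancel every AIO on that file descriptor
 *	aiocbp != NULL			cancel only the request matching aiocbp
 */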
do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp,
		     int wait_for_completion, boolean_t disable_notification)
    ASSERT_AIO_PROC_LOCK_OWNED(p);

    aio_workq_entry		*entryp;

    /* look for a match on our queue of async todo work. */
    entryp = TAILQ_FIRST(&p->p_aio_activeq);
    while ( entryp != NULL ) {
	ASSERT_AIO_FROM_PROC(entryp, p);
	aio_workq_entry		*next_entryp;

	next_entryp = TAILQ_NEXT( entryp, aio_proc_link);
	if (!should_cancel(entryp, aiocbp, fd)) {
	    entryp = next_entryp;

	/* Can only be cancelled if it's still on a work queue */
	if (aio_entry_try_workq_remove(entryp) != 0) {
	    /* Have removed from workq. Update entry state and take a ref */
	    aio_entry_update_for_cancel(entryp, TRUE, 0, disable_notification);

	    /* Put on the proc done queue and update counts, then unlock the proc */
	    aio_proc_move_done_locked(p, entryp);

	    /* Now it's officially cancelled.  Do the completion */
	    result = AIO_CANCELED;
	    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) | DBG_FUNC_NONE,
			  (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
	    do_aio_completion(entryp);

	    /* This will free if the aio_return() has already happened ... */
	    aio_entry_unref(entryp);

	    if ( aiocbp != USER_ADDR_NULL ) {

	    /*
	     * Restart from the head of the proc active queue since it
	     * may have been changed while we were away doing completion
	     * processing.
	     *
	     * Note that if we found an uncancellable AIO before, we will
	     * either find it again or discover that it's been completed,
	     * so resetting the result will not cause us to return success
	     * despite outstanding AIOs.
	     */
	    entryp = TAILQ_FIRST(&p->p_aio_activeq);
	    result = -1;	/* As if beginning anew */

	    /*
	     * It's been taken off the active queue already, i.e. is in flight.
	     * All we can do is ask for notification.
	     */
	    result = AIO_NOTCANCELED;

	    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE,
			  (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

	    /* Mark for waiting and such; will not take a ref if "cancelled" arg is FALSE */
	    aio_entry_update_for_cancel(entryp, FALSE, wait_for_completion, disable_notification);

	    if ( aiocbp != USER_ADDR_NULL ) {

	    entryp = next_entryp;

    /*
     * if we didn't find any matches on the todo or active queues then look for a
     * match on our queue of async IO requests that have completed and if found
     * return AIO_ALLDONE result.
     *
     * Proc AIO lock is still held.
     */
    if ( result == -1 ) {
	TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
	    ASSERT_AIO_FROM_PROC(entryp, p);
	    if (should_cancel(entryp, aiocbp, fd)) {
		result = AIO_ALLDONE;
		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE,
			      (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

		if ( aiocbp != USER_ADDR_NULL ) {

/* do_aio_cancel_locked */
/*
 * aio_suspend - suspend the calling thread until at least one of the async
 * IO operations referenced by uap->aiocblist has completed, until a signal
 * interrupts the function, or uap->timeoutp time interval (optional) has
 * elapsed.
 * Returns 0 if one or more async IOs have completed, else -1 and errno is
 * set appropriately - EAGAIN if timeout elapses or EINTR if an interrupt
 * occurs.
 */
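/*
 * Illustrative user-space usage (not part of this file): wait up to one
 * second for a previously queued request to finish.
 *
 *	const struct aiocb *list[1] = { &cb };
 *	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 *	if (aio_suspend(list, 1, &ts) == -1) {
 *		if (errno == EAGAIN) { }	// timeout elapsed
 *		else if (errno == EINTR) { }	// interrupted by a signal
 *	}
 */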
int
aio_suspend(proc_t p, struct aio_suspend_args *uap, int *retval )
{
    __pthread_testcancel(1);
    return(aio_suspend_nocancel(p, (struct aio_suspend_nocancel_args *)uap, retval));
}
aio_suspend_nocancel(proc_t p, struct aio_suspend_nocancel_args *uap, int *retval )
    struct user_timespec	ts;
    aio_workq_entry		*entryp;
    user_addr_t			*aiocbpp;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START,
		  (int)p, uap->nent, 0, 0, 0 );

    count = aio_get_all_queues_count( );
	goto ExitThisRoutine;

    if ( uap->nent < 1 || uap->nent > aio_max_requests_per_process ) {
	goto ExitThisRoutine;

    if ( uap->timeoutp != USER_ADDR_NULL ) {
	if ( proc_is64bit(p) ) {
	    struct user64_timespec temp;
	    error = copyin( uap->timeoutp, &temp, sizeof(temp) );
	    ts.tv_sec = temp.tv_sec;
	    ts.tv_nsec = temp.tv_nsec;

	    struct user32_timespec temp;
	    error = copyin( uap->timeoutp, &temp, sizeof(temp) );
	    ts.tv_sec = temp.tv_sec;
	    ts.tv_nsec = temp.tv_nsec;

	    goto ExitThisRoutine;

	if ( ts.tv_sec < 0 || ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000 ) {
	    goto ExitThisRoutine;

	nanoseconds_to_absolutetime( (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
				     &abstime );
	clock_absolutetime_interval_to_deadline( abstime, &abstime );

    aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent);
    if ( aiocbpp == NULL ) {
	goto ExitThisRoutine;

    /* check list of aio requests to see if any have completed */
check_for_our_aiocbp:
    aio_proc_lock_spin(p);
    for ( i = 0; i < uap->nent; i++ ) {
	/* NULL elements are legal so check for 'em */
	aiocbp = *(aiocbpp + i);
	if ( aiocbp == USER_ADDR_NULL )

	/* return immediately if any aio request in the list is done */
	TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
	    ASSERT_AIO_FROM_PROC(entryp, p);
	    if ( entryp->uaiocbp == aiocbp ) {
		goto ExitThisRoutine;
    } /* for ( ; i < uap->nent; ) */

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE,
		  (int)p, uap->nent, 0, 0, 0 );

    /*
     * wait for an async IO to complete or a signal fires or timeout expires.
     * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
     * interrupts us.  If an async IO completes before a signal fires or our
     * timeout expires, we get a wakeup call from aio_work_thread().
     */

    error = msleep1(&p->AIO_SUSPEND_SLEEP_CHAN, aio_proc_mutex(p), PCATCH | PWAIT | PDROP, "aio_suspend", abstime); /* XXX better priority? */

	/*
	 * got our wakeup call from aio_work_thread().
	 * Since we can get a wakeup on this channel from another thread in the
	 * same process we head back up to make sure this is for the correct aiocbp.
	 * If it is the correct aiocbp we will return from where we do the check
	 * (see entryp->uaiocbp == aiocbp after check_for_our_aiocbp label)
	 * else we will fall out and just sleep again.
	 */
	goto check_for_our_aiocbp;

    else if ( error == EWOULDBLOCK ) {
	/* our timeout expired */

	/* we were interrupted */

    if ( aiocbpp != NULL )
	FREE( aiocbpp, M_TEMP );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_END,
		  (int)p, uap->nent, error, 0, 0 );
/*
 * aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
 * file descriptor (uap->aiocbp->aio_fildes) from the buffer
 * (uap->aiocbp->aio_buf).
 */
aio_write(proc_t p, struct aio_write_args *uap, int *retval )
    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_START,
		  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    error = aio_queue_async_request( p, uap->aiocbp, AIO_WRITE );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_END,
		  (int)p, (int)uap->aiocbp, error, 0, 0 );
static user_addr_t *
aio_copy_in_list(proc_t procp, user_addr_t aiocblist, int nent)
    user_addr_t	*aiocbpp;

    /* we reserve enough space for largest possible pointer size */
    MALLOC( aiocbpp, user_addr_t *, (nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK );
    if ( aiocbpp == NULL )

    /* copyin our aiocb pointers from list */
    result = copyin( aiocblist, aiocbpp,
		     proc_is64bit(procp) ? (nent * sizeof(user64_addr_t))
					 : (nent * sizeof(user32_addr_t)) );
	FREE( aiocbpp, M_TEMP );

    /*
     * We depend on a list of user_addr_t's so we need to
     * munge and expand when these pointers came from a
     * 32-bit process.
     */
    if ( !proc_is64bit(procp) ) {
	/* copy from last to first to deal with overlap */
	user32_addr_t	*my_ptrp = ((user32_addr_t *)aiocbpp) + (nent - 1);
	user_addr_t	*my_addrp = aiocbpp + (nent - 1);

	for (i = 0; i < nent; i++, my_ptrp--, my_addrp--) {
	    *my_addrp = (user_addr_t) (*my_ptrp);
	}
    }
aio_copy_in_sigev(proc_t procp, user_addr_t sigp, struct user_sigevent *sigev)
    if (sigp == USER_ADDR_NULL)

    /*
     * We need to munge aio_sigevent since it contains pointers.
     * Since we do not know if sigev_value is an int or a ptr we do
     * NOT cast the ptr to a user_addr_t.  This means if we send
     * this info back to user space we need to remember sigev_value
     * was not expanded for the 32-bit case.
     *
     * Notes:  This does NOT affect us since we don't support
     *	       sigev_value yet in the aio context.
     */
    if ( proc_is64bit(procp) ) {
	struct user64_sigevent sigevent64;

	result = copyin( sigp, &sigevent64, sizeof(sigevent64) );
	if ( result == 0 ) {
	    sigev->sigev_notify = sigevent64.sigev_notify;
	    sigev->sigev_signo = sigevent64.sigev_signo;
	    sigev->sigev_value.size_equivalent.sival_int = sigevent64.sigev_value.size_equivalent.sival_int;
	    sigev->sigev_notify_function = sigevent64.sigev_notify_function;
	    sigev->sigev_notify_attributes = sigevent64.sigev_notify_attributes;
	}

    } else {
	struct user32_sigevent sigevent32;

	result = copyin( sigp, &sigevent32, sizeof(sigevent32) );
	if ( result == 0 ) {
	    sigev->sigev_notify = sigevent32.sigev_notify;
	    sigev->sigev_signo = sigevent32.sigev_signo;
	    sigev->sigev_value.size_equivalent.sival_int = sigevent32.sigev_value.sival_int;
	    sigev->sigev_notify_function = CAST_USER_ADDR_T(sigevent32.sigev_notify_function);
	    sigev->sigev_notify_attributes = CAST_USER_ADDR_T(sigevent32.sigev_notify_attributes);
	}
    }

    if ( result != 0 ) {
/*
 * Queue up the entry on the aio asynchronous work queue in priority order
 * based on the relative priority of the request.  We calculate the relative
 * priority using the nice value of the caller and the value
 *
 * Parameters:	procp		Process queueing the I/O
 *		entryp		The work queue entry being queued
 *
 * Returns:	(void)		No failure modes
 *
 * Notes:	This function is used for both lio_listio and aio
 *
 * XXX:		At some point, we may have to consider thread priority
 *		rather than process priority, but we don't maintain the
 *		adjusted priority for threads the POSIX way.
 *
 * Called with proc locked.
 */
aio_enqueue_work( proc_t procp, aio_workq_entry *entryp, int proc_locked)
    aio_workq_entry	*my_entryp;	/* used for insertion sort */
    aio_workq_t		queue = aio_entry_workq(entryp);

    if (proc_locked == 0) {
	aio_proc_lock(procp);
    }

    ASSERT_AIO_PROC_LOCK_OWNED(procp);

    /* Onto proc queue */
    TAILQ_INSERT_TAIL(&procp->p_aio_activeq, entryp, aio_proc_link);
    procp->p_aio_active_count++;
    procp->p_aio_total_count++;

    /* And work queue */
    aio_workq_lock_spin(queue);
    aio_workq_add_entry_locked(queue, entryp);
    waitq_wakeup64_one(&queue->aioq_waitq, CAST_EVENT64_T(queue),
		       THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
    aio_workq_unlock(queue);

    if (proc_locked == 0) {
	aio_proc_unlock(procp);
    }
    /*
     * (1) The nice value is in the range PRIO_MIN..PRIO_MAX [-20..20]
     * (2) The normalized nice value is in the range 0..((2 * NZERO) - 1)
     *     which is [0..39], with 0 not being used.  In nice values, the
     *     lower the nice value, the higher the priority.
     * (3) The normalized scheduling priority is the highest nice value
     *     minus the current nice value.  In I/O scheduling priority, the
     *     higher the value the lower the priority, so it is the inverse
     *     of the nice value (the higher the number, the higher the I/O
     *     priority).
     * (4) From the normalized scheduling priority, we subtract the
     *     request priority to get the request priority value number;
     *     this means that requests are only capable of depressing their
     *     priority relative to other requests.
     */
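    /*
     * Worked example (illustrative): with NZERO == 20, a process at the
     * default nice value 0 starts from ((2 * 20) - 1) - 0 == 39; an
     * aio_reqprio of 5 then depresses that to 34, and a negative
     * aio_reqprio is clamped to 0, so a request can never raise itself
     * above its process's normalized value.
     */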
    entryp->priority = (((2 * NZERO) - 1) - procp->p_nice);

    /* only permit depressing the priority */
    if (entryp->aiocb.aio_reqprio < 0)
	entryp->aiocb.aio_reqprio = 0;
    if (entryp->aiocb.aio_reqprio > 0) {
	entryp->priority -= entryp->aiocb.aio_reqprio;
	if (entryp->priority < 0)
	    entryp->priority = 0;
    }

    /* Insertion sort the entry; lowest ->priority to highest */
    TAILQ_FOREACH(my_entryp, &aio_anchor.aio_async_workq, aio_workq_link) {
	if ( entryp->priority <= my_entryp->priority) {
	    TAILQ_INSERT_BEFORE(my_entryp, entryp, aio_workq_link);
	}
    }
    if (my_entryp == NULL)
	TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
/*
 * lio_listio - initiate a list of IO requests.  We process the list of
 * aiocbs either synchronously (mode == LIO_WAIT) or asynchronously
 * (mode == LIO_NOWAIT).
 *
 * The caller gets error and return status for each aiocb in the list
 * via aio_error and aio_return.  We must keep completed requests until
 * released by the aio_return call.
 */
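/*
 * Illustrative user-space usage (not part of this file): queue two reads and
 * block until both complete, then collect per-request status.
 *
 *	struct aiocb a0 = { .aio_fildes = fd, .aio_buf = buf0,
 *			    .aio_nbytes = 4096, .aio_lio_opcode = LIO_READ };
 *	struct aiocb a1 = { .aio_fildes = fd, .aio_buf = buf1,
 *			    .aio_offset = 4096, .aio_nbytes = 4096,
 *			    .aio_lio_opcode = LIO_READ };
 *	struct aiocb *list[2] = { &a0, &a1 };
 *	if (lio_listio(LIO_WAIT, list, 2, NULL) == 0) {
 *		ssize_t r0 = aio_return(&a0);
 *		ssize_t r1 = aio_return(&a1);
 *	}
 */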
lio_listio(proc_t p, struct lio_listio_args *uap, int *retval )
    aio_workq_entry		**entryp_listp;
    user_addr_t			*aiocbpp;
    struct user_sigevent	aiosigev;
    aio_lio_context		*lio_context;
    boolean_t			free_context = FALSE;
    uint32_t			*paio_offset;
    uint32_t			*paio_nbytes;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START,
		  (int)p, uap->nent, uap->mode, 0, 0 );

    entryp_listp = NULL;

    if ( !(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT) ) {
	call_result = EINVAL;

    if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
	call_result = EINVAL;

    /*
     * allocate a list of aio_workq_entry pointers that we will use
     * to queue up all our requests at once while holding our lock.
     */
    MALLOC( entryp_listp, void *, (uap->nent * sizeof(aio_workq_entry *)), M_TEMP, M_WAITOK );
    if ( entryp_listp == NULL ) {
	call_result = EAGAIN;

    MALLOC( lio_context, aio_lio_context*, sizeof(aio_lio_context), M_TEMP, M_WAITOK );
    if ( lio_context == NULL ) {
	call_result = EAGAIN;

    OSIncrementAtomic(&lio_contexts_alloced);

    free_context = TRUE;
    bzero(lio_context, sizeof(aio_lio_context));

    aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent);
    if ( aiocbpp == NULL ) {
	call_result = EAGAIN;

    /*
     * Use sigevent passed in to lio_listio for each of our calls, but
     * only do completion notification after the last request completes.
     */
    bzero(&aiosigev, sizeof(aiosigev));
    /* Only copy in an sigev if the user supplied one */
    if (uap->sigp != USER_ADDR_NULL) {
	call_result = aio_copy_in_sigev(p, uap->sigp, &aiosigev);

    /* process list of aio requests */
    free_context = FALSE;
    lio_context->io_issued = uap->nent;
    lio_context->io_waiter = uap->mode == LIO_WAIT ? 1 : 0; /* Should it be freed by last AIO */
    for ( i = 0; i < uap->nent; i++ ) {
	user_addr_t my_aiocbp;
	aio_workq_entry		*entryp;

	*(entryp_listp + i) = NULL;
	my_aiocbp = *(aiocbpp + i);

	/* NULL elements are legal so check for 'em */
	if ( my_aiocbp == USER_ADDR_NULL ) {
	    aio_proc_lock_spin(p);
	    lio_context->io_issued--;

	/*
	 * We use lio_context to mark IO requests for delayed completion
	 * processing which means we wait until all IO requests in the
	 * group have completed before we either return to the caller
	 * when mode is LIO_WAIT or signal user when mode is LIO_NOWAIT.
	 *
	 * We use the address of the lio_context for this, since it is
	 * unique in the address space.
	 */
	result = lio_create_entry( p, my_aiocbp, lio_context, (entryp_listp + i) );
	if ( result != 0 && call_result == -1 )
	    call_result = result;

	/* NULL elements are legal so check for 'em */
	entryp = *(entryp_listp + i);
	if ( entryp == NULL ) {
	    aio_proc_lock_spin(p);
	    lio_context->io_issued--;

	if ( uap->mode == LIO_NOWAIT ) {
	    /* Set signal handler, if any */
	    entryp->aiocb.aio_sigevent = aiosigev;

	    /* flag that this thread blocks pending completion */
	    entryp->flags |= AIO_LIO_NOTIFY;

	/* check our aio limits to throttle bad or rude user land behavior */
	old_count = aio_increment_total_count();

	aio_proc_lock_spin(p);
	if ( old_count >= aio_max_requests ||
	     aio_get_process_count( entryp->procp ) >= aio_max_requests_per_process ||
	     is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {

	    lio_context->io_issued--;

	    aio_decrement_total_count();

	    if ( call_result == -1 )
		call_result = EAGAIN;
	    aio_free_request(entryp);
	    entryp_listp[i] = NULL;

	lck_mtx_convert_spin(aio_proc_mutex(p));
	aio_enqueue_work(p, entryp, 1);

	KERNEL_DEBUG_CONSTANT( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_START,
			       (int)p, (int)entryp->uaiocbp, entryp->flags, entryp->aiocb.aio_fildes, 0 );
	paio_offset = (uint32_t*) &entryp->aiocb.aio_offset;
	paio_nbytes = (uint32_t*) &entryp->aiocb.aio_nbytes;
	KERNEL_DEBUG_CONSTANT( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_END,
			       paio_offset[0], (sizeof(entryp->aiocb.aio_offset) == sizeof(uint64_t) ? paio_offset[1] : 0),
			       paio_nbytes[0], (sizeof(entryp->aiocb.aio_nbytes) == sizeof(uint64_t) ? paio_nbytes[1] : 0),

    aio_proc_lock_spin(p);
    while (lio_context->io_completed < lio_context->io_issued) {
	result = msleep(lio_context, aio_proc_mutex(p), PCATCH | PRIBIO | PSPIN, "lio_listio", 0);

	/* If we were interrupted, fail out (even if all finished) */
	    call_result = EINTR;
	    lio_context->io_waiter = 0;

    /* If all IOs have finished must free it */
    if (lio_context->io_completed == lio_context->io_issued) {
	free_context = TRUE;

    /* call_result == -1 means we had no trouble queueing up requests */
    if ( call_result == -1 ) {

    if ( entryp_listp != NULL )
	FREE( entryp_listp, M_TEMP );
    if ( aiocbpp != NULL )
	FREE( aiocbpp, M_TEMP );
    free_lio_context(lio_context);

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END,
		  (int)p, call_result, 0, 0, 0 );

    return( call_result );
/*
 * aio worker thread.  this is where all the real work gets done.
 * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
 * after new work is queued up.
 */
__attribute__((noreturn))
aio_work_thread(void)
    aio_workq_entry		*entryp;
    vm_map_t			currentmap;
    vm_map_t			oldmap = VM_MAP_NULL;
    task_t			oldaiotask = TASK_NULL;
    struct uthread		*uthreadp = NULL;

    /*
     * returns with the entry ref'ed.
     * sleeps until work is available.
     */
    entryp = aio_get_some_work();

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START,
		  (int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0 );

    /*
     * Assume the target's address space identity for the duration
     * of the IO.  Note: don't need to have the entryp locked,
     * because the proc and map don't change until it's freed.
     */
    currentmap = get_task_map( (current_proc())->task );
    if ( currentmap != entryp->aio_map ) {
	uthreadp = (struct uthread *) get_bsdthread_info(current_thread());
	oldaiotask = uthreadp->uu_aio_task;
	uthreadp->uu_aio_task = entryp->procp->task;
	oldmap = vm_map_switch( entryp->aio_map );
    }

    if ( (entryp->flags & AIO_READ) != 0 ) {
	error = do_aio_read( entryp );
    }
    else if ( (entryp->flags & AIO_WRITE) != 0 ) {
	error = do_aio_write( entryp );
    }
    else if ( (entryp->flags & (AIO_FSYNC | AIO_DSYNC)) != 0 ) {
	error = do_aio_fsync( entryp );
    }
	printf( "%s - unknown aio request - flags 0x%02X \n",
		__FUNCTION__, entryp->flags );

    /* Restore old map */
    if ( currentmap != entryp->aio_map ) {
	(void) vm_map_switch( oldmap );
	uthreadp->uu_aio_task = oldaiotask;
    }

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END,
		  (int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval,
		  entryp->returnval, 0 );

    aio_entry_lock_spin(entryp);
    entryp->errorval = error;
    aio_entry_unlock(entryp);

    /* we're done with the IO request so pop it off the active queue and */
    /* push it on the done queue */
    aio_proc_lock(entryp->procp);
    aio_proc_move_done_locked(entryp->procp, entryp);
    aio_proc_unlock(entryp->procp);

    OSDecrementAtomic(&aio_anchor.aio_inflight_count);

    /* remove our reference to the user land map. */
    if ( VM_MAP_NULL != entryp->aio_map ) {
	my_map = entryp->aio_map;
	entryp->aio_map = VM_MAP_NULL;
	vm_map_deallocate( my_map );
    }

    /* Provide notifications */
    do_aio_completion( entryp );

    /* Will free if needed */
    aio_entry_unref(entryp);

} /* aio_work_thread */
/*
 * aio_get_some_work - get the next async IO request that is ready to be executed.
 * aio_fsync complicates matters a bit since we cannot do the fsync until all async
 * IO requests at the time the aio_fsync call came in have completed.
 * NOTE - AIO_LOCK must be held by caller
 */
static aio_workq_entry *
aio_get_some_work( void )
    aio_workq_entry		*entryp = NULL;
    aio_workq_t			queue = NULL;

    /* Just one queue for the moment.  In the future there will be many. */
    queue = &aio_anchor.aio_async_workqs[0];
    aio_workq_lock_spin(queue);
    if (queue->aioq_count == 0) {

    /*
     * Hold the queue lock.
     *
     * pop some work off the work queue and add to our active queue
     * Always start with the queue lock held.
     */

    /*
     * Pull off of work queue.  Once it's off, it can't be cancelled,
     * so we can take our ref once we drop the queue lock.
     */
    entryp = TAILQ_FIRST(&queue->aioq_entries);

    /*
     * If there's no work or only fsyncs that need delay, go to sleep
     * and then start anew from aio_work_thread
     */
    if (entryp == NULL) {

    aio_workq_remove_entry_locked(queue, entryp);

    aio_workq_unlock(queue);

    /*
     * Check if it's an fsync that must be delayed.  No need to lock the entry;
     * that flag would have been set at initialization.
     */
    if ( (entryp->flags & AIO_FSYNC) != 0 ) {
	/*
	 * Check for unfinished operations on the same file
	 * in this proc's queue.
	 */
	aio_proc_lock_spin(entryp->procp);
	if ( aio_delay_fsync_request( entryp ) ) {
	    /* It needs to be delayed.  Put it back on the end of the work queue */
	    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) | DBG_FUNC_NONE,
			  (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

	    aio_proc_unlock(entryp->procp);

	    aio_workq_lock_spin(queue);
	    aio_workq_add_entry_locked(queue, entryp);
	}
	aio_proc_unlock(entryp->procp);
    }

    aio_entry_ref(entryp);

    OSIncrementAtomic(&aio_anchor.aio_inflight_count);

    /* We will wake up when someone enqueues something */
    waitq_assert_wait64(&queue->aioq_waitq, CAST_EVENT64_T(queue), THREAD_UNINT, 0);
    aio_workq_unlock(queue);
    thread_block( (thread_continue_t)aio_work_thread );
/*
 * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed.
 * A big, simple hammer: only send it off if it's the most recently filed IO which has
 * not been completed.
 */
aio_delay_fsync_request( aio_workq_entry *entryp )
    if (entryp == TAILQ_FIRST(&entryp->procp->p_aio_activeq)) {

} /* aio_delay_fsync_request */
static aio_workq_entry *
aio_create_queue_entry(proc_t procp, user_addr_t aiocbp, void *group_tag, int kindOfIO)
    aio_workq_entry	*entryp;

    entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
    if ( entryp == NULL ) {

    bzero( entryp, sizeof(*entryp) );

    /* fill in the rest of the aio_workq_entry */
    entryp->procp = procp;
    entryp->uaiocbp = aiocbp;
    entryp->flags |= kindOfIO;
    entryp->group_tag = group_tag;
    entryp->aio_map = VM_MAP_NULL;
    entryp->aio_refcount = 0;

    if ( proc_is64bit(procp) ) {
	struct user64_aiocb aiocb64;

	result = copyin( aiocbp, &aiocb64, sizeof(aiocb64) );
	do_munge_aiocb_user64_to_user(&aiocb64, &entryp->aiocb);

	struct user32_aiocb aiocb32;

	result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
	do_munge_aiocb_user32_to_user( &aiocb32, &entryp->aiocb );

    if ( result != 0 ) {

    /* get a reference to the user land map in order to keep it around */
    entryp->aio_map = get_task_map( procp->task );
    vm_map_reference( entryp->aio_map );

    /* do some more validation on the aiocb and embedded file descriptor */
    result = aio_validate( entryp );
	goto error_exit_with_ref;

    /* get a reference on the current_thread, which is passed in vfs_context. */
    entryp->thread = current_thread();
    thread_reference( entryp->thread );

error_exit_with_ref:
    if ( VM_MAP_NULL != entryp->aio_map ) {
	vm_map_deallocate( entryp->aio_map );
    }
    if ( result && entryp != NULL ) {
	zfree( aio_workq_zonep, entryp );
    }
/*
 * aio_queue_async_request - queue up an async IO request on our work queue then
 * wake up one of our worker threads to do the actual work.  We get a reference
 * to our caller's user land map in order to keep it around while we are
 * processing the request.
 */
aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO )
    aio_workq_entry	*entryp;
    uint32_t		*paio_offset;
    uint32_t		*paio_nbytes;

    old_count = aio_increment_total_count();
    if (old_count >= aio_max_requests) {

    entryp = aio_create_queue_entry( procp, aiocbp, 0, kindOfIO);
    if ( entryp == NULL ) {

    aio_proc_lock_spin(procp);

    if ( is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {

    /* check our aio limits to throttle bad or rude user land behavior */
    if (aio_get_process_count( procp ) >= aio_max_requests_per_process) {
	printf("aio_queue_async_request(): too many in flight for proc: %d.\n", procp->p_aio_total_count);

    /* Add the IO to proc and work queues, wake up threads as appropriate */
    lck_mtx_convert_spin(aio_proc_mutex(procp));
    aio_enqueue_work(procp, entryp, 1);

    aio_proc_unlock(procp);

    paio_offset = (uint32_t*) &entryp->aiocb.aio_offset;
    paio_nbytes = (uint32_t*) &entryp->aiocb.aio_nbytes;
    KERNEL_DEBUG_CONSTANT( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_START,
			   (int)procp, (int)aiocbp, entryp->flags, entryp->aiocb.aio_fildes, 0 );
    KERNEL_DEBUG_CONSTANT( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_END,
			   paio_offset[0], (sizeof(entryp->aiocb.aio_offset) == sizeof(uint64_t) ? paio_offset[1] : 0),
			   paio_nbytes[0], (sizeof(entryp->aiocb.aio_nbytes) == sizeof(uint64_t) ? paio_nbytes[1] : 0),

    /*
     * This entry has not been queued up so no worries about
     * unlocked state and aio_map
     */
    aio_proc_unlock(procp);
    aio_free_request(entryp);

    aio_decrement_total_count();

} /* aio_queue_async_request */
/*
 * Allocate an aio_workq_entry and fill it in.  If all goes well return 0
 * and pass the aio_workq_entry pointer back to our caller.
 *
 * Parameters:	procp		The process making the request
 *		aiocbp		The aio context buffer pointer
 *		group_tag	The group tag used to indicate a
 *				group of operations has completed
 *		entrypp		Pointer to the pointer to receive the
 *				address of the created aio_workq_entry
 *
 * Returns:	0		Successfully created
 *		EAGAIN		Try again (usually resource shortage)
 *
 * Notes:	We get a reference to our caller's user land map in order
 *		to keep it around while we are processing the request.
 *
 *		lio_listio calls behave differently at completion: they do
 *		completion notification when all async IO requests have
 *		completed.  We use group_tag to tag IO requests that behave
 *		in the delay notification manner.
 *
 *		All synchronous operations are considered to not have a
 *		signal routine associated with them (sigp == USER_ADDR_NULL).
 */
lio_create_entry(proc_t procp, user_addr_t aiocbp, void *group_tag,
		 aio_workq_entry **entrypp )
    aio_workq_entry	*entryp;

    entryp = aio_create_queue_entry( procp, aiocbp, group_tag, AIO_LIO);
    if ( entryp == NULL ) {

    /*
     * Look for lio_listio LIO_NOP requests and ignore them; this is
     * not really an error, but we need to free our aio_workq_entry.
     */
    if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {

    if ( entryp != NULL ) {
	/*
	 * This entry has not been queued up so no worries about
	 * unlocked state and aio_map
	 */
	aio_free_request(entryp);
    }

} /* lio_create_entry */

/*
 * aio_free_request - remove our reference on the user land map and
 * free the work queue entry resources.  The entry is off all lists
 * and has zero refcount, so no one can have a pointer to it.
 */
aio_free_request(aio_workq_entry *entryp)
    /* remove our reference to the user land map. */
    if ( VM_MAP_NULL != entryp->aio_map) {
	vm_map_deallocate(entryp->aio_map);
    }

    /* remove our reference to thread which enqueued the request */
    if ( NULL != entryp->thread ) {
	thread_deallocate( entryp->thread );
    }

    entryp->aio_refcount = -1; /* A bit of poisoning in case of bad refcounting. */

    zfree( aio_workq_zonep, entryp );

} /* aio_free_request */
/*
 * aio_validate
 *
 * validate the aiocb passed in by one of the aio syscalls.
 */
static int
aio_validate( aio_workq_entry *entryp )
{
	struct fileproc		*fp;
	int			flag;
	int			result;

	result = 0;

	if ( (entryp->flags & AIO_LIO) != 0 ) {
		if ( entryp->aiocb.aio_lio_opcode == LIO_READ )
			entryp->flags |= AIO_READ;
		else if ( entryp->aiocb.aio_lio_opcode == LIO_WRITE )
			entryp->flags |= AIO_WRITE;
		else if ( entryp->aiocb.aio_lio_opcode == LIO_NOP )
			return( 0 );
		else
			return( EINVAL );
	}

	flag = FREAD;
	if ( (entryp->flags & (AIO_WRITE | AIO_FSYNC | AIO_DSYNC)) != 0 ) {
		flag = FWRITE;
	}

	if ( (entryp->flags & (AIO_READ | AIO_WRITE)) != 0 ) {
		if ( entryp->aiocb.aio_nbytes > INT_MAX ||
		     entryp->aiocb.aio_buf == USER_ADDR_NULL ||
		     entryp->aiocb.aio_offset < 0 )
			return( EINVAL );
	}

	/*
	 * validate aiocb.aio_sigevent.  at this point we only support
	 * sigev_notify equal to SIGEV_SIGNAL or SIGEV_NONE.  this means
	 * sigev_value, sigev_notify_function, and sigev_notify_attributes
	 * are ignored, since SIGEV_THREAD is unsupported.  This is consistent
	 * with no [RTS] (Realtime Signal) option group support.
	 */
	switch ( entryp->aiocb.aio_sigevent.sigev_notify ) {
	case SIGEV_SIGNAL:
	    {
		int	signum;

		/* make sure we have a valid signal number */
		signum = entryp->aiocb.aio_sigevent.sigev_signo;
		if ( signum <= 0 || signum >= NSIG ||
		     signum == SIGKILL || signum == SIGSTOP )
			return( EINVAL );
	    }
	    break;

	case SIGEV_NONE:
		break;

	case SIGEV_THREAD:
		/* Unsupported [RTS] */

	default:
		return( EINVAL );
	}

	/* validate the file descriptor and that the file was opened
	 * for the appropriate read / write access.
	 */
	proc_fdlock(entryp->procp);

	result = fp_lookup( entryp->procp, entryp->aiocb.aio_fildes, &fp, 1);
	if ( result == 0 ) {
		if ( (fp->f_fglob->fg_flag & flag) == 0 ) {
			/* we don't have read or write access */
			result = EBADF;
		}
		else if ( FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_VNODE ) {
			/* this is not a file */
			result = ESPIPE;
		} else
			fp->f_flags |= FP_AIOISSUED;

		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 1);
	}
	else {
		result = EBADF;
	}

	proc_fdunlock(entryp->procp);

	return( result );

} /* aio_validate */
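/*
 * For reference, a request that passes the validation above looks like this
 * from user space (a minimal sketch, not kernel code; assumes fd and buf are
 * set up by the caller):
 *
 *	#include <aio.h>
 *	#include <signal.h>
 *	#include <stdio.h>
 *
 *	struct aiocb cb = { 0 };
 *
 *	cb.aio_fildes = fd;		// must reference a vnode opened for reading
 *	cb.aio_buf    = buf;		// non-NULL user buffer
 *	cb.aio_nbytes = sizeof(buf);	// must not exceed INT_MAX
 *	cb.aio_offset = 0;		// must be non-negative
 *
 *	// SIGEV_SIGNAL with a catchable signal; SIGEV_THREAD would fail with
 *	// EINVAL since the [RTS] notification path is not supported here.
 *	cb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
 *	cb.aio_sigevent.sigev_signo  = SIGUSR1;
 *
 *	if (aio_read(&cb) == -1)
 *		perror("aio_read");
 */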
static int
aio_increment_total_count()
{
	return OSIncrementAtomic(&aio_anchor.aio_total_count);
}

static int
aio_decrement_total_count()
{
	int old = OSDecrementAtomic(&aio_anchor.aio_total_count);
	if (old <= 0) {
		panic("Negative total AIO count!\n");
	}

	return old;
}

int
aio_get_process_count(proc_t procp )
{
	return procp->p_aio_total_count;

} /* aio_get_process_count */

int
aio_get_all_queues_count( void )
{
	return aio_anchor.aio_total_count;

} /* aio_get_all_queues_count */
/*
 * do_aio_completion.  Handle async IO completion.
 */
static void
do_aio_completion( aio_workq_entry *entryp )
{
	boolean_t		lastLioCompleted = FALSE;
	aio_lio_context		*lio_context = NULL;
	int			waiter = 0;

	lio_context = (aio_lio_context *)entryp->group_tag;

	if (lio_context != NULL) {

		aio_proc_lock_spin(entryp->procp);

		/* Account for this I/O completing. */
		lio_context->io_completed++;

		/* Are we done with this lio context? */
		if (lio_context->io_issued == lio_context->io_completed) {
			lastLioCompleted = TRUE;
		}

		waiter = lio_context->io_waiter;

		/* explicit wakeup of lio_listio() waiting in LIO_WAIT */
		if ((entryp->flags & AIO_LIO_NOTIFY) && (lastLioCompleted) && (waiter != 0)) {
			/* wake up the waiter */
			wakeup(lio_context);
		}

		aio_proc_unlock(entryp->procp);
	}

	if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
	     (entryp->flags & AIO_DISABLE) == 0 ) {

		boolean_t	performSignal = FALSE;
		if (lio_context == NULL) {
			performSignal = TRUE;
		}
		else {
			/*
			 * If this was the last request in the group and a signal
			 * is desired, send one.
			 */
			performSignal = lastLioCompleted;
		}

		if (performSignal) {

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) | DBG_FUNC_NONE,
				      (int)entryp->procp, (int)entryp->uaiocbp,
				      entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 );

			psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );
		}
	}

	if ((entryp->flags & AIO_EXIT_WAIT) && (entryp->flags & AIO_CLOSE_WAIT)) {
		panic("Close and exit flags set at the same time\n");
	}

	/*
	 * need to handle case where a process is trying to exit, exec, or
	 * close and is currently waiting for active aio requests to complete.
	 * If AIO_CLEANUP_WAIT is set then we need to look to see if there are any
	 * other requests in the active queue for this process.  If there are
	 * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel.
	 * If there are some still active then do nothing - we only want to
	 * wakeup when all active aio requests for the process are complete.
	 *
	 * Don't need to lock the entry or proc to check the cleanup flag.  It can only be
	 * set for cancellation, while the entryp is still on a proc list; now it's
	 * off, so that flag is already set if it's going to be.
	 */
	if ( (entryp->flags & AIO_EXIT_WAIT) != 0 ) {
		int		active_requests;

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
			      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

		aio_proc_lock_spin(entryp->procp);
		active_requests = aio_active_requests_for_process( entryp->procp );
		if ( active_requests < 1 ) {
			/*
			 * no active aio requests for this process, continue exiting.  In this
			 * case, there should be no one else waiting on the proc in AIO...
			 */
			wakeup_one((caddr_t)&entryp->procp->AIO_CLEANUP_SLEEP_CHAN);
			aio_proc_unlock(entryp->procp);

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
				      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
		} else {
			aio_proc_unlock(entryp->procp);
		}
	}

	if ( (entryp->flags & AIO_CLOSE_WAIT) != 0 ) {
		int		active_requests;

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
			      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

		aio_proc_lock_spin(entryp->procp);
		active_requests = aio_proc_active_requests_for_file( entryp->procp, entryp->aiocb.aio_fildes);
		if ( active_requests < 1 ) {
			/* Can't wakeup_one(); multiple closes might be in progress. */
			wakeup(&entryp->procp->AIO_CLEANUP_SLEEP_CHAN);
			aio_proc_unlock(entryp->procp);

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
				      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
		} else {
			aio_proc_unlock(entryp->procp);
		}
	}

	/*
	 * A thread in aio_suspend() wants to know about completed IOs.  If it checked
	 * the done list before we moved our AIO there, then it already asserted its wait,
	 * and we can wake it up without holding the lock.  If it checked the list after
	 * we did our move, then it has already seen the AIO that we moved.  Either way,
	 * we can do our wakeup without holding the lock.
	 */
	wakeup( (caddr_t) &entryp->procp->AIO_SUSPEND_SLEEP_CHAN );
	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE,
		      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

	/*
	 * free the LIO context if the last lio completed and no thread is
	 * waiting on it.
	 */
	if (lastLioCompleted && (waiter == 0))
		free_lio_context (lio_context);

} /* do_aio_completion */
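/*
 * For reference, the AIO_SUSPEND_SLEEP_CHAN wakeup issued above is what
 * unblocks a caller of aio_suspend(2).  A minimal user space sketch (not
 * kernel code; assumes cb was submitted earlier with aio_read/aio_write):
 *
 *	#include <aio.h>
 *
 *	const struct aiocb *wait_list[1] = { &cb };
 *
 *	// Block until cb completes (or a signal interrupts the wait), then
 *	// collect the result with aio_error()/aio_return().
 *	if (aio_suspend(wait_list, 1, NULL) == 0 && aio_error(&cb) == 0) {
 *		ssize_t nbytes = aio_return(&cb);
 *		// nbytes is the byte count from the completed request
 *	}
 */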
static int
do_aio_read( aio_workq_entry *entryp )
{
	struct fileproc		*fp;
	int			error;
	struct vfs_context	context;

	if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp, 0)) )
		return(error);
	if ( (fp->f_fglob->fg_flag & FREAD) == 0 ) {
		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
		return(EBADF);
	}

	context.vc_thread = entryp->thread;	/* XXX */
	context.vc_ucred = fp->f_fglob->fg_cred;

	error = dofileread(&context, fp,
			   entryp->aiocb.aio_buf,
			   entryp->aiocb.aio_nbytes,
			   entryp->aiocb.aio_offset, FOF_OFFSET,
			   &entryp->returnval);
	fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);

	return( error );

} /* do_aio_read */
static int
do_aio_write( aio_workq_entry *entryp )
{
	struct fileproc		*fp;
	int			error;
	int			flags;
	struct vfs_context	context;

	if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp, 0)) )
		return(error);
	if ( (fp->f_fglob->fg_flag & FWRITE) == 0 ) {
		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
		return(EBADF);
	}

	flags = FOF_PCRED;
	if ( (fp->f_fglob->fg_flag & O_APPEND) == 0 ) {
		flags |= FOF_OFFSET;
	}

	context.vc_thread = entryp->thread;	/* XXX */
	context.vc_ucred = fp->f_fglob->fg_cred;

	/* NB: tell dofilewrite the offset, and to use the proc cred */
	error = dofilewrite(&context,
			    fp,
			    entryp->aiocb.aio_buf,
			    entryp->aiocb.aio_nbytes,
			    entryp->aiocb.aio_offset,
			    flags,
			    &entryp->returnval);

	if (entryp->returnval)
		fp_drop_written(entryp->procp, entryp->aiocb.aio_fildes, fp);
	else
		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);

	return( error );

} /* do_aio_write */
/*
 * aio_active_requests_for_process - return number of active async IO
 * requests for the given process.
 */
static int
aio_active_requests_for_process(proc_t procp )
{
	return( procp->p_aio_active_count );

} /* aio_active_requests_for_process */

/*
 * Called with the proc locked.
 */
static int
aio_proc_active_requests_for_file(proc_t procp, int fd)
{
	int count = 0;
	aio_workq_entry *entryp;

	TAILQ_FOREACH(entryp, &procp->p_aio_activeq, aio_proc_link) {
		if (entryp->aiocb.aio_fildes == fd) {
			count++;
		}
	}

	return count;
} /* aio_proc_active_requests_for_file */
static int
do_aio_fsync( aio_workq_entry *entryp )
{
	struct vfs_context	context;
	struct vnode		*vp;
	struct fileproc		*fp;
	int			sync_flag;
	int			error;

	/*
	 * We are never called unless either AIO_FSYNC or AIO_DSYNC are set.
	 *
	 * If AIO_DSYNC is set, we can tell the lower layers that it is OK
	 * to mark for update the metadata not strictly necessary for data
	 * retrieval, rather than forcing it to disk.
	 *
	 * If AIO_FSYNC is set, we have to also wait until metadata not really
	 * necessary for data retrieval is committed to stable storage (e.g.
	 * atime, mtime, ctime, etc.).
	 *
	 * Metadata necessary for data retrieval must be committed to stable
	 * storage in either case (file length, etc.).
	 */
	if (entryp->flags & AIO_FSYNC)
		sync_flag = MNT_WAIT;
	else
		sync_flag = MNT_DWAIT;

	error = fp_getfvp( entryp->procp, entryp->aiocb.aio_fildes, &fp, &vp);
	if ( error == 0 ) {
		if ( (error = vnode_getwithref(vp)) ) {
			fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
			entryp->returnval = -1;
			return(error);
		}
		context.vc_thread = current_thread();
		context.vc_ucred = fp->f_fglob->fg_cred;

		error = VNOP_FSYNC( vp, sync_flag, &context);

		(void)vnode_put(vp);

		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
	}
	if ( error != 0 )
		entryp->returnval = -1;

	return( error );

} /* do_aio_fsync */
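/*
 * For reference, the MNT_WAIT / MNT_DWAIT split above is driven by the op
 * argument of aio_fsync(2): O_SYNC requests a full fsync (AIO_FSYNC), while
 * O_DSYNC requests a data-only sync (AIO_DSYNC).  A minimal user space
 * sketch (not kernel code; assumes cb.aio_fildes refers to an open file):
 *
 *	#include <aio.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *
 *	// O_SYNC  -> AIO_FSYNC -> MNT_WAIT  (data plus all metadata, e.g. times)
 *	// O_DSYNC -> AIO_DSYNC -> MNT_DWAIT (data plus metadata needed to read it)
 *	if (aio_fsync(O_DSYNC, &cb) == -1)
 *		perror("aio_fsync");
 */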
/*
 * is_already_queued - runs through our queues to see if the given
 * aiocbp / process is there.  Returns TRUE if there is a match
 * on any of our aio queues.
 *
 * Called with proc aio lock held (can be held spin)
 */
static boolean_t
is_already_queued(proc_t procp,
		  user_addr_t aiocbp )
{
	aio_workq_entry		*entryp;
	boolean_t		result;

	result = FALSE;

	/* look for matches on our queue of async IO requests that have completed */
	TAILQ_FOREACH( entryp, &procp->p_aio_doneq, aio_proc_link ) {
		if ( aiocbp == entryp->uaiocbp ) {
			result = TRUE;
			goto ExitThisRoutine;
		}
	}

	/* look for matches on our queue of active async IO requests */
	TAILQ_FOREACH( entryp, &procp->p_aio_activeq, aio_proc_link ) {
		if ( aiocbp == entryp->uaiocbp ) {
			result = TRUE;
			goto ExitThisRoutine;
		}
	}

ExitThisRoutine:
	return( result );

} /* is_already_queued */


static void
free_lio_context(aio_lio_context* context)
{
#if DEBUG
	OSDecrementAtomic(&lio_contexts_alloced);
#endif

	FREE( context, M_TEMP );

} /* free_lio_context */
/*
 * aio initialization
 */
__private_extern__ void
aio_init( void )
{
	int		i;

	aio_lock_grp_attr = lck_grp_attr_alloc_init();
	aio_proc_lock_grp = lck_grp_alloc_init("aio_proc", aio_lock_grp_attr);
	aio_entry_lock_grp = lck_grp_alloc_init("aio_entry", aio_lock_grp_attr);
	aio_queue_lock_grp = lck_grp_alloc_init("aio_queue", aio_lock_grp_attr);
	aio_lock_attr = lck_attr_alloc_init();

	lck_mtx_init(&aio_entry_mtx, aio_entry_lock_grp, aio_lock_attr);
	lck_mtx_init(&aio_proc_mtx, aio_proc_lock_grp, aio_lock_attr);

	aio_anchor.aio_inflight_count = 0;
	aio_anchor.aio_done_count = 0;
	aio_anchor.aio_total_count = 0;
	aio_anchor.aio_num_workqs = AIO_NUM_WORK_QUEUES;

	for (i = 0; i < AIO_NUM_WORK_QUEUES; i++) {
		aio_workq_init(&aio_anchor.aio_async_workqs[i]);
	}

	i = sizeof( aio_workq_entry );
	aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );

	_aio_create_worker_threads( aio_worker_threads );

} /* aio_init */


/*
 * aio worker threads created here.
 */
__private_extern__ void
_aio_create_worker_threads( int num )
{
	int		i;

	/* create some worker threads to handle the async IO requests */
	for ( i = 0; i < num; i++ ) {
		thread_t	myThread;

		if ( KERN_SUCCESS != kernel_thread_start((thread_continue_t)aio_work_thread, NULL, &myThread) ) {
			printf( "%s - failed to create a work thread\n", __FUNCTION__ );
		}
		else
			thread_deallocate(myThread);
	}

	return;

} /* _aio_create_worker_threads */
/*
 * Return the current activation utask
 */
task_t
get_aiotask(void)
{
	return ((struct uthread *)get_bsdthread_info(current_thread()))->uu_aio_task;
}
/*
 * In the case of an aiocb from a
 * 32-bit process we need to expand some longs and pointers to the correct
 * sizes in order to let downstream code always work on the same type of
 * aiocb (in our case that is a user_aiocb).
 */
static void
do_munge_aiocb_user32_to_user( struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
{
	the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
	the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
	the_user_aiocbp->aio_buf = CAST_USER_ADDR_T(my_aiocbp->aio_buf);
	the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
	the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
	the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;

	/* special case here.  since we do not know if sigev_value is an */
	/* int or a ptr we do NOT cast the ptr to a user_addr_t.  This */
	/* means if we send this info back to user space we need to remember */
	/* sigev_value was not expanded for the 32-bit case. */
	/* NOTE - this does NOT affect us since we don't support sigev_value */
	/* yet in the aio context. */
	the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
	the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
	the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
		my_aiocbp->aio_sigevent.sigev_value.sival_int;
	the_user_aiocbp->aio_sigevent.sigev_notify_function =
		CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function);
	the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
		CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_attributes);
}
/* Similar for a 64-bit user process, so that we don't need to satisfy
 * the alignment constraints of the original user64_aiocb.
 */
static void
do_munge_aiocb_user64_to_user( struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
{
	the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
	the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
	the_user_aiocbp->aio_buf = my_aiocbp->aio_buf;
	the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
	the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
	the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;

	the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
	the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
	the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
		my_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int;
	the_user_aiocbp->aio_sigevent.sigev_notify_function =
		my_aiocbp->aio_sigevent.sigev_notify_function;
	the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
		my_aiocbp->aio_sigevent.sigev_notify_attributes;
}
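/*
 * For reference, the two munge routines above are selected by their callers
 * based on the bitness of the requesting process.  A condensed sketch of
 * that dispatch pattern (proc_is64bit() and copyin() are standard xnu
 * interfaces; the exact call sites live earlier in this file):
 *
 *	struct user_aiocb my_aiocb;
 *	int error;
 *
 *	if ( proc_is64bit(procp) ) {
 *		struct user64_aiocb aiocb64;
 *		error = copyin( aiocbp, &aiocb64, sizeof(aiocb64) );
 *		if ( error == 0 )
 *			do_munge_aiocb_user64_to_user( &aiocb64, &my_aiocb );
 *	} else {
 *		struct user32_aiocb aiocb32;
 *		error = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
 *		if ( error == 0 )
 *			do_munge_aiocb_user32_to_user( &aiocb32, &my_aiocb );
 *	}
 */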