bsd/kern/kern_aio.c

   1 /*
   2  * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * Copyright (c) 1999-2003 Apple Computer, Inc.  All Rights Reserved.
   7  *
   8  * This file contains Original Code and/or Modifications of Original Code
   9  * as defined in and that are subject to the Apple Public Source License
  10  * Version 2.0 (the 'License'). You may not use this file except in
  11  * compliance with the License. Please obtain a copy of the License at
  12  * http://www.opensource.apple.com/apsl/ and read it before using this
  13  * file.
  14  *
  15  * The Original Code and all software distributed under the License are
  16  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  17  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  18  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  20  * Please see the License for the specific language governing rights and
  21  * limitations under the License.
  22  *
  23  * @APPLE_LICENSE_HEADER_END@
  24  */
  25
  26
  27 /*
  28  * todo:
  29  *              1) ramesh is looking into how to replace taking a reference on
  30  *                      the user's map (vm_map_reference()) since it is believed that
  31  *                      would not hold the process for us.
  32  *              2) david is looking into a way for us to set the priority of the
  33  *                      worker threads to match that of the user's thread when the
  34  *                      async IO was queued.
  35  */
  36
  37
  38 /*
  39  * This file contains support for the POSIX 1003.1B AIO/LIO facility.
  40  */
  41
  42 #include <sys/systm.h>
  43 #include <sys/buf.h>
  44 #include <sys/fcntl.h>
  45 #include <sys/file.h>
  46 #include <sys/filedesc.h>
  47 #include <sys/kernel.h>
  48 #include <sys/vnode.h>
  49 #include <sys/malloc.h>
  50 #include <sys/mount.h>
  51 #include <sys/param.h>
  52 #include <sys/proc.h>
  53 #include <sys/sysctl.h>
  54 #include <sys/unistd.h>
  55 #include <sys/user.h>
  56
  57 #include <sys/aio_kern.h>
  58
  59 #include <machine/limits.h>
  60 #include <kern/zalloc.h>
  61 #include <kern/task.h>
  62
  63 #include <sys/kdebug.h>
  64 #define AIO_work_queued                                 1
  65 #define AIO_worker_wake                                 2
  66 #define AIO_completion_sig                              3
  67 #define AIO_completion_cleanup_wait             4
  68 #define AIO_completion_cleanup_wake             5
  69 #define AIO_completion_suspend_wake     6
  70 #define AIO_fsync_delay                                 7
  71 #define AIO_cancel                                              10
  72 #define AIO_cancel_async_workq                  11
  73 #define AIO_cancel_sync_workq                   12
  74 #define AIO_cancel_activeq                              13
  75 #define AIO_cancel_doneq                                14
  76 #define AIO_fsync                                               20
  77 #define AIO_read                                                30
  78 #define AIO_write                                               40
  79 #define AIO_listio                                              50
  80 #define AIO_error                                               60
  81 #define AIO_error_val                                   61
  82 #define AIO_error_activeq                               62
  83 #define AIO_error_workq                                 63
  84 #define AIO_return                                              70
  85 #define AIO_return_val                                  71
  86 #define AIO_return_activeq                              72
  87 #define AIO_return_workq                                73
  88 #define AIO_exec                                                80
  89 #define AIO_exit                                                90
  90 #define AIO_exit_sleep                                  91
  91 #define AIO_close                                               100
  92 #define AIO_close_sleep                                 101
  93 #define AIO_suspend                                             110
  94 #define AIO_suspend_sleep                               111
  95 #define AIO_worker_thread                               120
  96
  97 #if 0
  98 #undef KERNEL_DEBUG
  99 #define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
 100 #endif
 101
 102 /*
 103  * aio requests queue up on the aio_async_workq or lio_sync_workq (for
 104  * lio_listio LIO_WAIT).  Requests then move to the per process aio_activeq
 105  * (proc.aio_activeq) when one of our worker threads start the IO.
 106  * And finally, requests move to the per process aio_doneq (proc.aio_doneq)
 107  * when the IO request completes.  The request remains on aio_doneq until
 108  * user process calls aio_return or the process exits, either way that is our
 109  * trigger to release aio resources.
 110  */
 111 struct aio_anchor_cb
 112 {
 113         int                                                                     aio_async_workq_count;  /* entries on aio_async_workq */
 114         int                                                                     lio_sync_workq_count;   /* entries on lio_sync_workq */
 115         int                                                                     aio_active_count;       /* entries on all active queues (proc.aio_activeq) */
 116         int                                                                     aio_done_count;         /* entries on all done queues (proc.aio_doneq) */
 117         TAILQ_HEAD( , aio_workq_entry )         aio_async_workq;
 118         TAILQ_HEAD( , aio_workq_entry )         lio_sync_workq;
 119 };
 120 typedef struct aio_anchor_cb aio_anchor_cb;
 121
 122
 123 /*
 124  * Notes on aio sleep / wake channels.
 125  * We currently pick a couple fields within the proc structure that will allow
 126  * us sleep channels that currently do not collide with any other kernel routines.
 127  * At this time, for binary compatibility reasons, we cannot create new proc fields.
 128  */
 129 #define AIO_SUSPEND_SLEEP_CHAN  p_estcpu
 130 #define AIO_CLEANUP_SLEEP_CHAN  p_pctcpu
 131
 132
 133 /*
 134  * aysnc IO locking macros used to protect critical sections.
 135  */
 136 #define AIO_LOCK        usimple_lock( &aio_lock )
 137 #define AIO_UNLOCK      usimple_unlock( &aio_lock )
 138
 139
 140 /*
 141  *  LOCAL PROTOTYPES
 142  */
 143 static int                      aio_active_requests_for_process( struct proc *procp );
 144 static boolean_t        aio_delay_fsync_request( aio_workq_entry *entryp );
 145 static int                      aio_free_request( aio_workq_entry *entryp, vm_map_t the_map );
 146 static int                      aio_get_all_queues_count( void );
 147 static int                      aio_get_process_count( struct proc *procp );
 148 static aio_workq_entry *  aio_get_some_work( void );
 149 static boolean_t        aio_last_group_io( aio_workq_entry *entryp );
 150 static void                     aio_mark_requests( aio_workq_entry *entryp );
 151 static int                      aio_queue_async_request( struct proc *procp,
 152                                                                                          struct aiocb *aiocbp,
 153                                                                                          int kindOfIO );
 154 static int                      aio_validate( aio_workq_entry *entryp );
 155 static void                     aio_work_thread( void );
 156 static int                      do_aio_cancel(  struct proc *p,
 157                                                                         int fd,
 158                                                                         struct aiocb *aiocbp,
 159                                                                         boolean_t wait_for_completion,
 160                                                                         boolean_t disable_notification );
 161 static void                     do_aio_completion( aio_workq_entry *entryp );
 162 static int                      do_aio_fsync( aio_workq_entry *entryp );
 163 static int                      do_aio_read( aio_workq_entry *entryp );
 164 static int                      do_aio_write( aio_workq_entry *entryp );
 165 static boolean_t        is_already_queued(      struct proc *procp,
 166                                                                                 struct aiocb *aiocbp );
 167 static int                      lio_create_async_entry( struct proc *procp,
 168                                                                                          struct aiocb *aiocbp,
 169                                                                                          struct sigevent *sigp,
 170                                                                                          long group_tag,
 171                                                                                          aio_workq_entry **entrypp );
 172 static int                      lio_create_sync_entry( struct proc *procp,
 173                                                                                         struct aiocb *aiocbp,
 174                                                                                         long group_tag,
 175                                                                                         aio_workq_entry **entrypp );
 176
 177 /*
 178  *  EXTERNAL PROTOTYPES
 179  */
 180
 181 /* in ...bsd/kern/sys_generic.c */
 182 extern struct file*     holdfp( struct filedesc* fdp, int fd, int flag );
 183 extern int                      dofileread( struct proc *p, struct file *fp, int fd,
 184                                                                 void *buf, size_t nbyte, off_t offset,
 185                                                                 int flags, int *retval );
 186 extern int                      dofilewrite( struct proc *p, struct file *fp, int fd,
 187                                                                  const void *buf, size_t nbyte, off_t offset,
 188                                                                  int flags, int *retval );
 189 extern vm_map_t         vm_map_switch( vm_map_t    map );
 190
 191
 192 /*
 193  * aio external global variables.
 194  */
 195 extern int aio_max_requests;                            /* AIO_MAX - configurable */
 196 extern int aio_max_requests_per_process;        /* AIO_PROCESS_MAX - configurable */
 197 extern int aio_worker_threads;                          /* AIO_THREAD_COUNT - configurable */
 198
 199
 200 /*
 201  * aio static variables.
 202  */
 203 static aio_anchor_cb            aio_anchor;
 204 static simple_lock_data_t       aio_lock;
 205 static struct zone              *aio_workq_zonep;
 206
 207
 208 /*
 209  * syscall input parameters
 210  */
 211 #ifndef _SYS_SYSPROTO_H_
 212
 213 struct  aio_cancel_args {
 214         int                             fd;
 215         struct aiocb    *aiocbp;
 216 };
 217
 218 struct  aio_error_args {
 219         struct aiocb                    *aiocbp;
 220 };
 221
 222 struct  aio_fsync_args {
 223         int                                             op;
 224         struct aiocb                    *aiocbp;
 225 };
 226
 227 struct  aio_read_args {
 228         struct aiocb                    *aiocbp;
 229 };
 230
 231 struct  aio_return_args {
 232         struct aiocb    *aiocbp;
 233 };
 234
 235 struct  aio_suspend_args {
 236         struct aiocb *const     *aiocblist;
 237         int                                             nent;
 238         const struct timespec   *timeoutp;
 239 };
 240
 241 struct  aio_write_args {
 242         struct aiocb                    *aiocbp;
 243 };
 244
 245 struct  lio_listio_args {
 246         int                                             mode;
 247         struct aiocb *const     *aiocblist;
 248         int                                             nent;
 249         struct sigevent                 *sigp;
 250 };
 251
 252 #endif /* _SYS_SYSPROTO_H_ */
 253
 254
 255 /*
 256  * aio_cancel - attempt to cancel one or more async IO requests currently
 257  * outstanding against file descriptor uap->fd.  If uap->aiocbp is not
 258  * NULL then only one specific IO is cancelled (if possible).  If uap->aiocbp
 259  * is NULL then all outstanding async IO request for the given file
 260  * descriptor are cancelled (if possible).
 261  */
 262
 263 int
 264 aio_cancel( struct proc *p, struct aio_cancel_args *uap, int *retval )
 265 {
 266         struct aiocb                            my_aiocb;
 267         int                                                     result;
 268         boolean_t                                       funnel_state;
 269
 270         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START,
 271                           (int)p, (int)uap->aiocbp, 0, 0, 0 );
 272
 273         /* quick check to see if there are any async IO requests queued up */
 274         AIO_LOCK;
 275         result = aio_get_all_queues_count( );
 276         AIO_UNLOCK;
 277         if ( result < 1 ) {
 278                 result = EBADF;
 279                 goto ExitRoutine;
 280         }
 281
 282         *retval = -1;
 283         if ( uap->aiocbp != NULL ) {
 284                 result = copyin( uap->aiocbp, &my_aiocb, sizeof(my_aiocb) );
 285                 if ( result != 0 ) {
 286                         result = EAGAIN;
 287                         goto ExitRoutine;
 288                 }
 289
 290                 /* NOTE - POSIX standard says a mismatch between the file */
 291                 /* descriptor passed in and the file descriptor embedded in */
 292                 /* the aiocb causes unspecified results.  We return EBADF in */
 293                 /* that situation.  */
 294                 if ( uap->fd != my_aiocb.aio_fildes ) {
 295                         result = EBADF;
 296                         goto ExitRoutine;
 297                 }
 298         }
 299
 300         /* current BSD code assumes funnel lock is held */
 301         funnel_state = thread_funnel_set( kernel_flock, TRUE );
 302         result = do_aio_cancel( p, uap->fd, uap->aiocbp, FALSE, FALSE );
 303         (void) thread_funnel_set( kernel_flock, funnel_state );
 304
 305         if ( result != -1 ) {
 306                 *retval = result;
 307                 result = 0;
 308                 goto ExitRoutine;
 309         }
 310
 311         result = EBADF;
 312
 313 ExitRoutine:
 314         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END,
 315                           (int)p, (int)uap->aiocbp, result, 0, 0 );
 316
 317         return( result );
 318
 319 } /* aio_cancel */
 320
 321
 322 /*
 323  * _aio_close - internal function used to clean up async IO requests for
 324  * a file descriptor that is closing.
 325  * NOTE - kernel funnel lock is held when we get called.
 326  * THIS MAY BLOCK.
 327  */
 328
 329 __private_extern__ void
 330 _aio_close( struct proc *p, int fd )
 331 {
 332         int                     error, count;
 333
 334         /* quick check to see if there are any async IO requests queued up */
 335         AIO_LOCK;
 336         count = aio_get_all_queues_count( );
 337         AIO_UNLOCK;
 338         if ( count < 1 )
 339                 return;
 340
 341         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_START,
 342                           (int)p, fd, 0, 0, 0 );
 343
 344         /* cancel all async IO requests on our todo queues for this file descriptor */
 345         error = do_aio_cancel( p, fd, NULL, TRUE, FALSE );
 346         if ( error == AIO_NOTCANCELED ) {
 347                 /*
 348                  * AIO_NOTCANCELED is returned when we find an aio request for this process
 349                  * and file descriptor on the active async IO queue.  Active requests cannot
 350                  * be cancelled so we must wait for them to complete.  We will get a special
 351                  * wake up call on our channel used to sleep for ALL active requests to
 352                  * complete.  This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
 353                  * when we must wait for all active aio requests.
 354                  */
 355
 356                 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) | DBG_FUNC_NONE,
 357                                   (int)p, fd, 0, 0, 0 );
 358
 359                 tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_close", 0 );
 360         }
 361
 362         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_END,
 363                           (int)p, fd, 0, 0, 0 );
 364
 365         return;
 366
 367 } /* _aio_close */
 368
 369
 370 /*
 371  * aio_error - return the error status associated with the async IO
 372  * request referred to by uap->aiocbp.  The error status is the errno
 373  * value that would be set by the corresponding IO request (read, wrtie,
 374  * fdatasync, or sync).
 375  */
 376
 377 int
 378 aio_error( struct proc *p, struct aio_error_args *uap, int *retval )
 379 {
 380         aio_workq_entry                         *entryp;
 381         int                                                     error;
 382
 383         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_START,
 384                           (int)p, (int)uap->aiocbp, 0, 0, 0 );
 385
 386         AIO_LOCK;
 387
 388         /* quick check to see if there are any async IO requests queued up */
 389         if ( aio_get_all_queues_count( ) < 1 ) {
 390                 error = EINVAL;
 391                 goto ExitRoutine;
 392         }
 393
 394         /* look for a match on our queue of async IO requests that have completed */
 395         TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
 396                 if ( entryp->uaiocbp == uap->aiocbp ) {
 397                         *retval = entryp->errorval;
 398                         error = 0;
 399                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) | DBG_FUNC_NONE,
 400                                            (int)p, (int)uap->aiocbp, *retval, 0, 0 );
 401                         goto ExitRoutine;
 402                 }
 403         }
 404
 405         /* look for a match on our queue of active async IO requests */
 406         TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
 407                 if ( entryp->uaiocbp == uap->aiocbp ) {
 408                         *retval = EINPROGRESS;
 409                         error = 0;
 410                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) | DBG_FUNC_NONE,
 411                                            (int)p, (int)uap->aiocbp, *retval, 0, 0 );
 412                         goto ExitRoutine;
 413                 }
 414         }
 415
 416         /* look for a match on our queue of todo work */
 417         TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
 418                 if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
 419                         *retval = EINPROGRESS;
 420                         error = 0;
 421                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_workq)) | DBG_FUNC_NONE,
 422                                            (int)p, (int)uap->aiocbp, *retval, 0, 0 );
 423                         goto ExitRoutine;
 424                 }
 425         }
 426         error = EINVAL;
 427
 428 ExitRoutine:
 429         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_END,
 430                           (int)p, (int)uap->aiocbp, error, 0, 0 );
 431         AIO_UNLOCK;
 432
 433         return( error );
 434
 435 } /* aio_error */
 436
 437
 438 /*
 439  * aio_fsync - asynchronously force all IO operations associated
 440  * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
 441  * queued at the time of the call to the synchronized completion state.
 442  * NOTE - we do not support op O_DSYNC at this point since we do not support the
 443  * fdatasync() call.
 444  */
 445
 446 int
 447 aio_fsync( struct proc *p, struct aio_fsync_args *uap, int *retval )
 448 {
 449         int                     error;
 450         int                     fsync_kind;
 451
 452         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_START,
 453                           (int)p, (int)uap->aiocbp, uap->op, 0, 0 );
 454
 455         *retval = 0;
 456         if ( uap->op == O_SYNC )
 457                 fsync_kind = AIO_FSYNC;
 458 #if 0 // we don't support fdatasync() call yet
 459         else if ( uap->op == O_DSYNC )
 460                 fsync_kind = AIO_DSYNC;
 461 #endif
 462         else {
 463                 *retval = -1;
 464                 error = EINVAL;
 465                 goto ExitRoutine;
 466         }
 467
 468         error = aio_queue_async_request( p, uap->aiocbp, fsync_kind );
 469         if ( error != 0 )
 470                 *retval = -1;
 471
 472 ExitRoutine:
 473         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_END,
 474                           (int)p, (int)uap->aiocbp, error, 0, 0 );
 475
 476         return( error );
 477
 478 } /* aio_fsync */
 479
 480
 481 /* aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
 482  * file descriptor (uap->aiocbp->aio_fildes) into the buffer
 483  * (uap->aiocbp->aio_buf).
 484  */
 485
 486 int
 487 aio_read( struct proc *p, struct aio_read_args *uap, int *retval )
 488 {
 489         int                     error;
 490
 491         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_START,
 492                           (int)p, (int)uap->aiocbp, 0, 0, 0 );
 493
 494         *retval = 0;
 495
 496         error = aio_queue_async_request( p, uap->aiocbp, AIO_READ );
 497         if ( error != 0 )
 498                 *retval = -1;
 499
 500         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_END,
 501                           (int)p, (int)uap->aiocbp, error, 0, 0 );
 502
 503         return( error );
 504
 505 } /* aio_read */
 506
 507
 508 /*
 509  * aio_return - return the return status associated with the async IO
 510  * request referred to by uap->aiocbp.  The return status is the value
 511  * that would be returned by corresponding IO request (read, wrtie,
 512  * fdatasync, or sync).  This is where we release kernel resources
 513  * held for async IO call associated with the given aiocb pointer.
 514  */
 515
 516 int
 517 aio_return( struct proc *p, struct aio_return_args *uap, register_t *retval )
 518 {
 519         aio_workq_entry                         *entryp;
 520         int                                                     error;
 521         boolean_t                                       lock_held;
 522
 523         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_START,
 524                           (int)p, (int)uap->aiocbp, 0, 0, 0 );
 525
 526         AIO_LOCK;
 527         lock_held = TRUE;
 528         *retval = 0;
 529
 530         /* quick check to see if there are any async IO requests queued up */
 531         if ( aio_get_all_queues_count( ) < 1 ) {
 532                 error = EINVAL;
 533                 goto ExitRoutine;
 534         }
 535
 536         /* look for a match on our queue of async IO requests that have completed */
 537         TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
 538                 if ( entryp->uaiocbp == uap->aiocbp ) {
 539                         TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
 540                         aio_anchor.aio_done_count--;
 541                         p->aio_done_count--;
 542
 543                         *retval = entryp->returnval;
 544
 545                         /* we cannot free requests that are still completing */
 546                         if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
 547                                 vm_map_t                my_map;
 548
 549                                 my_map = entryp->aio_map;
 550                                 entryp->aio_map = VM_MAP_NULL;
 551                                 AIO_UNLOCK;
 552                                 lock_held = FALSE;
 553                                 aio_free_request( entryp, my_map );
 554                         }
 555                         else
 556                                 /* tell completion code to free this request */
 557                                 entryp->flags |= AIO_DO_FREE;
 558                         error = 0;
 559                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) | DBG_FUNC_NONE,
 560                                            (int)p, (int)uap->aiocbp, *retval, 0, 0 );
 561                         goto ExitRoutine;
 562                 }
 563         }
 564
 565         /* look for a match on our queue of active async IO requests */
 566         TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
 567                 if ( entryp->uaiocbp == uap->aiocbp ) {
 568                         error = EINPROGRESS;
 569                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) | DBG_FUNC_NONE,
 570                                            (int)p, (int)uap->aiocbp, *retval, 0, 0 );
 571                         goto ExitRoutine;
 572                 }
 573         }
 574
 575         /* look for a match on our queue of todo work */
 576         TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
 577                 if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
 578                         error = EINPROGRESS;
 579                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_workq)) | DBG_FUNC_NONE,
 580                                            (int)p, (int)uap->aiocbp, *retval, 0, 0 );
 581                         goto ExitRoutine;
 582                 }
 583         }
 584         error = EINVAL;
 585
 586 ExitRoutine:
 587         if ( lock_held )
 588                 AIO_UNLOCK;
 589         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_END,
 590                           (int)p, (int)uap->aiocbp, error, 0, 0 );
 591
 592         return( error );
 593
 594 } /* aio_return */
 595
 596
 597 /*
 598  * _aio_exec - internal function used to clean up async IO requests for
 599  * a process that is going away due to exec().  We cancel any async IOs
 600  * we can and wait for those already active.  We also disable signaling
 601  * for cancelled or active aio requests that complete.
 602  * NOTE - kernel funnel lock is held when we get called.
 603  * This routine MAY block!
 604  */
 605
 606 __private_extern__ void
 607 _aio_exec( struct proc *p )
 608 {
 609
 610         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_START,
 611                           (int)p, 0, 0, 0, 0 );
 612
 613         _aio_exit( p );
 614
 615         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_END,
 616                           (int)p, 0, 0, 0, 0 );
 617
 618         return;
 619
 620 } /* _aio_exec */
 621
 622
 623 /*
 624  * _aio_exit - internal function used to clean up async IO requests for
 625  * a process that is terminating (via exit() or exec() ).  We cancel any async IOs
 626  * we can and wait for those already active.  We also disable signaling
 627  * for cancelled or active aio requests that complete.  This routine MAY block!
 628  * NOTE - kernel funnel lock is held when we get called.
 629  */
 630
 631 __private_extern__ void
 632 _aio_exit( struct proc *p )
 633 {
 634         int                                             error, count;
 635         aio_workq_entry                 *entryp;
 636
 637         /* quick check to see if there are any async IO requests queued up */
 638         AIO_LOCK;
 639         count = aio_get_all_queues_count( );
 640         AIO_UNLOCK;
 641         if ( count < 1 ) {
 642                 return;
 643         }
 644
 645         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_START,
 646                           (int)p, 0, 0, 0, 0 );
 647
 648         /*
 649          * cancel async IO requests on the todo work queue and wait for those
 650          * already active to complete.
 651          */
 652         error = do_aio_cancel( p, 0, NULL, TRUE, TRUE );
 653         if ( error == AIO_NOTCANCELED ) {
 654                 /*
 655                  * AIO_NOTCANCELED is returned when we find an aio request for this process
 656                  * on the active async IO queue.  Active requests cannot be cancelled so we
 657                  * must wait for them to complete.  We will get a special wake up call on
 658                  * our channel used to sleep for ALL active requests to complete.  This sleep
 659                  * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
 660                  * active aio requests.
 661                  */
 662
 663                 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) | DBG_FUNC_NONE,
 664                                   (int)p, 0, 0, 0, 0 );
 665
 666                 tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_exit", 0 );
 667         }
 668
 669         /* release all aio resources used by this process */
 670         AIO_LOCK;
 671         entryp = TAILQ_FIRST( &p->aio_doneq );
 672         while ( entryp != NULL ) {
 673                 aio_workq_entry                 *next_entryp;
 674
 675                 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
 676                 TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
 677                 aio_anchor.aio_done_count--;
 678                 p->aio_done_count--;
 679
 680                 /* we cannot free requests that are still completing */
 681                 if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
 682                         vm_map_t                my_map;
 683
 684                         my_map = entryp->aio_map;
 685                         entryp->aio_map = VM_MAP_NULL;
 686                         AIO_UNLOCK;
 687                         aio_free_request( entryp, my_map );
 688
 689                         /* need to start over since aio_doneq may have been */
 690                         /* changed while we were away.  */
 691                         AIO_LOCK;
 692                         entryp = TAILQ_FIRST( &p->aio_doneq );
 693                         continue;
 694                 }
 695                 else
 696                         /* tell completion code to free this request */
 697                         entryp->flags |= AIO_DO_FREE;
 698                 entryp = next_entryp;
 699         }
 700         AIO_UNLOCK;
 701
 702 ExitRoutine:
 703         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END,
 704                           (int)p, 0, 0, 0, 0 );
 705
 706         return;
 707
 708 } /* _aio_exit */
 709
 710
 711 /*
 712  * do_aio_cancel - cancel async IO requests (if possible).  We get called by
 713  * aio_cancel, close, and at exit.
 714  * There are three modes of operation: 1) cancel all async IOs for a process -
 715  * fd is 0 and aiocbp is NULL 2) cancel all async IOs for file descriptor - fd
 716  * is > 0 and aiocbp is NULL 3) cancel one async IO associated with the given
 717  * aiocbp.
 718  * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
 719  * target async IO requests, AIO_NOTCANCELED if we could not cancel all
 720  * target async IO requests, and AIO_ALLDONE if all target async IO requests
 721  * were already complete.
 722  * WARNING - do not deference aiocbp in this routine, it may point to user
 723  * land data that has not been copied in (when called from aio_cancel() )
 724  * NOTE - kernel funnel lock is held when we get called.
 725  */
 726
 727 static int
 728 do_aio_cancel(  struct proc *p, int fd, struct aiocb *aiocbp,
 729                                 boolean_t wait_for_completion, boolean_t disable_notification )
 730 {
 731         aio_workq_entry                 *entryp;
 732         int                                             result;
 733
 734         result = -1;
 735
 736         /* look for a match on our queue of async todo work. */
 737         AIO_LOCK;
 738         entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
 739         while ( entryp != NULL ) {
 740                 aio_workq_entry                 *next_entryp;
 741
 742                 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
 743                 if ( p == entryp->procp ) {
 744                         if ( (aiocbp == NULL && fd == 0) ||
 745                                  (aiocbp != NULL && entryp->uaiocbp == aiocbp) ||
 746                                  (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) {
 747                                 /* we found a match so we remove the entry from the */
 748                                 /* todo work queue and place it on the done queue */
 749                                 TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
 750                                 aio_anchor.aio_async_workq_count--;
 751                                 entryp->errorval = ECANCELED;
 752                                 entryp->returnval = -1;
 753                                 if ( disable_notification )
 754                                         entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
 755                                 result = AIO_CANCELED;
 756
 757                                 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) | DBG_FUNC_NONE,
 758                                                           (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
 759
 760                                 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
 761                                 aio_anchor.aio_done_count++;
 762                                 p->aio_done_count++;
 763                                 entryp->flags |= AIO_COMPLETION;
 764                                 AIO_UNLOCK;
 765
 766                                 /* do completion processing for this request */
 767                                 do_aio_completion( entryp );
 768
 769                                 AIO_LOCK;
 770                                 entryp->flags &= ~AIO_COMPLETION;
 771                                 if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
 772                                         vm_map_t                my_map;
 773
 774                                         my_map = entryp->aio_map;
 775                                         entryp->aio_map = VM_MAP_NULL;
 776                                         AIO_UNLOCK;
 777                                         aio_free_request( entryp, my_map );
 778                                 }
 779                                 else
 780                                         AIO_UNLOCK;
 781
 782                                 if ( aiocbp != NULL ) {
 783                                         return( result );
 784                                 }
 785
 786                                 /* need to start over since aio_async_workq may have been */
 787                                 /* changed while we were away doing completion processing.  */
 788                                 AIO_LOCK;
 789                                 entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
 790                                 continue;
 791                         }
 792                 }
 793                 entryp = next_entryp;
 794         } /* while... */
 795
 796         /*
 797          * look for a match on our queue of synchronous todo work.  This will
 798          * be a rare occurrence but could happen if a process is terminated while
 799          * processing a lio_listio call.
 800          */
 801         entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
 802         while ( entryp != NULL ) {
 803                 aio_workq_entry                 *next_entryp;
 804
 805                 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
 806                 if ( p == entryp->procp ) {
 807                         if ( (aiocbp == NULL && fd == 0) ||
 808                                  (aiocbp != NULL && entryp->uaiocbp == aiocbp) ||
 809                                  (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) {
 810                                 /* we found a match so we remove the entry from the */
 811                                 /* todo work queue and place it on the done queue */
 812                                 TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
 813                                 aio_anchor.lio_sync_workq_count--;
 814                                 entryp->errorval = ECANCELED;
 815                                 entryp->returnval = -1;
 816                                 if ( disable_notification )
 817                                         entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
 818                                 result = AIO_CANCELED;
 819
 820                                 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_sync_workq)) | DBG_FUNC_NONE,
 821                                                           (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
 822
 823                                 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
 824                                 aio_anchor.aio_done_count++;
 825                                 p->aio_done_count++;
 826                                 if ( aiocbp != NULL ) {
 827                                         AIO_UNLOCK;
 828                                         return( result );
 829                                 }
 830                         }
 831                 }
 832                 entryp = next_entryp;
 833         } /* while... */
 834
 835         /*
 836          * look for a match on our queue of active async IO requests and
 837          * return AIO_NOTCANCELED result.
 838          */
 839         TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
 840                 if ( (aiocbp == NULL && fd == 0) ||
 841                          (aiocbp != NULL && entryp->uaiocbp == aiocbp) ||
 842                          (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) {
 843                         result = AIO_NOTCANCELED;
 844
 845                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE,
 846                                                   (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
 847
 848                         if ( wait_for_completion )
 849                                 entryp->flags |= AIO_WAITING; /* flag for special completion processing */
 850                         if ( disable_notification )
 851                                 entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
 852                         if ( aiocbp != NULL ) {
 853                                 AIO_UNLOCK;
 854                                 return( result );
 855                         }
 856                 }
 857         }
 858
 859         /*
 860          * if we didn't find any matches on the todo or active queues then look for a
 861          * match on our queue of async IO requests that have completed and if found
 862          * return AIO_ALLDONE result.
 863          */
 864         if ( result == -1 ) {
 865                 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
 866                 if ( (aiocbp == NULL && fd == 0) ||
 867                          (aiocbp != NULL && entryp->uaiocbp == aiocbp) ||
 868                          (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) {
 869                                 result = AIO_ALLDONE;
 870
 871                                 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE,
 872                                                           (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
 873
 874                                 if ( aiocbp != NULL ) {
 875                                         AIO_UNLOCK;
 876                                         return( result );
 877                                 }
 878                         }
 879                 }
 880         }
 881         AIO_UNLOCK;
 882
 883         return( result );
 884
 885 } /* do_aio_cancel */
 886
 887
 888 /*
 889  * aio_suspend - suspend the calling thread until at least one of the async
 890  * IO operations referenced by uap->aiocblist has completed, until a signal
 891  * interrupts the function, or uap->timeoutp time interval (optional) has
 892  * passed.
 893  * Returns 0 if one or more async IOs have completed else -1 and errno is
 894  * set appropriately - EAGAIN if timeout elapses or EINTR if an interrupt
 895  * woke us up.
 896  */
 897
 898 int
 899 aio_suspend( struct proc *p, struct aio_suspend_args *uap, int *retval )
 900 {
 901         int                                     error;
 902         int                                     i, count;
 903         uint64_t                        abstime;
 904         struct timespec         ts;
 905         struct timeval          tv;
 906         aio_workq_entry         *entryp;
 907         struct aiocb *          *aiocbpp;
 908
 909         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START,
 910                           (int)p, uap->nent, 0, 0, 0 );
 911
 912         *retval = -1;
 913         abstime = 0;
 914         aiocbpp = NULL;
 915
 916         /* quick check to see if there are any async IO requests queued up */
 917         AIO_LOCK;
 918         count = aio_get_all_queues_count( );
 919         AIO_UNLOCK;
 920         if ( count < 1 ) {
 921                 error = EINVAL;
 922                 goto ExitThisRoutine;
 923         }
 924
 925         if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
 926                 error = EINVAL;
 927                 goto ExitThisRoutine;
 928         }
 929
 930         if ( uap->timeoutp != NULL ) {
 931                 error = copyin( (void *)uap->timeoutp, &ts, sizeof(ts) );
 932                 if ( error != 0 ) {
 933                         error = EAGAIN;
 934                         goto ExitThisRoutine;
 935                 }
 936
 937                 if ( ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000 ) {
 938                         error = EINVAL;
 939                         goto ExitThisRoutine;
 940                 }
 941
 942                 nanoseconds_to_absolutetime( (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
 943                                                                          &abstime );
 944                 clock_absolutetime_interval_to_deadline( abstime, &abstime );
 945         }
 946
 947         MALLOC( aiocbpp, void *, (uap->nent * sizeof(struct aiocb *)), M_TEMP, M_WAITOK );
 948         if ( aiocbpp == NULL ) {
 949                 error = EAGAIN;
 950                 goto ExitThisRoutine;
 951         }
 952
 953         /* check list of aio requests to see if any have completed */
 954         for ( i = 0; i < uap->nent; i++ ) {
 955                 struct aiocb    *aiocbp;
 956
 957                 /* copyin in aiocb pointer from list */
 958                 error = copyin( (void *)(uap->aiocblist + i), (aiocbpp + i), sizeof(aiocbp) );
 959                 if ( error != 0 ) {
 960                         error = EAGAIN;
 961                         goto ExitThisRoutine;
 962                 }
 963
 964                 /* NULL elements are legal so check for 'em */
 965                 aiocbp = *(aiocbpp + i);
 966                 if ( aiocbp == NULL )
 967                         continue;
 968
 969                 /* return immediately if any aio request in the list is done */
 970                 AIO_LOCK;
 971                 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
 972                         if ( entryp->uaiocbp == aiocbp ) {
 973                                 *retval = 0;
 974                                 error = 0;
 975                                 AIO_UNLOCK;
 976                                 goto ExitThisRoutine;
 977                         }
 978                 }
 979                 AIO_UNLOCK;
 980         } /* for ( ; i < uap->nent; ) */
 981
 982         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE,
 983                           (int)p, uap->nent, 0, 0, 0 );
 984
 985         /*
 986          * wait for an async IO to complete or a signal fires or timeout expires.
 987          * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
 988          * interrupts us.  If an async IO completes before a signal fires or our
 989          * timeout expires, we get a wakeup call from aio_work_thread().  We do not
 990          * use tsleep() here in order to avoid getting kernel funnel lock.
 991          */
 992         assert_wait( (event_t) &p->AIO_SUSPEND_SLEEP_CHAN, THREAD_ABORTSAFE );
 993         if ( abstime > 0 ) {
 994                 thread_set_timer_deadline( abstime );
 995         }
 996         error = thread_block( THREAD_CONTINUE_NULL );
 997         if ( error == THREAD_AWAKENED ) {
 998                 /* got our wakeup call from aio_work_thread() */
 999                 if ( abstime > 0 ) {
1000                         thread_cancel_timer();
1001                 }
1002                 *retval = 0;
1003                 error = 0;
1004         }
1005         else if ( error == THREAD_TIMED_OUT ) {
1006                 /* our timeout expired */
1007                 error = EAGAIN;
1008         }
1009         else {
1010                 /* we were interrupted */
1011                 if ( abstime > 0 ) {
1012                         thread_cancel_timer();
1013                 }
1014                 error = EINTR;
1015         }
1016
1017 ExitThisRoutine:
1018         if ( aiocbpp != NULL )
1019                 FREE( aiocbpp, M_TEMP );
1020
1021         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_END,
1022                           (int)p, uap->nent, error, 0, 0 );
1023
1024         return( error );
1025
1026 } /* aio_suspend */
1027
1028
1029 /* aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
1030  * file descriptor (uap->aiocbp->aio_fildes) from the buffer
1031  * (uap->aiocbp->aio_buf).
1032  */
1033
1034 int
1035 aio_write( struct proc *p, struct aio_write_args *uap, int *retval )
1036 {
1037         int                     error;
1038
1039         *retval = 0;
1040
1041         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_START,
1042                           (int)p, (int)uap->aiocbp, 0, 0, 0 );
1043
1044         error = aio_queue_async_request( p, uap->aiocbp, AIO_WRITE );
1045         if ( error != 0 )
1046                 *retval = -1;
1047
1048         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_END,
1049                           (int)p, (int)uap->aiocbp, error, 0, 0 );
1050
1051         return( error );
1052
1053 } /* aio_write */
1054
1055
1056 /*
1057  * lio_listio - initiate a list of IO requests.  We process the list of aiocbs
1058  * either synchronously (mode == LIO_WAIT) or asynchronously (mode == LIO_NOWAIT).
1059  * The caller gets error and return status for each aiocb in the list via aio_error
1060  * and aio_return.  We must keep completed requests until released by the
1061  * aio_return call.
1062  */
1063
1064 int
1065 lio_listio( struct proc *p, struct lio_listio_args *uap, int *retval )
1066 {
1067         int                                                     i;
1068         int                                                     call_result;
1069         int                                                     result;
1070         long                                            group_tag;
1071         aio_workq_entry *                       *entryp_listp;
1072
1073         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START,
1074                           (int)p, uap->nent, uap->mode, 0, 0 );
1075
1076         entryp_listp = NULL;
1077         call_result = -1;
1078         *retval = -1;
1079         if ( !(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT) ) {
1080                 call_result = EINVAL;
1081                 goto ExitRoutine;
1082         }
1083
1084         if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
1085                 call_result = EINVAL;
1086                 goto ExitRoutine;
1087         }
1088
1089         /*
1090          * we use group_tag to mark IO requests for delayed completion processing
1091          * which means we wait until all IO requests in the group have completed
1092          * before we either return to the caller when mode is LIO_WAIT or signal
1093          * user when mode is LIO_NOWAIT.
1094          */
1095         group_tag = random();
1096
1097         /*
1098          * allocate a list of aio_workq_entry pointers that we will use to queue
1099          * up all our requests at once while holding our lock.
1100          */
1101         MALLOC( entryp_listp, void *, (uap->nent * sizeof(struct aiocb *)), M_TEMP, M_WAITOK );
1102         if ( entryp_listp == NULL ) {
1103                 call_result = EAGAIN;
1104                 goto ExitRoutine;
1105         }
1106
1107         /* process list of aio requests */
1108         for ( i = 0; i < uap->nent; i++ ) {
1109                 struct aiocb    *my_aiocbp;
1110
1111                 *(entryp_listp + i) = NULL;
1112
1113                 /* copyin in aiocb pointer from list */
1114                 result = copyin( (void *)(uap->aiocblist + i), &my_aiocbp, sizeof(my_aiocbp) );
1115                 if ( result != 0 ) {
1116                         call_result = EAGAIN;
1117                         continue;
1118                 }
1119
1120                 /* NULL elements are legal so check for 'em */
1121                 if ( my_aiocbp == NULL )
1122                         continue;
1123
1124                 if ( uap->mode == LIO_NOWAIT )
1125                         result = lio_create_async_entry( p, my_aiocbp, uap->sigp,
1126                                                                                          group_tag, (entryp_listp + i) );
1127                 else
1128                         result = lio_create_sync_entry( p, my_aiocbp, group_tag,
1129                                                                                         (entryp_listp + i) );
1130
1131                 if ( result != 0 && call_result == -1 )
1132                         call_result = result;
1133         }
1134
1135         /*
1136          * we need to protect this section since we do not want any of these grouped
1137          * IO requests to begin until we have them all on the queue.
1138          */
1139         AIO_LOCK;
1140         for ( i = 0; i < uap->nent; i++ ) {
1141                 aio_workq_entry                         *entryp;
1142
1143                 /* NULL elements are legal so check for 'em */
1144                 entryp = *(entryp_listp + i);
1145                 if ( entryp == NULL )
1146                         continue;
1147
1148                 /* check our aio limits to throttle bad or rude user land behavior */
1149                 if ( aio_get_all_queues_count( ) >= aio_max_requests ||
1150                          aio_get_process_count( entryp->procp ) >= aio_max_requests_per_process ||
1151                          is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
1152                         vm_map_t                my_map;
1153
1154                         my_map = entryp->aio_map;
1155                         entryp->aio_map = VM_MAP_NULL;
1156                         result = EAGAIN;
1157                         AIO_UNLOCK;
1158                         aio_free_request( entryp, my_map );
1159                         AIO_LOCK;
1160                         continue;
1161                 }
1162
1163                 /* place the request on the appropriate queue */
1164                 if ( uap->mode == LIO_NOWAIT ) {
1165                         TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1166                         aio_anchor.aio_async_workq_count++;
1167
1168                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
1169                                           (int)p, (int)entryp->uaiocbp, 0, 0, 0 );
1170                 }
1171                 else {
1172                         TAILQ_INSERT_TAIL( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
1173                         aio_anchor.lio_sync_workq_count++;
1174                 }
1175         }
1176         AIO_UNLOCK;
1177
1178         if ( uap->mode == LIO_NOWAIT )
1179                 /* caller does not want to wait so we'll fire off a worker thread and return */
1180                 wakeup_one( &aio_anchor.aio_async_workq );
1181         else {
1182                 aio_workq_entry                 *entryp;
1183                 int                                     error;
1184
1185                 /*
1186                  * mode is LIO_WAIT - handle the IO requests now.
1187                  */
1188                 AIO_LOCK;
1189                 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
1190                 while ( entryp != NULL ) {
1191                         if ( p == entryp->procp && group_tag == entryp->group_tag ) {
1192                                 boolean_t       funnel_state;
1193
1194                                 TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
1195                                 aio_anchor.lio_sync_workq_count--;
1196                                 AIO_UNLOCK;
1197
1198                                 // file system IO code path requires kernel funnel lock
1199                                 funnel_state = thread_funnel_set( kernel_flock, TRUE );
1200                                 if ( (entryp->flags & AIO_READ) != 0 ) {
1201                                         error = do_aio_read( entryp );
1202                                 }
1203                                 else if ( (entryp->flags & AIO_WRITE) != 0 ) {
1204                                         error = do_aio_write( entryp );
1205                                 }
1206                                 else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1207                                         error = do_aio_fsync( entryp );
1208                                 }
1209                                 else {
1210                                         printf( "%s - unknown aio request - flags 0x%02X \n",
1211                                                         __FUNCTION__, entryp->flags );
1212                                         error = EINVAL;
1213                                 }
1214                                 entryp->errorval = error;
1215                                 if ( error != 0 && call_result == -1 )
1216                                         call_result = EIO;
1217                                 (void) thread_funnel_set( kernel_flock, funnel_state );
1218
1219                                 AIO_LOCK;
1220                                 /* we're done with the IO request so move it on the done queue */
1221                                 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
1222                                 aio_anchor.aio_done_count++;
1223                                 p->aio_done_count++;
1224
1225                                 /* need to start over since lio_sync_workq may have been changed while we */
1226                                 /* were away doing the IO.  */
1227                                 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
1228                                 continue;
1229                         } /* p == entryp->procp */
1230
1231                         entryp = TAILQ_NEXT( entryp, aio_workq_link );
1232         } /* while ( entryp != NULL ) */
1233                 AIO_UNLOCK;
1234         } /* uap->mode == LIO_WAIT */
1235
1236         /* call_result == -1 means we had no trouble queueing up requests */
1237         if ( call_result == -1 ) {
1238                 call_result = 0;
1239                 *retval = 0;
1240         }
1241
1242 ExitRoutine:
1243         if ( entryp_listp != NULL )
1244                 FREE( entryp_listp, M_TEMP );
1245
1246         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END,
1247                           (int)p, call_result, 0, 0, 0 );
1248
1249         return( call_result );
1250
1251 } /* lio_listio */
1252
1253
1254 /*
1255  * aio worker thread.  this is where all the real work gets done.
1256  * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
1257  * after new work is queued up.
1258  */
1259
1260 static void
1261 aio_work_thread( void )
1262 {
1263         aio_workq_entry                 *entryp;
1264         struct uthread                  *uthread = (struct uthread *)get_bsdthread_info(current_act());
1265
1266         for( ;; ) {
1267                 entryp = aio_get_some_work();
1268         if ( entryp == NULL ) {
1269                 /*
1270                  * aio worker threads wait for some work to get queued up
1271                  * by aio_queue_async_request.  Once some work gets queued
1272                  * it will wake up one of these worker threads just before
1273                  * returning to our caller in user land.   We do not use
1274                          * tsleep() here in order to avoid getting kernel funnel lock.
1275                  */
1276                         assert_wait( (event_t) &aio_anchor.aio_async_workq, THREAD_UNINT );
1277                         thread_block( THREAD_CONTINUE_NULL );
1278
1279                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_wake)) | DBG_FUNC_NONE,
1280                                                   0, 0, 0, 0, 0 );
1281         }
1282                 else {
1283                         int                     error;
1284                         boolean_t               funnel_state;
1285                         vm_map_t                currentmap;
1286                         vm_map_t                oldmap = VM_MAP_NULL;
1287                         task_t                  oldaiotask = TASK_NULL;
1288
1289                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START,
1290                                                   (int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0 );
1291
1292                         /*
1293                          * Assume the target's address space identity for the duration
1294                          * of the IO.
1295                          */
1296                         funnel_state = thread_funnel_set( kernel_flock, TRUE );
1297
1298                         currentmap = get_task_map( (current_proc())->task );
1299                         if ( currentmap != entryp->aio_map ) {
1300                                 oldaiotask = uthread->uu_aio_task;
1301                                 uthread->uu_aio_task = entryp->procp->task;
1302                                 oldmap = vm_map_switch( entryp->aio_map );
1303                         }
1304
1305                         if ( (entryp->flags & AIO_READ) != 0 ) {
1306                                 error = do_aio_read( entryp );
1307                         }
1308                         else if ( (entryp->flags & AIO_WRITE) != 0 ) {
1309                                 error = do_aio_write( entryp );
1310                         }
1311                         else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1312                                 error = do_aio_fsync( entryp );
1313                         }
1314                         else {
1315                                 printf( "%s - unknown aio request - flags 0x%02X \n",
1316                                                 __FUNCTION__, entryp->flags );
1317                                 error = EINVAL;
1318                         }
1319                         entryp->errorval = error;
1320                         if ( currentmap != entryp->aio_map ) {
1321                                 (void) vm_map_switch( oldmap );
1322                                 uthread->uu_aio_task = oldaiotask;
1323                         }
1324
1325                         /* we're done with the IO request so pop it off the active queue and */
1326                         /* push it on the done queue */
1327                         AIO_LOCK;
1328                         TAILQ_REMOVE( &entryp->procp->aio_activeq, entryp, aio_workq_link );
1329                         aio_anchor.aio_active_count--;
1330                         entryp->procp->aio_active_count--;
1331                         TAILQ_INSERT_TAIL( &entryp->procp->aio_doneq, entryp, aio_workq_link );
1332                         aio_anchor.aio_done_count++;
1333                         entryp->procp->aio_done_count++;
1334                         entryp->flags |= AIO_COMPLETION;
1335
1336                         /* remove our reference to the user land map. */
1337                         if ( VM_MAP_NULL != entryp->aio_map ) {
1338                                 vm_map_t                my_map;
1339
1340                                 my_map = entryp->aio_map;
1341                                 entryp->aio_map = VM_MAP_NULL;
1342                                 AIO_UNLOCK;  /* must unlock before calling vm_map_deallocate() */
1343                                 vm_map_deallocate( my_map );
1344                         }
1345                         else {
1346                                 AIO_UNLOCK;
1347                         }
1348
1349                         do_aio_completion( entryp );
1350                         (void) thread_funnel_set( kernel_flock, funnel_state );
1351
1352                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END,
1353                                                   (int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval,
1354                                                   entryp->returnval, 0 );
1355
1356                         AIO_LOCK;
1357                         entryp->flags &= ~AIO_COMPLETION;
1358                         if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
1359                                 vm_map_t                my_map;
1360
1361                                 my_map = entryp->aio_map;
1362                                 entryp->aio_map = VM_MAP_NULL;
1363                                 AIO_UNLOCK;
1364                                 aio_free_request( entryp, my_map );
1365                         }
1366                         else
1367                                 AIO_UNLOCK;
1368                 }
1369         } /* for ( ;; ) */
1370
1371         /* NOT REACHED */
1372
1373 } /* aio_work_thread */
1374
1375
1376 /*
1377  * aio_get_some_work - get the next async IO request that is ready to be executed.
1378  * aio_fsync complicates matters a bit since we cannot do the fsync until all async
1379  * IO requests at the time the aio_fsync call came in have completed.
1380  */
1381
1382 static aio_workq_entry *
1383 aio_get_some_work( void )
1384 {
1385         aio_workq_entry                         *entryp;
1386         int                                                     skip_count = 0;
1387
1388         /* pop some work off the work queue and add to our active queue */
1389         AIO_LOCK;
1390         for ( entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
1391                   entryp != NULL;
1392                   entryp = TAILQ_NEXT( entryp, aio_workq_link ) ) {
1393
1394                 if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1395                         /* leave aio_fsync calls on the work queue if there are IO */
1396                         /* requests on the active queue for the same file descriptor. */
1397                         if ( aio_delay_fsync_request( entryp ) ) {
1398
1399                                 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) | DBG_FUNC_NONE,
1400                                                           (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1401                                 continue;
1402                         }
1403                 }
1404                 break;
1405         }
1406
1407         if ( entryp != NULL ) {
1408                 TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1409                 aio_anchor.aio_async_workq_count--;
1410                 TAILQ_INSERT_TAIL( &entryp->procp->aio_activeq, entryp, aio_workq_link );
1411                 aio_anchor.aio_active_count++;
1412                 entryp->procp->aio_active_count++;
1413         }
1414         AIO_UNLOCK;
1415
1416         return( entryp );
1417
1418 } /* aio_get_some_work */
1419
1420
1421 /*
1422  * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed at
1423  * this time.  Delay will happen when there are any active IOs for the same file
1424  * descriptor that were queued at time the aio_sync call was queued.
1425  * NOTE - AIO_LOCK must be held by caller
1426  */
1427 static boolean_t
1428 aio_delay_fsync_request( aio_workq_entry *entryp )
1429 {
1430         aio_workq_entry                 *my_entryp;
1431
1432         TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1433                 if ( my_entryp->fsyncp != NULL &&
1434                          entryp->uaiocbp == my_entryp->fsyncp &&
1435                          entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1436                         return( TRUE );
1437                 }
1438         }
1439
1440         return( FALSE );
1441
1442 } /* aio_delay_fsync_request */
1443
1444
1445 /*
1446  * aio_queue_async_request - queue up an async IO request on our work queue then
1447  * wake up one of our worker threads to do the actual work.  We get a reference
1448  * to our caller's user land map in order to keep it around while we are
1449  * processing the request.
1450  */
1451
1452 static int
1453 aio_queue_async_request( struct proc *procp, struct aiocb *aiocbp, int kindOfIO )
1454 {
1455         aio_workq_entry                 *entryp;
1456         int                                             result;
1457
1458         entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1459         if ( entryp == NULL ) {
1460                 result = EAGAIN;
1461                 goto error_exit;
1462         }
1463         bzero( entryp, sizeof(*entryp) );
1464
1465         /* fill in the rest of the aio_workq_entry */
1466         entryp->procp = procp;
1467         entryp->uaiocbp = aiocbp;
1468         entryp->flags |= kindOfIO;
1469         entryp->aio_map = VM_MAP_NULL;
1470         result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1471         if ( result != 0 ) {
1472                 result = EAGAIN;
1473                 goto error_exit;
1474         }
1475
1476         /* do some more validation on the aiocb and embedded file descriptor */
1477         result = aio_validate( entryp );
1478         if ( result != 0 )
1479                 goto error_exit;
1480
1481         /* get a reference to the user land map in order to keep it around */
1482         entryp->aio_map = get_task_map( procp->task );
1483         vm_map_reference( entryp->aio_map );
1484
1485         AIO_LOCK;
1486
1487         if ( is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
1488                 AIO_UNLOCK;
1489                 result = EAGAIN;
1490                 goto error_exit;
1491         }
1492
1493         /* check our aio limits to throttle bad or rude user land behavior */
1494         if ( aio_get_all_queues_count( ) >= aio_max_requests ||
1495                  aio_get_process_count( procp ) >= aio_max_requests_per_process ) {
1496                 AIO_UNLOCK;
1497                 result = EAGAIN;
1498                 goto error_exit;
1499         }
1500
1501         /*
1502          * aio_fsync calls sync up all async IO requests queued at the time
1503          * the aio_fsync call was made.  So we mark each currently queued async
1504          * IO with a matching file descriptor as must complete before we do the
1505          * fsync.  We set the fsyncp field of each matching async IO
1506          * request with the aiocb pointer passed in on the aio_fsync call to
1507          * know which IOs must complete before we process the aio_fsync call.
1508          */
1509         if ( (kindOfIO & AIO_FSYNC) != 0 )
1510                 aio_mark_requests( entryp );
1511
1512         /* queue up on our aio asynchronous work queue */
1513         TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1514         aio_anchor.aio_async_workq_count++;
1515
1516         AIO_UNLOCK;
1517
1518         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
1519                           (int)procp, (int)aiocbp, 0, 0, 0 );
1520
1521         wakeup_one( &aio_anchor.aio_async_workq );
1522
1523         return( 0 );
1524
1525 error_exit:
1526         if ( entryp != NULL ) {
1527                 /* this entry has not been queued up so no worries about unlocked */
1528                 /* state and aio_map */
1529                 aio_free_request( entryp, entryp->aio_map );
1530         }
1531
1532         return( result );
1533
1534 } /* aio_queue_async_request */
1535
1536
1537 /*
1538  * lio_create_async_entry - allocate an aio_workq_entry and fill it in.
1539  * If all goes well return 0 and pass the aio_workq_entry pointer back to
1540  * our caller.  We get a reference to our caller's user land map in order to keep
1541  * it around while we are processing the request.
1542  * lio_listio calls behave differently at completion they do completion notification
1543  * when all async IO requests have completed.  We use group_tag to tag IO requests
1544  * that behave in the delay notification manner.
1545  */
1546
1547 static int
1548 lio_create_async_entry( struct proc *procp, struct aiocb *aiocbp,
1549                                                  struct sigevent *sigp, long group_tag,
1550                                                  aio_workq_entry **entrypp )
1551 {
1552         aio_workq_entry                         *entryp;
1553         int                                                     result;
1554
1555         entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1556         if ( entryp == NULL ) {
1557                 result = EAGAIN;
1558                 goto error_exit;
1559         }
1560         bzero( entryp, sizeof(*entryp) );
1561
1562         /* fill in the rest of the aio_workq_entry */
1563         entryp->procp = procp;
1564         entryp->uaiocbp = aiocbp;
1565         entryp->flags |= AIO_LIO;
1566         entryp->group_tag = group_tag;
1567         entryp->aio_map = VM_MAP_NULL;
1568         result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1569         if ( result != 0 ) {
1570                 result = EAGAIN;
1571                 goto error_exit;
1572         }
1573
1574         /* look for lio_listio LIO_NOP requests and ignore them. */
1575         /* Not really an error, but we need to free our aio_workq_entry.  */
1576         if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
1577                 result = 0;
1578                 goto error_exit;
1579         }
1580
1581         /* use sigevent passed in to lio_listio for each of our calls, but only */
1582         /* do completion notification after the last request completes. */
1583         if ( sigp != NULL ) {
1584                 result = copyin( sigp, &entryp->aiocb.aio_sigevent, sizeof(entryp->aiocb.aio_sigevent) );
1585                 if ( result != 0 ) {
1586                         result = EAGAIN;
1587                         goto error_exit;
1588                 }
1589         }
1590
1591         /* do some more validation on the aiocb and embedded file descriptor */
1592         result = aio_validate( entryp );
1593         if ( result != 0 )
1594                 goto error_exit;
1595
1596         /* get a reference to the user land map in order to keep it around */
1597         entryp->aio_map = get_task_map( procp->task );
1598         vm_map_reference( entryp->aio_map );
1599
1600         *entrypp = entryp;
1601         return( 0 );
1602
1603 error_exit:
1604         if ( entryp != NULL )
1605                 zfree( aio_workq_zonep, (vm_offset_t) entryp );
1606
1607         return( result );
1608
1609 } /* lio_create_async_entry */
1610
1611
1612 /*
1613  * aio_mark_requests - aio_fsync calls synchronize file data for all queued async IO
1614  * requests at the moment the aio_fsync call is queued.  We use aio_workq_entry.fsyncp
1615  * to mark each async IO that must complete before the fsync is done.  We use the uaiocbp
1616  * field from the aio_fsync call as the aio_workq_entry.fsyncp in marked requests.
1617  * NOTE - AIO_LOCK must be held by caller
1618  */
1619
1620 static void
1621 aio_mark_requests( aio_workq_entry *entryp )
1622 {
1623         aio_workq_entry                 *my_entryp;
1624
1625         TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1626                 if ( entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1627                         my_entryp->fsyncp = entryp->uaiocbp;
1628                 }
1629         }
1630
1631         TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
1632                 if ( entryp->procp == my_entryp->procp &&
1633                          entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1634                         my_entryp->fsyncp = entryp->uaiocbp;
1635                 }
1636         }
1637
1638 } /* aio_mark_requests */
1639
1640
1641 /*
1642  * lio_create_sync_entry - allocate an aio_workq_entry and fill it in.
1643  * If all goes well return 0 and pass the aio_workq_entry pointer back to
1644  * our caller.
1645  * lio_listio calls behave differently at completion they do completion notification
1646  * when all async IO requests have completed.  We use group_tag to tag IO requests
1647  * that behave in the delay notification manner.
1648  */
1649
1650 static int
1651 lio_create_sync_entry( struct proc *procp, struct aiocb *aiocbp,
1652                                                 long group_tag, aio_workq_entry **entrypp )
1653 {
1654         aio_workq_entry                         *entryp;
1655         int                                                     result;
1656
1657         entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1658         if ( entryp == NULL ) {
1659                 result = EAGAIN;
1660                 goto error_exit;
1661         }
1662         bzero( entryp, sizeof(*entryp) );
1663
1664         /* fill in the rest of the aio_workq_entry */
1665         entryp->procp = procp;
1666         entryp->uaiocbp = aiocbp;
1667         entryp->flags |= AIO_LIO;
1668         entryp->group_tag = group_tag;
1669         entryp->aio_map = VM_MAP_NULL;
1670         result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1671         if ( result != 0 ) {
1672                 result = EAGAIN;
1673                 goto error_exit;
1674         }
1675
1676         /* look for lio_listio LIO_NOP requests and ignore them. */
1677         /* Not really an error, but we need to free our aio_workq_entry.  */
1678         if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
1679                 result = 0;
1680                 goto error_exit;
1681         }
1682
1683         result = aio_validate( entryp );
1684         if ( result != 0 ) {
1685                 goto error_exit;
1686         }
1687
1688         *entrypp = entryp;
1689         return( 0 );
1690
1691 error_exit:
1692         if ( entryp != NULL )
1693                 zfree( aio_workq_zonep, (vm_offset_t) entryp );
1694
1695         return( result );
1696
1697 } /* lio_create_sync_entry */
1698
1699
1700 /*
1701  * aio_free_request - remove our reference on the user land map and
1702  * free the work queue entry resources.
1703  * We are not holding the lock here thus aio_map is passed in and
1704  * zeroed while we did have the lock.
1705  */
1706
1707 static int
1708 aio_free_request( aio_workq_entry *entryp, vm_map_t the_map )
1709 {
1710         /* remove our reference to the user land map. */
1711         if ( VM_MAP_NULL != the_map ) {
1712                 vm_map_deallocate( the_map );
1713         }
1714
1715         zfree( aio_workq_zonep, (vm_offset_t) entryp );
1716
1717         return( 0 );
1718
1719 } /* aio_free_request */
1720
1721
1722 /* aio_validate - validate the aiocb passed in by one of the aio syscalls.
1723  */
1724
1725 static int
1726 aio_validate( aio_workq_entry *entryp )
1727 {
1728         boolean_t                                       funnel_state;
1729         struct file                             *fp;
1730         int                                                     flag;
1731         int                                                     result;
1732
1733         result = 0;
1734
1735         if ( (entryp->flags & AIO_LIO) != 0 ) {
1736                 if ( entryp->aiocb.aio_lio_opcode == LIO_READ )
1737                         entryp->flags |= AIO_READ;
1738                 else if ( entryp->aiocb.aio_lio_opcode == LIO_WRITE )
1739                         entryp->flags |= AIO_WRITE;
1740                 else if ( entryp->aiocb.aio_lio_opcode == LIO_NOP )
1741                         return( 0 );
1742                 else
1743                         return( EINVAL );
1744         }
1745
1746         flag = FREAD;
1747         if ( (entryp->flags & (AIO_WRITE | AIO_FSYNC)) != 0 ) {
1748                 flag = FWRITE;
1749         }
1750
1751         if ( (entryp->flags & (AIO_READ | AIO_WRITE)) != 0 ) {
1752                 if ( entryp->aiocb.aio_offset < 0                       ||
1753                          entryp->aiocb.aio_nbytes < 0                   ||
1754                          entryp->aiocb.aio_nbytes > INT_MAX     ||
1755                          entryp->aiocb.aio_buf == NULL )
1756                         return( EINVAL );
1757         }
1758
1759         /* validate aiocb.aio_sigevent.  at this point we only support sigev_notify
1760          * equal to SIGEV_SIGNAL or SIGEV_NONE.  this means sigev_value,
1761          * sigev_notify_function, and sigev_notify_attributes are ignored.
1762          */
1763         if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ) {
1764                 int             signum;
1765                 /* make sure we have a valid signal number */
1766                 signum = entryp->aiocb.aio_sigevent.sigev_signo;
1767                 if ( signum <= 0 || signum >= NSIG ||
1768                          signum == SIGKILL || signum == SIGSTOP )
1769                         return (EINVAL);
1770         }
1771         else if ( entryp->aiocb.aio_sigevent.sigev_notify != SIGEV_NONE )
1772                 return (EINVAL);
1773
1774         /* validate the file descriptor and that the file was opened
1775          * for the appropriate read / write access.  This section requires
1776          * kernel funnel lock.
1777          */
1778         funnel_state = thread_funnel_set( kernel_flock, TRUE );
1779
1780         result = fdgetf( entryp->procp, entryp->aiocb.aio_fildes, &fp );
1781         if ( result == 0 ) {
1782                 if ( (fp->f_flag & flag) == 0 ) {
1783                         /* we don't have read or write access */
1784                         result = EBADF;
1785                 }
1786                 else if ( fp->f_type != DTYPE_VNODE ) {
1787                         /* this is not a file */
1788                         result = ESPIPE;
1789                 }
1790         }
1791         else {
1792                 result = EBADF;
1793         }
1794
1795         (void) thread_funnel_set( kernel_flock, funnel_state );
1796
1797         return( result );
1798
1799 } /* aio_validate */
1800
1801
1802 /*
1803  * aio_get_process_count - runs through our queues that hold outstanding
1804  * async IO reqests and totals up number of requests for the given
1805  * process.
1806  * NOTE - caller must hold aio lock!
1807  */
1808
1809 static int
1810 aio_get_process_count( struct proc *procp )
1811 {
1812         aio_workq_entry                         *entryp;
1813         int                                                     error;
1814         int                                                     count;
1815
1816         /* begin with count of completed async IO requests for this process */
1817         count = procp->aio_done_count;
1818
1819         /* add in count of active async IO requests for this process */
1820         count += procp->aio_active_count;
1821
1822         /* look for matches on our queue of asynchronous todo work */
1823         TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
1824                 if ( procp == entryp->procp ) {
1825                         count++;
1826                 }
1827         }
1828
1829         /* look for matches on our queue of synchronous todo work */
1830         TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
1831                 if ( procp == entryp->procp ) {
1832                         count++;
1833                 }
1834         }
1835
1836         return( count );
1837
1838 } /* aio_get_process_count */
1839
1840
1841 /*
1842  * aio_get_all_queues_count - get total number of entries on all aio work queues.
1843  * NOTE - caller must hold aio lock!
1844  */
1845
1846 static int
1847 aio_get_all_queues_count( void )
1848 {
1849         int                                                     count;
1850
1851         count = aio_anchor.aio_async_workq_count;
1852         count += aio_anchor.lio_sync_workq_count;
1853         count += aio_anchor.aio_active_count;
1854         count += aio_anchor.aio_done_count;
1855
1856         return( count );
1857
1858 } /* aio_get_all_queues_count */
1859
1860
1861 /*
1862  * do_aio_completion.  Handle async IO completion.
1863  */
1864
1865 static void
1866 do_aio_completion( aio_workq_entry *entryp )
1867 {
1868         /* signal user land process if appropriate */
1869         if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
1870                  (entryp->flags & AIO_DISABLE) == 0 ) {
1871
1872                 /*
1873                  * if group_tag is non zero then make sure this is the last IO request
1874                  * in the group before we signal.
1875                  */
1876                 if ( entryp->group_tag == 0 ||
1877                          (entryp->group_tag != 0 && aio_last_group_io( entryp )) ) {
1878                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) | DBG_FUNC_NONE,
1879                                                   (int)entryp->procp, (int)entryp->uaiocbp,
1880                                                   entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 );
1881
1882                         psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );
1883                         return;
1884                 }
1885         }
1886
1887         /*
1888          * need to handle case where a process is trying to exit, exec, or close
1889          * and is currently waiting for active aio requests to complete.  If
1890          * AIO_WAITING is set then we need to look to see if there are any
1891          * other requests in the active queue for this process.  If there are
1892          * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel.  If
1893          * there are some still active then do nothing - we only want to wakeup
1894          * when all active aio requests for the process are complete.
1895          */
1896         if ( (entryp->flags & AIO_WAITING) != 0 ) {
1897                 int             active_requests;
1898
1899                 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
1900                                           (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1901
1902                 AIO_LOCK;
1903                 active_requests = aio_active_requests_for_process( entryp->procp );
1904                 AIO_UNLOCK;
1905                 if ( active_requests < 1 ) {
1906                         /* no active aio requests for this process, continue exiting */
1907
1908                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
1909                                                   (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1910
1911                         wakeup_one( &entryp->procp->AIO_CLEANUP_SLEEP_CHAN );
1912                 }
1913                 return;
1914         }
1915
1916         /*
1917          * aio_suspend case when a signal was not requested.  In that scenario we
1918          * are sleeping on the AIO_SUSPEND_SLEEP_CHAN channel.
1919          * NOTE - the assumption here is that this wakeup call is inexpensive.
1920          * we really only need to do this when an aio_suspend call is pending.
1921          * If we find the wakeup call should be avoided we could mark the
1922          * async IO requests given in the list provided by aio_suspend and only
1923          * call wakeup for them.  If we do mark them we should unmark them after
1924          * the aio_suspend wakes up.
1925          */
1926         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE,
1927                                   (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1928
1929         wakeup_one( &entryp->procp->AIO_SUSPEND_SLEEP_CHAN );
1930
1931         return;
1932
1933 } /* do_aio_completion */
1934
1935
1936 /*
1937  * aio_last_group_io - checks to see if this is the last unfinished IO request
1938  * for the given group_tag.  Returns TRUE if there are no other active IO
1939  * requests for this group or FALSE if the are active IO requests
1940  * NOTE - AIO_LOCK must be held by caller
1941  */
1942
1943 static boolean_t
1944 aio_last_group_io( aio_workq_entry *entryp )
1945 {
1946         aio_workq_entry                         *my_entryp;
1947
1948         /* look for matches on our queue of active async IO requests */
1949         TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1950                 if ( my_entryp->group_tag == entryp->group_tag )
1951                         return( FALSE );
1952         }
1953
1954         /* look for matches on our queue of asynchronous todo work */
1955         TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
1956                 if ( my_entryp->group_tag == entryp->group_tag )
1957                         return( FALSE );
1958         }
1959
1960         /* look for matches on our queue of synchronous todo work */
1961         TAILQ_FOREACH( my_entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
1962                 if ( my_entryp->group_tag == entryp->group_tag )
1963                         return( FALSE );
1964         }
1965
1966         return( TRUE );
1967
1968 } /* aio_last_group_io */
1969
1970
1971 /*
1972  * do_aio_read
1973  */
1974 static int
1975 do_aio_read( aio_workq_entry *entryp )
1976 {
1977         struct file                     *fp;
1978         int                                             error;
1979
1980         fp = holdfp( entryp->procp->p_fd, entryp->aiocb.aio_fildes, FREAD );
1981         if ( fp != NULL ) {
1982                 error = dofileread( entryp->procp, fp, entryp->aiocb.aio_fildes,
1983                                                         (void *)entryp->aiocb.aio_buf,
1984                                                         entryp->aiocb.aio_nbytes,
1985                                                         entryp->aiocb.aio_offset, FOF_OFFSET,
1986                                                         &entryp->returnval );
1987                 frele( fp );
1988         }
1989         else
1990                 error = EBADF;
1991
1992         return( error );
1993
1994 } /* do_aio_read */
1995
1996
1997 /*
1998  * do_aio_write
1999  */
2000 static int
2001 do_aio_write( aio_workq_entry *entryp )
2002 {
2003         struct file                     *fp;
2004         int                                             error;
2005
2006         fp = holdfp( entryp->procp->p_fd, entryp->aiocb.aio_fildes, FWRITE );
2007         if ( fp != NULL ) {
2008                 error = dofilewrite( entryp->procp, fp, entryp->aiocb.aio_fildes,
2009                                                          (const void *)entryp->aiocb.aio_buf,
2010                                                          entryp->aiocb.aio_nbytes,
2011                                                          entryp->aiocb.aio_offset, FOF_OFFSET,
2012                                                          &entryp->returnval );
2013                 frele( fp );
2014         }
2015         else
2016                 error = EBADF;
2017
2018         return( error );
2019
2020 } /* do_aio_write */
2021
2022
2023 /*
2024  * aio_active_requests_for_process - return number of active async IO
2025  * requests for the given process.
2026  * NOTE - caller must hold aio lock!
2027  */
2028
2029 static int
2030 aio_active_requests_for_process( struct proc *procp )
2031 {
2032
2033         return( procp->aio_active_count );
2034
2035 } /* aio_active_requests_for_process */
2036
2037
2038 /*
2039  * do_aio_fsync
2040  */
2041 static int
2042 do_aio_fsync( aio_workq_entry *entryp )
2043 {
2044         register struct vnode   *vp;
2045         struct file                     *fp;
2046         int                                             error;
2047
2048         /*
2049          * NOTE - we will not support AIO_DSYNC until fdatasync() is supported.
2050          * AIO_DSYNC is caught before we queue up a request and flagged as an error.
2051          * The following was shamelessly extracted from fsync() implementation.
2052          */
2053         error = getvnode( entryp->procp, entryp->aiocb.aio_fildes, &fp );
2054         if ( error == 0 ) {
2055                 vp = (struct vnode *)fp->f_data;
2056                 vn_lock( vp, LK_EXCLUSIVE | LK_RETRY, entryp->procp );
2057                 error = VOP_FSYNC( vp, fp->f_cred, MNT_WAIT, entryp->procp );
2058                 VOP_UNLOCK( vp, 0, entryp->procp );
2059         }
2060         if ( error != 0 )
2061                 entryp->returnval = -1;
2062
2063         return( error );
2064
2065 } /* do_aio_fsync */
2066
2067
2068 /*
2069  * is_already_queued - runs through our queues to see if the given
2070  * aiocbp / process is there.  Returns TRUE if there is a match
2071  * on any of our aio queues.
2072  * NOTE - callers must hold aio lock!
2073  */
2074
2075 static boolean_t
2076 is_already_queued(      struct proc *procp,
2077                                         struct aiocb *aiocbp )
2078 {
2079         aio_workq_entry                 *entryp;
2080         boolean_t                               result;
2081
2082         result = FALSE;
2083
2084         /* look for matches on our queue of async IO requests that have completed */
2085         TAILQ_FOREACH( entryp, &procp->aio_doneq, aio_workq_link ) {
2086                 if ( aiocbp == entryp->uaiocbp ) {
2087                         result = TRUE;
2088                         goto ExitThisRoutine;
2089                 }
2090         }
2091
2092         /* look for matches on our queue of active async IO requests */
2093         TAILQ_FOREACH( entryp, &procp->aio_activeq, aio_workq_link ) {
2094                 if ( aiocbp == entryp->uaiocbp ) {
2095                         result = TRUE;
2096                         goto ExitThisRoutine;
2097                 }
2098         }
2099
2100         /* look for matches on our queue of asynchronous todo work */
2101         TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
2102                 if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
2103                         result = TRUE;
2104                         goto ExitThisRoutine;
2105                 }
2106         }
2107
2108         /* look for matches on our queue of synchronous todo work */
2109         TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
2110                 if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
2111                         result = TRUE;
2112                         goto ExitThisRoutine;
2113                 }
2114         }
2115
2116 ExitThisRoutine:
2117         return( result );
2118
2119 } /* is_already_queued */
2120
2121
2122 /*
2123  * aio initialization
2124  */
2125 __private_extern__ void
2126 aio_init( void )
2127 {
2128         int                     i;
2129
2130         simple_lock_init( &aio_lock );
2131
2132         AIO_LOCK;
2133         TAILQ_INIT( &aio_anchor.aio_async_workq );
2134         TAILQ_INIT( &aio_anchor.lio_sync_workq );
2135         aio_anchor.aio_async_workq_count = 0;
2136         aio_anchor.lio_sync_workq_count = 0;
2137         aio_anchor.aio_active_count = 0;
2138         aio_anchor.aio_done_count = 0;
2139         AIO_UNLOCK;
2140
2141         i = sizeof( aio_workq_entry );
2142         aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );
2143
2144         _aio_create_worker_threads( aio_worker_threads );
2145
2146         return;
2147
2148 } /* aio_init */
2149
2150
2151 /*
2152  * aio worker threads created here.
2153  */
2154 __private_extern__ void
2155 _aio_create_worker_threads( int num )
2156 {
2157         int                     i;
2158
2159         /* create some worker threads to handle the async IO requests */
2160         for ( i = 0; i < num; i++ ) {
2161                 thread_t                myThread;
2162
2163                 myThread = kernel_thread( kernel_task, aio_work_thread );
2164                 if ( THREAD_NULL == myThread ) {
2165                         printf( "%s - failed to create a work thread \n", __FUNCTION__ );
2166                 }
2167         }
2168
2169         return;
2170
2171 } /* _aio_create_worker_threads */
2172
2173 /*
2174  * Return the current activation utask
2175  */
2176 task_t
2177 get_aiotask(void)
2178 {
2179         return  ((struct uthread *)get_bsdthread_info(current_act()))->uu_aio_task;
2180 }