bsd/kern/kern_aio.c

   1 /*
   2  * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * Copyright (c) 1999-2003 Apple Computer, Inc.  All Rights Reserved.
   7  *
   8  * This file contains Original Code and/or Modifications of Original Code
   9  * as defined in and that are subject to the Apple Public Source License
  10  * Version 2.0 (the 'License'). You may not use this file except in
  11  * compliance with the License. Please obtain a copy of the License at
  12  * http://www.opensource.apple.com/apsl/ and read it before using this
  13  * file.
  14  *
  15  * The Original Code and all software distributed under the License are
  16  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  17  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  18  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  20  * Please see the License for the specific language governing rights and
  21  * limitations under the License.
  22  *
  23  * @APPLE_LICENSE_HEADER_END@
  24  */
  25
  26
  27 /*
  28  * todo:
  29  *              1) ramesh is looking into how to replace taking a reference on
  30  *                      the user's map (vm_map_reference()) since it is believed that
  31  *                      would not hold the process for us.
  32  *              2) david is looking into a way for us to set the priority of the
  33  *                      worker threads to match that of the user's thread when the
  34  *                      async IO was queued.
  35  */
  36
  37
  38 /*
  39  * This file contains support for the POSIX 1003.1B AIO/LIO facility.
  40  */
  41
  42 #include <sys/systm.h>
  43 #include <sys/buf.h>
  44 #include <sys/fcntl.h>
  45 #include <sys/file.h>
  46 #include <sys/filedesc.h>
  47 #include <sys/kernel.h>
  48 #include <sys/vnode.h>
  49 #include <sys/malloc.h>
  50 #include <sys/mount.h>
  51 #include <sys/param.h>
  52 #include <sys/proc.h>
  53 #include <sys/sysctl.h>
  54 #include <sys/unistd.h>
  55 #include <sys/user.h>
  56
  57 #include <sys/aio_kern.h>
  58
  59 #include <machine/limits.h>
  60 #include <kern/zalloc.h>
  61 #include <kern/task.h>
  62
  63 #include <sys/kdebug.h>
  64 #define AIO_work_queued                                 1
  65 #define AIO_worker_wake                                 2
  66 #define AIO_completion_sig                              3
  67 #define AIO_completion_cleanup_wait             4
  68 #define AIO_completion_cleanup_wake             5
  69 #define AIO_completion_suspend_wake     6
  70 #define AIO_fsync_delay                                 7
  71 #define AIO_cancel                                              10
  72 #define AIO_cancel_async_workq                  11
  73 #define AIO_cancel_sync_workq                   12
  74 #define AIO_cancel_activeq                              13
  75 #define AIO_cancel_doneq                                14
  76 #define AIO_fsync                                               20
  77 #define AIO_read                                                30
  78 #define AIO_write                                               40
  79 #define AIO_listio                                              50
  80 #define AIO_error                                               60
  81 #define AIO_error_val                                   61
  82 #define AIO_error_activeq                               62
  83 #define AIO_error_workq                                 63
  84 #define AIO_return                                              70
  85 #define AIO_return_val                                  71
  86 #define AIO_return_activeq                              72
  87 #define AIO_return_workq                                73
  88 #define AIO_exec                                                80
  89 #define AIO_exit                                                90
  90 #define AIO_exit_sleep                                  91
  91 #define AIO_close                                               100
  92 #define AIO_close_sleep                                 101
  93 #define AIO_suspend                                             110
  94 #define AIO_suspend_sleep                               111
  95 #define AIO_worker_thread                               120
  96
  97 #if 0
  98 #undef KERNEL_DEBUG
  99 #define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
 100 #endif
 101
 102 /*
 103  * aio requests queue up on the aio_async_workq or lio_sync_workq (for
 104  * lio_listio LIO_WAIT).  Requests then move to the per process aio_activeq
 105  * (proc.aio_activeq) when one of our worker threads start the IO.
 106  * And finally, requests move to the per process aio_doneq (proc.aio_doneq)
 107  * when the IO request completes.  The request remains on aio_doneq until
 108  * user process calls aio_return or the process exits, either way that is our
 109  * trigger to release aio resources.
 110  */
 111 struct aio_anchor_cb
 112 {
 113         int                                                                     aio_async_workq_count;  /* entries on aio_async_workq */
 114         int                                                                     lio_sync_workq_count;   /* entries on lio_sync_workq */
 115         int                                                                     aio_active_count;       /* entries on all active queues (proc.aio_activeq) */
 116         int                                                                     aio_done_count;         /* entries on all done queues (proc.aio_doneq) */
 117         TAILQ_HEAD( , aio_workq_entry )         aio_async_workq;
 118         TAILQ_HEAD( , aio_workq_entry )         lio_sync_workq;
 119 };
 120 typedef struct aio_anchor_cb aio_anchor_cb;
 121
 122
 123 /*
 124  * Notes on aio sleep / wake channels.
 125  * We currently pick a couple fields within the proc structure that will allow
 126  * us sleep channels that currently do not collide with any other kernel routines.
 127  * At this time, for binary compatibility reasons, we cannot create new proc fields.
 128  */
 129 #define AIO_SUSPEND_SLEEP_CHAN  p_estcpu
 130 #define AIO_CLEANUP_SLEEP_CHAN  p_pctcpu
 131
 132
 133 /*
 134  * aysnc IO locking macros used to protect critical sections.
 135  */
 136 #define AIO_LOCK        usimple_lock( &aio_lock )
 137 #define AIO_UNLOCK      usimple_unlock( &aio_lock )
 138
 139
 140 /*
 141  *  LOCAL PROTOTYPES
 142  */
 143 static int                      aio_active_requests_for_process( struct proc *procp );
 144 static boolean_t        aio_delay_fsync_request( aio_workq_entry *entryp );
 145 static int                      aio_free_request( aio_workq_entry *entryp, vm_map_t the_map );
 146 static int                      aio_get_all_queues_count( void );
 147 static int                      aio_get_process_count( struct proc *procp );
 148 static aio_workq_entry *  aio_get_some_work( void );
 149 static boolean_t        aio_last_group_io( aio_workq_entry *entryp );
 150 static void                     aio_mark_requests( aio_workq_entry *entryp );
 151 static int                      aio_queue_async_request( struct proc *procp,
 152                                                                                          struct aiocb *aiocbp,
 153                                                                                          int kindOfIO );
 154 static int                      aio_validate( aio_workq_entry *entryp );
 155 static void                     aio_work_thread( void );
 156 static int                      do_aio_cancel(  struct proc *p,
 157                                                                         int fd,
 158                                                                         struct aiocb *aiocbp,
 159                                                                         boolean_t wait_for_completion,
 160                                                                         boolean_t disable_notification );
 161 static void                     do_aio_completion( aio_workq_entry *entryp );
 162 static int                      do_aio_fsync( aio_workq_entry *entryp );
 163 static int                      do_aio_read( aio_workq_entry *entryp );
 164 static int                      do_aio_write( aio_workq_entry *entryp );
 165 static boolean_t        is_already_queued(      struct proc *procp,
 166                                                                                 struct aiocb *aiocbp );
 167 static int                      lio_create_async_entry( struct proc *procp,
 168                                                                                          struct aiocb *aiocbp,
 169                                                                                          struct sigevent *sigp,
 170                                                                                          long group_tag,
 171                                                                                          aio_workq_entry **entrypp );
 172 static int                      lio_create_sync_entry( struct proc *procp,
 173                                                                                         struct aiocb *aiocbp,
 174                                                                                         long group_tag,
 175                                                                                         aio_workq_entry **entrypp );
 176
 177 /*
 178  *  EXTERNAL PROTOTYPES
 179  */
 180
 181 /* in ...bsd/kern/sys_generic.c */
 182 extern struct file*     holdfp( struct filedesc* fdp, int fd, int flag );
 183 extern int                      dofileread( struct proc *p, struct file *fp, int fd,
 184                                                                 void *buf, size_t nbyte, off_t offset,
 185                                                                 int flags, int *retval );
 186 extern int                      dofilewrite( struct proc *p, struct file *fp, int fd,
 187                                                                  const void *buf, size_t nbyte, off_t offset,
 188                                                                  int flags, int *retval );
 189 extern vm_map_t         vm_map_switch( vm_map_t    map );
 190
 191
 192 /*
 193  * aio external global variables.
 194  */
 195 extern int aio_max_requests;                            /* AIO_MAX - configurable */
 196 extern int aio_max_requests_per_process;        /* AIO_PROCESS_MAX - configurable */
 197 extern int aio_worker_threads;                          /* AIO_THREAD_COUNT - configurable */
 198
 199
 200 /*
 201  * aio static variables.
 202  */
 203 static aio_anchor_cb            aio_anchor;
 204 static simple_lock_data_t       aio_lock;
 205 static struct zone              *aio_workq_zonep;
 206
 207
 208 /*
 209  * syscall input parameters
 210  */
 211 #ifndef _SYS_SYSPROTO_H_
 212
 213 struct  aio_cancel_args {
 214         int                             fd;
 215         struct aiocb    *aiocbp;
 216 };
 217
 218 struct  aio_error_args {
 219         struct aiocb                    *aiocbp;
 220 };
 221
 222 struct  aio_fsync_args {
 223         int                                             op;
 224         struct aiocb                    *aiocbp;
 225 };
 226
 227 struct  aio_read_args {
 228         struct aiocb                    *aiocbp;
 229 };
 230
 231 struct  aio_return_args {
 232         struct aiocb    *aiocbp;
 233 };
 234
 235 struct  aio_suspend_args {
 236         struct aiocb *const     *aiocblist;
 237         int                                             nent;
 238         const struct timespec   *timeoutp;
 239 };
 240
 241 struct  aio_write_args {
 242         struct aiocb                    *aiocbp;
 243 };
 244
 245 struct  lio_listio_args {
 246         int                                             mode;
 247         struct aiocb *const     *aiocblist;
 248         int                                             nent;
 249         struct sigevent                 *sigp;
 250 };
 251
 252 #endif /* _SYS_SYSPROTO_H_ */
 253
 254
 255 /*
 256  * aio_cancel - attempt to cancel one or more async IO requests currently
 257  * outstanding against file descriptor uap->fd.  If uap->aiocbp is not
 258  * NULL then only one specific IO is cancelled (if possible).  If uap->aiocbp
 259  * is NULL then all outstanding async IO request for the given file
 260  * descriptor are cancelled (if possible).
 261  */
 262
 263 int
 264 aio_cancel( struct proc *p, struct aio_cancel_args *uap, int *retval )
 265 {
 266         struct aiocb                            my_aiocb;
 267         int                                                     result;
 268         boolean_t                                       funnel_state;
 269
 270         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START,
 271                           (int)p, (int)uap->aiocbp, 0, 0, 0 );
 272
 273         /* quick check to see if there are any async IO requests queued up */
 274         AIO_LOCK;
 275         result = aio_get_all_queues_count( );
 276         AIO_UNLOCK;
 277         if ( result < 1 ) {
 278                 result = EBADF;
 279                 goto ExitRoutine;
 280         }
 281
 282         *retval = -1;
 283         if ( uap->aiocbp != NULL ) {
 284                 result = copyin( uap->aiocbp, &my_aiocb, sizeof(my_aiocb) );
 285                 if ( result != 0 ) {
 286                         result = EAGAIN;
 287                         goto ExitRoutine;
 288                 }
 289
 290                 /* NOTE - POSIX standard says a mismatch between the file */
 291                 /* descriptor passed in and the file descriptor embedded in */
 292                 /* the aiocb causes unspecified results.  We return EBADF in */
 293                 /* that situation.  */
 294                 if ( uap->fd != my_aiocb.aio_fildes ) {
 295                         result = EBADF;
 296                         goto ExitRoutine;
 297                 }
 298         }
 299
 300         /* current BSD code assumes funnel lock is held */
 301         funnel_state = thread_funnel_set( kernel_flock, TRUE );
 302         result = do_aio_cancel( p, uap->fd, uap->aiocbp, FALSE, FALSE );
 303         (void) thread_funnel_set( kernel_flock, funnel_state );
 304
 305         if ( result != -1 ) {
 306                 *retval = result;
 307                 result = 0;
 308                 goto ExitRoutine;
 309         }
 310
 311         result = EBADF;
 312
 313 ExitRoutine:
 314         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END,
 315                           (int)p, (int)uap->aiocbp, result, 0, 0 );
 316
 317         return( result );
 318
 319 } /* aio_cancel */
 320
 321
 322 /*
 323  * _aio_close - internal function used to clean up async IO requests for
 324  * a file descriptor that is closing.
 325  * NOTE - kernel funnel lock is held when we get called.
 326  * THIS MAY BLOCK.
 327  */
 328
 329 __private_extern__ void
 330 _aio_close( struct proc *p, int fd )
 331 {
 332         int                     error, count;
 333
 334         /* quick check to see if there are any async IO requests queued up */
 335         AIO_LOCK;
 336         count = aio_get_all_queues_count( );
 337         AIO_UNLOCK;
 338         if ( count < 1 )
 339                 return;
 340
 341         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_START,
 342                           (int)p, fd, 0, 0, 0 );
 343
 344         /* cancel all async IO requests on our todo queues for this file descriptor */
 345         error = do_aio_cancel( p, fd, NULL, TRUE, FALSE );
 346         if ( error == AIO_NOTCANCELED ) {
 347                 /*
 348                  * AIO_NOTCANCELED is returned when we find an aio request for this process
 349                  * and file descriptor on the active async IO queue.  Active requests cannot
 350                  * be cancelled so we must wait for them to complete.  We will get a special
 351                  * wake up call on our channel used to sleep for ALL active requests to
 352                  * complete.  This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
 353                  * when we must wait for all active aio requests.
 354                  */
 355
 356                 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) | DBG_FUNC_NONE,
 357                                   (int)p, fd, 0, 0, 0 );
 358
 359                 tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_close", 0 );
 360         }
 361
 362         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_END,
 363                           (int)p, fd, 0, 0, 0 );
 364
 365         return;
 366
 367 } /* _aio_close */
 368
 369
 370 /*
 371  * aio_error - return the error status associated with the async IO
 372  * request referred to by uap->aiocbp.  The error status is the errno
 373  * value that would be set by the corresponding IO request (read, wrtie,
 374  * fdatasync, or sync).
 375  */
 376
 377 int
 378 aio_error( struct proc *p, struct aio_error_args *uap, int *retval )
 379 {
 380         aio_workq_entry                         *entryp;
 381         int                                                     error;
 382
 383         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_START,
 384                           (int)p, (int)uap->aiocbp, 0, 0, 0 );
 385
 386         AIO_LOCK;
 387
 388         /* quick check to see if there are any async IO requests queued up */
 389         if ( aio_get_all_queues_count( ) < 1 ) {
 390                 error = EINVAL;
 391                 goto ExitRoutine;
 392         }
 393
 394         /* look for a match on our queue of async IO requests that have completed */
 395         TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
 396                 if ( entryp->uaiocbp == uap->aiocbp ) {
 397                         *retval = entryp->errorval;
 398                         error = 0;
 399                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) | DBG_FUNC_NONE,
 400                                            (int)p, (int)uap->aiocbp, *retval, 0, 0 );
 401                         goto ExitRoutine;
 402                 }
 403         }
 404
 405         /* look for a match on our queue of active async IO requests */
 406         TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
 407                 if ( entryp->uaiocbp == uap->aiocbp ) {
 408                         *retval = EINPROGRESS;
 409                         error = 0;
 410                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) | DBG_FUNC_NONE,
 411                                            (int)p, (int)uap->aiocbp, *retval, 0, 0 );
 412                         goto ExitRoutine;
 413                 }
 414         }
 415
 416         /* look for a match on our queue of todo work */
 417         TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
 418                 if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
 419                         *retval = EINPROGRESS;
 420                         error = 0;
 421                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_workq)) | DBG_FUNC_NONE,
 422                                            (int)p, (int)uap->aiocbp, *retval, 0, 0 );
 423                         goto ExitRoutine;
 424                 }
 425         }
 426         error = EINVAL;
 427
 428 ExitRoutine:
 429         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_END,
 430                           (int)p, (int)uap->aiocbp, error, 0, 0 );
 431         AIO_UNLOCK;
 432
 433         return( error );
 434
 435 } /* aio_error */
 436
 437
 438 /*
 439  * aio_fsync - asynchronously force all IO operations associated
 440  * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
 441  * queued at the time of the call to the synchronized completion state.
 442  * NOTE - we do not support op O_DSYNC at this point since we do not support the
 443  * fdatasync() call.
 444  */
 445
 446 int
 447 aio_fsync( struct proc *p, struct aio_fsync_args *uap, int *retval )
 448 {
 449         int                     error;
 450         int                     fsync_kind;
 451
 452         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_START,
 453                           (int)p, (int)uap->aiocbp, uap->op, 0, 0 );
 454
 455         *retval = 0;
 456         if ( uap->op == O_SYNC )
 457                 fsync_kind = AIO_FSYNC;
 458 #if 0 // we don't support fdatasync() call yet
 459         else if ( uap->op == O_DSYNC )
 460                 fsync_kind = AIO_DSYNC;
 461 #endif
 462         else {
 463                 *retval = -1;
 464                 error = EINVAL;
 465                 goto ExitRoutine;
 466         }
 467
 468         error = aio_queue_async_request( p, uap->aiocbp, fsync_kind );
 469         if ( error != 0 )
 470                 *retval = -1;
 471
 472 ExitRoutine:
 473         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_END,
 474                           (int)p, (int)uap->aiocbp, error, 0, 0 );
 475
 476         return( error );
 477
 478 } /* aio_fsync */
 479
 480
 481 /* aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
 482  * file descriptor (uap->aiocbp->aio_fildes) into the buffer
 483  * (uap->aiocbp->aio_buf).
 484  */
 485
 486 int
 487 aio_read( struct proc *p, struct aio_read_args *uap, int *retval )
 488 {
 489         int                     error;
 490
 491         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_START,
 492                           (int)p, (int)uap->aiocbp, 0, 0, 0 );
 493
 494         *retval = 0;
 495
 496         error = aio_queue_async_request( p, uap->aiocbp, AIO_READ );
 497         if ( error != 0 )
 498                 *retval = -1;
 499
 500         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_END,
 501                           (int)p, (int)uap->aiocbp, error, 0, 0 );
 502
 503         return( error );
 504
 505 } /* aio_read */
 506
 507
 508 /*
 509  * aio_return - return the return status associated with the async IO
 510  * request referred to by uap->aiocbp.  The return status is the value
 511  * that would be returned by corresponding IO request (read, wrtie,
 512  * fdatasync, or sync).  This is where we release kernel resources
 513  * held for async IO call associated with the given aiocb pointer.
 514  */
 515
 516 int
 517 aio_return( struct proc *p, struct aio_return_args *uap, register_t *retval )
 518 {
 519         aio_workq_entry                         *entryp;
 520         int                                                     error;
 521         boolean_t                                       lock_held;
 522
 523         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_START,
 524                           (int)p, (int)uap->aiocbp, 0, 0, 0 );
 525
 526         AIO_LOCK;
 527         lock_held = TRUE;
 528         *retval = 0;
 529
 530         /* quick check to see if there are any async IO requests queued up */
 531         if ( aio_get_all_queues_count( ) < 1 ) {
 532                 error = EINVAL;
 533                 goto ExitRoutine;
 534         }
 535
 536         /* look for a match on our queue of async IO requests that have completed */
 537         TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
 538                 if ( entryp->uaiocbp == uap->aiocbp ) {
 539                         TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
 540                         aio_anchor.aio_done_count--;
 541                         p->aio_done_count--;
 542
 543                         *retval = entryp->returnval;
 544
 545                         /* we cannot free requests that are still completing */
 546                         if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
 547                                 vm_map_t                my_map;
 548
 549                                 my_map = entryp->aio_map;
 550                                 entryp->aio_map = VM_MAP_NULL;
 551                                 AIO_UNLOCK;
 552                                 lock_held = FALSE;
 553                                 aio_free_request( entryp, my_map );
 554                         }
 555                         else
 556                                 /* tell completion code to free this request */
 557                                 entryp->flags |= AIO_DO_FREE;
 558                         error = 0;
 559                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) | DBG_FUNC_NONE,
 560                                            (int)p, (int)uap->aiocbp, *retval, 0, 0 );
 561                         goto ExitRoutine;
 562                 }
 563         }
 564
 565         /* look for a match on our queue of active async IO requests */
 566         TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
 567                 if ( entryp->uaiocbp == uap->aiocbp ) {
 568                         error = EINPROGRESS;
 569                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) | DBG_FUNC_NONE,
 570                                            (int)p, (int)uap->aiocbp, *retval, 0, 0 );
 571                         goto ExitRoutine;
 572                 }
 573         }
 574
 575         /* look for a match on our queue of todo work */
 576         TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
 577                 if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
 578                         error = EINPROGRESS;
 579                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_workq)) | DBG_FUNC_NONE,
 580                                            (int)p, (int)uap->aiocbp, *retval, 0, 0 );
 581                         goto ExitRoutine;
 582                 }
 583         }
 584         error = EINVAL;
 585
 586 ExitRoutine:
 587         if ( lock_held )
 588                 AIO_UNLOCK;
 589         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_END,
 590                           (int)p, (int)uap->aiocbp, error, 0, 0 );
 591
 592         return( error );
 593
 594 } /* aio_return */
 595
 596
 597 /*
 598  * _aio_exec - internal function used to clean up async IO requests for
 599  * a process that is going away due to exec().  We cancel any async IOs
 600  * we can and wait for those already active.  We also disable signaling
 601  * for cancelled or active aio requests that complete.
 602  * NOTE - kernel funnel lock is held when we get called.
 603  * This routine MAY block!
 604  */
 605
 606 __private_extern__ void
 607 _aio_exec( struct proc *p )
 608 {
 609
 610         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_START,
 611                           (int)p, 0, 0, 0, 0 );
 612
 613         _aio_exit( p );
 614
 615         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_END,
 616                           (int)p, 0, 0, 0, 0 );
 617
 618         return;
 619
 620 } /* _aio_exec */
 621
 622
 623 /*
 624  * _aio_exit - internal function used to clean up async IO requests for
 625  * a process that is terminating (via exit() or exec() ).  We cancel any async IOs
 626  * we can and wait for those already active.  We also disable signaling
 627  * for cancelled or active aio requests that complete.  This routine MAY block!
 628  * NOTE - kernel funnel lock is held when we get called.
 629  */
 630
 631 __private_extern__ void
 632 _aio_exit( struct proc *p )
 633 {
 634         int                                             error, count;
 635         aio_workq_entry                 *entryp;
 636
 637         /* quick check to see if there are any async IO requests queued up */
 638         AIO_LOCK;
 639         count = aio_get_all_queues_count( );
 640         AIO_UNLOCK;
 641         if ( count < 1 ) {
 642                 return;
 643         }
 644
 645         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_START,
 646                           (int)p, 0, 0, 0, 0 );
 647
 648         /*
 649          * cancel async IO requests on the todo work queue and wait for those
 650          * already active to complete.
 651          */
 652         error = do_aio_cancel( p, 0, NULL, TRUE, TRUE );
 653         if ( error == AIO_NOTCANCELED ) {
 654                 /*
 655                  * AIO_NOTCANCELED is returned when we find an aio request for this process
 656                  * on the active async IO queue.  Active requests cannot be cancelled so we
 657                  * must wait for them to complete.  We will get a special wake up call on
 658                  * our channel used to sleep for ALL active requests to complete.  This sleep
 659                  * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
 660                  * active aio requests.
 661                  */
 662
 663                 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) | DBG_FUNC_NONE,
 664                                   (int)p, 0, 0, 0, 0 );
 665
 666                 tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_exit", 0 );
 667         }
 668
 669         /* release all aio resources used by this process */
 670         AIO_LOCK;
 671         entryp = TAILQ_FIRST( &p->aio_doneq );
 672         while ( entryp != NULL ) {
 673                 aio_workq_entry                 *next_entryp;
 674
 675                 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
 676                 TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
 677                 aio_anchor.aio_done_count--;
 678                 p->aio_done_count--;
 679
 680                 /* we cannot free requests that are still completing */
 681                 if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
 682                         vm_map_t                my_map;
 683
 684                         my_map = entryp->aio_map;
 685                         entryp->aio_map = VM_MAP_NULL;
 686                         AIO_UNLOCK;
 687                         aio_free_request( entryp, my_map );
 688
 689                         /* need to start over since aio_doneq may have been */
 690                         /* changed while we were away.  */
 691                         AIO_LOCK;
 692                         entryp = TAILQ_FIRST( &p->aio_doneq );
 693                         continue;
 694                 }
 695                 else
 696                         /* tell completion code to free this request */
 697                         entryp->flags |= AIO_DO_FREE;
 698                 entryp = next_entryp;
 699         }
 700         AIO_UNLOCK;
 701
 702 ExitRoutine:
 703         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END,
 704                           (int)p, 0, 0, 0, 0 );
 705
 706         return;
 707
 708 } /* _aio_exit */
 709
 710
 711 /*
 712  * do_aio_cancel - cancel async IO requests (if possible).  We get called by
 713  * aio_cancel, close, and at exit.
 714  * There are three modes of operation: 1) cancel all async IOs for a process -
 715  * fd is 0 and aiocbp is NULL 2) cancel all async IOs for file descriptor - fd
 716  * is > 0 and aiocbp is NULL 3) cancel one async IO associated with the given
 717  * aiocbp.
 718  * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
 719  * target async IO requests, AIO_NOTCANCELED if we could not cancel all
 720  * target async IO requests, and AIO_ALLDONE if all target async IO requests
 721  * were already complete.
 722  * WARNING - do not deference aiocbp in this routine, it may point to user
 723  * land data that has not been copied in (when called from aio_cancel() )
 724  * NOTE - kernel funnel lock is held when we get called.
 725  */
 726
 727 static int
 728 do_aio_cancel(  struct proc *p, int fd, struct aiocb *aiocbp,
 729                                 boolean_t wait_for_completion, boolean_t disable_notification )
 730 {
 731         aio_workq_entry                 *entryp;
 732         int                                             result;
 733
 734         result = -1;
 735
 736         /* look for a match on our queue of async todo work. */
 737         AIO_LOCK;
 738         entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
 739         while ( entryp != NULL ) {
 740                 aio_workq_entry                 *next_entryp;
 741
 742                 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
 743                 if ( p == entryp->procp ) {
 744                         if ( (aiocbp == NULL && fd == 0) ||
 745                                  (aiocbp != NULL && entryp->uaiocbp == aiocbp) ||
 746                                  (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) {
 747                                 /* we found a match so we remove the entry from the */
 748                                 /* todo work queue and place it on the done queue */
 749                                 TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
 750                                 aio_anchor.aio_async_workq_count--;
 751                                 entryp->errorval = ECANCELED;
 752                                 entryp->returnval = -1;
 753                                 if ( disable_notification )
 754                                         entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
 755                                 result = AIO_CANCELED;
 756
 757                                 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) | DBG_FUNC_NONE,
 758                                                           (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
 759
 760                                 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
 761                                 aio_anchor.aio_done_count++;
 762                                 p->aio_done_count++;
 763                                 entryp->flags |= AIO_COMPLETION;
 764                                 AIO_UNLOCK;
 765
 766                                 /* do completion processing for this request */
 767                                 do_aio_completion( entryp );
 768
 769                                 AIO_LOCK;
 770                                 entryp->flags &= ~AIO_COMPLETION;
 771                                 if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
 772                                         vm_map_t                my_map;
 773
 774                                         my_map = entryp->aio_map;
 775                                         entryp->aio_map = VM_MAP_NULL;
 776                                         AIO_UNLOCK;
 777                                         aio_free_request( entryp, my_map );
 778                                 }
 779                                 else
 780                                         AIO_UNLOCK;
 781
 782                                 if ( aiocbp != NULL ) {
 783                                         return( result );
 784                                 }
 785
 786                                 /* need to start over since aio_async_workq may have been */
 787                                 /* changed while we were away doing completion processing.  */
 788                                 AIO_LOCK;
 789                                 entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
 790                                 continue;
 791                         }
 792                 }
 793                 entryp = next_entryp;
 794         } /* while... */
 795
 796         /*
 797          * look for a match on our queue of synchronous todo work.  This will
 798          * be a rare occurrence but could happen if a process is terminated while
 799          * processing a lio_listio call.
 800          */
 801         entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
 802         while ( entryp != NULL ) {
 803                 aio_workq_entry                 *next_entryp;
 804
 805                 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
 806                 if ( p == entryp->procp ) {
 807                         if ( (aiocbp == NULL && fd == 0) ||
 808                                  (aiocbp != NULL && entryp->uaiocbp == aiocbp) ||
 809                                  (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) {
 810                                 /* we found a match so we remove the entry from the */
 811                                 /* todo work queue and place it on the done queue */
 812                                 TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
 813                                 aio_anchor.lio_sync_workq_count--;
 814                                 entryp->errorval = ECANCELED;
 815                                 entryp->returnval = -1;
 816                                 if ( disable_notification )
 817                                         entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
 818                                 result = AIO_CANCELED;
 819
 820                                 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_sync_workq)) | DBG_FUNC_NONE,
 821                                                           (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
 822
 823                                 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
 824                                 aio_anchor.aio_done_count++;
 825                                 p->aio_done_count++;
 826                                 if ( aiocbp != NULL ) {
 827                                         AIO_UNLOCK;
 828                                         return( result );
 829                                 }
 830                         }
 831                 }
 832                 entryp = next_entryp;
 833         } /* while... */
 834
 835         /*
 836          * look for a match on our queue of active async IO requests and
 837          * return AIO_NOTCANCELED result.
 838          */
 839         TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
 840                 if ( (aiocbp == NULL && fd == 0) ||
 841                          (aiocbp != NULL && entryp->uaiocbp == aiocbp) ||
 842                          (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) {
 843                         result = AIO_NOTCANCELED;
 844
 845                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE,
 846                                                   (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
 847
 848                         if ( wait_for_completion )
 849                                 entryp->flags |= AIO_WAITING; /* flag for special completion processing */
 850                         if ( disable_notification )
 851                                 entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
 852                         if ( aiocbp != NULL ) {
 853                                 AIO_UNLOCK;
 854                                 return( result );
 855                         }
 856                 }
 857         }
 858
 859         /*
 860          * if we didn't find any matches on the todo or active queues then look for a
 861          * match on our queue of async IO requests that have completed and if found
 862          * return AIO_ALLDONE result.
 863          */
 864         if ( result == -1 ) {
 865                 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
 866                 if ( (aiocbp == NULL && fd == 0) ||
 867                          (aiocbp != NULL && entryp->uaiocbp == aiocbp) ||
 868                          (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) {
 869                                 result = AIO_ALLDONE;
 870
 871                                 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE,
 872                                                           (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
 873
 874                                 if ( aiocbp != NULL ) {
 875                                         AIO_UNLOCK;
 876                                         return( result );
 877                                 }
 878                         }
 879                 }
 880         }
 881         AIO_UNLOCK;
 882
 883         return( result );
 884
 885 } /* do_aio_cancel */
 886
 887
 888 /*
 889  * aio_suspend - suspend the calling thread until at least one of the async
 890  * IO operations referenced by uap->aiocblist has completed, until a signal
 891  * interrupts the function, or uap->timeoutp time interval (optional) has
 892  * passed.
 893  * Returns 0 if one or more async IOs have completed else -1 and errno is
 894  * set appropriately - EAGAIN if timeout elapses or EINTR if an interrupt
 895  * woke us up.
 896  */
 897
 898 int
 899 aio_suspend( struct proc *p, struct aio_suspend_args *uap, int *retval )
 900 {
 901         int                                     error;
 902         int                                     i, count;
 903         uint64_t                        abstime;
 904         struct timespec         ts;
 905         struct timeval          tv;
 906         aio_workq_entry         *entryp;
 907         struct aiocb *          *aiocbpp;
 908
 909         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START,
 910                           (int)p, uap->nent, 0, 0, 0 );
 911
 912         *retval = -1;
 913         abstime = 0;
 914         aiocbpp = NULL;
 915
 916         /* quick check to see if there are any async IO requests queued up */
 917         AIO_LOCK;
 918         count = aio_get_all_queues_count( );
 919         AIO_UNLOCK;
 920         if ( count < 1 ) {
 921                 error = EINVAL;
 922                 goto ExitThisRoutine;
 923         }
 924
 925         if ( uap->nent < 1 || uap->nent > aio_max_requests_per_process ) {
 926                 error = EINVAL;
 927                 goto ExitThisRoutine;
 928         }
 929
 930         if ( uap->timeoutp != NULL ) {
 931                 error = copyin( (void *)uap->timeoutp, &ts, sizeof(ts) );
 932                 if ( error != 0 ) {
 933                         error = EAGAIN;
 934                         goto ExitThisRoutine;
 935                 }
 936
 937                 if ( ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000 ) {
 938                         error = EINVAL;
 939                         goto ExitThisRoutine;
 940                 }
 941
 942                 nanoseconds_to_absolutetime( (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
 943                                                                          &abstime );
 944                 clock_absolutetime_interval_to_deadline( abstime, &abstime );
 945         }
 946
 947         MALLOC( aiocbpp, void *, (uap->nent * sizeof(struct aiocb *)), M_TEMP, M_WAITOK );
 948         if ( aiocbpp == NULL ) {
 949                 error = EAGAIN;
 950                 goto ExitThisRoutine;
 951         }
 952
 953         /* copyin our aiocb pointers from list */
 954         for ( i = 0; i < uap->nent; i++ ) {
 955                 struct aiocb    *aiocbp;
 956
 957                 /* copyin in aiocb pointer from list */
 958                 error = copyin( (void *)(uap->aiocblist + i), (aiocbpp + i), sizeof(*aiocbpp) );
 959                 if ( error != 0 ) {
 960                         error = EAGAIN;
 961                         goto ExitThisRoutine;
 962                 }
 963         } /* for ( ; i < uap->nent; ) */
 964
 965         /* check list of aio requests to see if any have completed */
 966         AIO_LOCK;
 967         for ( i = 0; i < uap->nent; i++ ) {
 968                 struct aiocb    *aiocbp;
 969
 970                 /* NULL elements are legal so check for 'em */
 971                 aiocbp = *(aiocbpp + i);
 972                 if ( aiocbp == NULL )
 973                         continue;
 974
 975                 /* return immediately if any aio request in the list is done */
 976                 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
 977                         if ( entryp->uaiocbp == aiocbp ) {
 978                                 *retval = 0;
 979                                 error = 0;
 980                                 AIO_UNLOCK;
 981                                 goto ExitThisRoutine;
 982                         }
 983                 }
 984         } /* for ( ; i < uap->nent; ) */
 985
 986         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE,
 987                           (int)p, uap->nent, 0, 0, 0 );
 988
 989         /*
 990          * wait for an async IO to complete or a signal fires or timeout expires.
 991          * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
 992          * interrupts us.  If an async IO completes before a signal fires or our
 993          * timeout expires, we get a wakeup call from aio_work_thread().  We do not
 994          * use tsleep() here in order to avoid getting kernel funnel lock.
 995          */
 996         assert_wait( (event_t) &p->AIO_SUSPEND_SLEEP_CHAN, THREAD_ABORTSAFE );
 997         AIO_UNLOCK;
 998
 999         if ( abstime > 0 ) {
1000                 thread_set_timer_deadline( abstime );
1001         }
1002         error = thread_block( THREAD_CONTINUE_NULL );
1003
1004         if ( error == THREAD_AWAKENED ) {
1005                 /* got our wakeup call from aio_work_thread() */
1006                 if ( abstime > 0 ) {
1007                         thread_cancel_timer();
1008                 }
1009                 *retval = 0;
1010                 error = 0;
1011         }
1012         else if ( error == THREAD_TIMED_OUT ) {
1013                 /* our timeout expired */
1014                 error = EAGAIN;
1015         }
1016         else {
1017                 /* we were interrupted */
1018                 if ( abstime > 0 ) {
1019                         thread_cancel_timer();
1020                 }
1021                 error = EINTR;
1022         }
1023
1024 ExitThisRoutine:
1025         if ( aiocbpp != NULL )
1026                 FREE( aiocbpp, M_TEMP );
1027
1028         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_END,
1029                           (int)p, uap->nent, error, 0, 0 );
1030
1031         return( error );
1032
1033 } /* aio_suspend */
1034
1035
1036 /* aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
1037  * file descriptor (uap->aiocbp->aio_fildes) from the buffer
1038  * (uap->aiocbp->aio_buf).
1039  */
1040
1041 int
1042 aio_write( struct proc *p, struct aio_write_args *uap, int *retval )
1043 {
1044         int                     error;
1045
1046         *retval = 0;
1047
1048         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_START,
1049                           (int)p, (int)uap->aiocbp, 0, 0, 0 );
1050
1051         error = aio_queue_async_request( p, uap->aiocbp, AIO_WRITE );
1052         if ( error != 0 )
1053                 *retval = -1;
1054
1055         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_END,
1056                           (int)p, (int)uap->aiocbp, error, 0, 0 );
1057
1058         return( error );
1059
1060 } /* aio_write */
1061
1062
1063 /*
1064  * lio_listio - initiate a list of IO requests.  We process the list of aiocbs
1065  * either synchronously (mode == LIO_WAIT) or asynchronously (mode == LIO_NOWAIT).
1066  * The caller gets error and return status for each aiocb in the list via aio_error
1067  * and aio_return.  We must keep completed requests until released by the
1068  * aio_return call.
1069  */
1070
1071 int
1072 lio_listio( struct proc *p, struct lio_listio_args *uap, int *retval )
1073 {
1074         int                                                     i;
1075         int                                                     call_result;
1076         int                                                     result;
1077         long                                            group_tag;
1078         aio_workq_entry *                       *entryp_listp;
1079
1080         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START,
1081                           (int)p, uap->nent, uap->mode, 0, 0 );
1082
1083         entryp_listp = NULL;
1084         call_result = -1;
1085         *retval = -1;
1086         if ( !(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT) ) {
1087                 call_result = EINVAL;
1088                 goto ExitRoutine;
1089         }
1090
1091         if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
1092                 call_result = EINVAL;
1093                 goto ExitRoutine;
1094         }
1095
1096         /*
1097          * we use group_tag to mark IO requests for delayed completion processing
1098          * which means we wait until all IO requests in the group have completed
1099          * before we either return to the caller when mode is LIO_WAIT or signal
1100          * user when mode is LIO_NOWAIT.
1101          */
1102         group_tag = random();
1103
1104         /*
1105          * allocate a list of aio_workq_entry pointers that we will use to queue
1106          * up all our requests at once while holding our lock.
1107          */
1108         MALLOC( entryp_listp, void *, (uap->nent * sizeof(struct aiocb *)), M_TEMP, M_WAITOK );
1109         if ( entryp_listp == NULL ) {
1110                 call_result = EAGAIN;
1111                 goto ExitRoutine;
1112         }
1113
1114         /* process list of aio requests */
1115         for ( i = 0; i < uap->nent; i++ ) {
1116                 struct aiocb    *my_aiocbp;
1117
1118                 *(entryp_listp + i) = NULL;
1119
1120                 /* copyin in aiocb pointer from list */
1121                 result = copyin( (void *)(uap->aiocblist + i), &my_aiocbp, sizeof(my_aiocbp) );
1122                 if ( result != 0 ) {
1123                         call_result = EAGAIN;
1124                         continue;
1125                 }
1126
1127                 /* NULL elements are legal so check for 'em */
1128                 if ( my_aiocbp == NULL )
1129                         continue;
1130
1131                 if ( uap->mode == LIO_NOWAIT )
1132                         result = lio_create_async_entry( p, my_aiocbp, uap->sigp,
1133                                                                                          group_tag, (entryp_listp + i) );
1134                 else
1135                         result = lio_create_sync_entry( p, my_aiocbp, group_tag,
1136                                                                                         (entryp_listp + i) );
1137
1138                 if ( result != 0 && call_result == -1 )
1139                         call_result = result;
1140         }
1141
1142         /*
1143          * we need to protect this section since we do not want any of these grouped
1144          * IO requests to begin until we have them all on the queue.
1145          */
1146         AIO_LOCK;
1147         for ( i = 0; i < uap->nent; i++ ) {
1148                 aio_workq_entry                         *entryp;
1149
1150                 /* NULL elements are legal so check for 'em */
1151                 entryp = *(entryp_listp + i);
1152                 if ( entryp == NULL )
1153                         continue;
1154
1155                 /* check our aio limits to throttle bad or rude user land behavior */
1156                 if ( aio_get_all_queues_count( ) >= aio_max_requests ||
1157                          aio_get_process_count( entryp->procp ) >= aio_max_requests_per_process ||
1158                          is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
1159                         vm_map_t                my_map;
1160
1161                         my_map = entryp->aio_map;
1162                         entryp->aio_map = VM_MAP_NULL;
1163                         result = EAGAIN;
1164                         AIO_UNLOCK;
1165                         aio_free_request( entryp, my_map );
1166                         AIO_LOCK;
1167                         continue;
1168                 }
1169
1170                 /* place the request on the appropriate queue */
1171                 if ( uap->mode == LIO_NOWAIT ) {
1172                         TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1173                         aio_anchor.aio_async_workq_count++;
1174
1175                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
1176                                           (int)p, (int)entryp->uaiocbp, 0, 0, 0 );
1177                 }
1178                 else {
1179                         TAILQ_INSERT_TAIL( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
1180                         aio_anchor.lio_sync_workq_count++;
1181                 }
1182         }
1183
1184         if ( uap->mode == LIO_NOWAIT ) {
1185                 /* caller does not want to wait so we'll fire off a worker thread and return */
1186                 wakeup_one( &aio_anchor.aio_async_workq );
1187         }
1188         else {
1189                 aio_workq_entry                 *entryp;
1190                 int                                     error;
1191
1192                 /*
1193                  * mode is LIO_WAIT - handle the IO requests now.
1194                  */
1195                 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
1196                 while ( entryp != NULL ) {
1197                         if ( p == entryp->procp && group_tag == entryp->group_tag ) {
1198                                 boolean_t       funnel_state;
1199
1200                                 TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
1201                                 aio_anchor.lio_sync_workq_count--;
1202                                 AIO_UNLOCK;
1203
1204                                 // file system IO code path requires kernel funnel lock
1205                                 funnel_state = thread_funnel_set( kernel_flock, TRUE );
1206                                 if ( (entryp->flags & AIO_READ) != 0 ) {
1207                                         error = do_aio_read( entryp );
1208                                 }
1209                                 else if ( (entryp->flags & AIO_WRITE) != 0 ) {
1210                                         error = do_aio_write( entryp );
1211                                 }
1212                                 else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1213                                         error = do_aio_fsync( entryp );
1214                                 }
1215                                 else {
1216                                         printf( "%s - unknown aio request - flags 0x%02X \n",
1217                                                         __FUNCTION__, entryp->flags );
1218                                         error = EINVAL;
1219                                 }
1220                                 entryp->errorval = error;
1221                                 if ( error != 0 && call_result == -1 )
1222                                         call_result = EIO;
1223                                 (void) thread_funnel_set( kernel_flock, funnel_state );
1224
1225                                 AIO_LOCK;
1226                                 /* we're done with the IO request so move it on the done queue */
1227                                 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
1228                                 aio_anchor.aio_done_count++;
1229                                 p->aio_done_count++;
1230
1231                                 /* need to start over since lio_sync_workq may have been changed while we */
1232                                 /* were away doing the IO.  */
1233                                 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
1234                                 continue;
1235                         } /* p == entryp->procp */
1236
1237                         entryp = TAILQ_NEXT( entryp, aio_workq_link );
1238         } /* while ( entryp != NULL ) */
1239         } /* uap->mode == LIO_WAIT */
1240         AIO_UNLOCK;
1241
1242         /* call_result == -1 means we had no trouble queueing up requests */
1243         if ( call_result == -1 ) {
1244                 call_result = 0;
1245                 *retval = 0;
1246         }
1247
1248 ExitRoutine:
1249         if ( entryp_listp != NULL )
1250                 FREE( entryp_listp, M_TEMP );
1251
1252         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END,
1253                           (int)p, call_result, 0, 0, 0 );
1254
1255         return( call_result );
1256
1257 } /* lio_listio */
1258
1259
1260 /*
1261  * aio worker thread.  this is where all the real work gets done.
1262  * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
1263  * after new work is queued up.
1264  */
1265
1266 static void
1267 aio_work_thread( void )
1268 {
1269         aio_workq_entry                 *entryp;
1270         struct uthread                  *uthread = (struct uthread *)get_bsdthread_info(current_act());
1271
1272         for( ;; ) {
1273                 AIO_LOCK;
1274                 entryp = aio_get_some_work();
1275         if ( entryp == NULL ) {
1276                 /*
1277                  * aio worker threads wait for some work to get queued up
1278                  * by aio_queue_async_request.  Once some work gets queued
1279                  * it will wake up one of these worker threads just before
1280                  * returning to our caller in user land.   We do not use
1281                          * tsleep() here in order to avoid getting kernel funnel lock.
1282                  */
1283                         assert_wait( (event_t) &aio_anchor.aio_async_workq, THREAD_UNINT );
1284                         AIO_UNLOCK;
1285                         thread_block( THREAD_CONTINUE_NULL );
1286
1287                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_wake)) | DBG_FUNC_NONE,
1288                                                   0, 0, 0, 0, 0 );
1289         }
1290                 else {
1291                         int                     error;
1292                         boolean_t               funnel_state;
1293                         vm_map_t                currentmap;
1294                         vm_map_t                oldmap = VM_MAP_NULL;
1295                         task_t                  oldaiotask = TASK_NULL;
1296
1297                         AIO_UNLOCK;
1298
1299                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START,
1300                                                   (int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0 );
1301
1302                         /*
1303                          * Assume the target's address space identity for the duration
1304                          * of the IO.
1305                          */
1306                         funnel_state = thread_funnel_set( kernel_flock, TRUE );
1307
1308                         currentmap = get_task_map( (current_proc())->task );
1309                         if ( currentmap != entryp->aio_map ) {
1310                                 oldaiotask = uthread->uu_aio_task;
1311                                 uthread->uu_aio_task = entryp->procp->task;
1312                                 oldmap = vm_map_switch( entryp->aio_map );
1313                         }
1314
1315                         if ( (entryp->flags & AIO_READ) != 0 ) {
1316                                 error = do_aio_read( entryp );
1317                         }
1318                         else if ( (entryp->flags & AIO_WRITE) != 0 ) {
1319                                 error = do_aio_write( entryp );
1320                         }
1321                         else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1322                                 error = do_aio_fsync( entryp );
1323                         }
1324                         else {
1325                                 printf( "%s - unknown aio request - flags 0x%02X \n",
1326                                                 __FUNCTION__, entryp->flags );
1327                                 error = EINVAL;
1328                         }
1329                         entryp->errorval = error;
1330                         if ( currentmap != entryp->aio_map ) {
1331                                 (void) vm_map_switch( oldmap );
1332                                 uthread->uu_aio_task = oldaiotask;
1333                         }
1334
1335                         /* we're done with the IO request so pop it off the active queue and */
1336                         /* push it on the done queue */
1337                         AIO_LOCK;
1338                         TAILQ_REMOVE( &entryp->procp->aio_activeq, entryp, aio_workq_link );
1339                         aio_anchor.aio_active_count--;
1340                         entryp->procp->aio_active_count--;
1341                         TAILQ_INSERT_TAIL( &entryp->procp->aio_doneq, entryp, aio_workq_link );
1342                         aio_anchor.aio_done_count++;
1343                         entryp->procp->aio_done_count++;
1344                         entryp->flags |= AIO_COMPLETION;
1345
1346                         /* remove our reference to the user land map. */
1347                         if ( VM_MAP_NULL != entryp->aio_map ) {
1348                                 vm_map_t                my_map;
1349
1350                                 my_map = entryp->aio_map;
1351                                 entryp->aio_map = VM_MAP_NULL;
1352                                 AIO_UNLOCK;  /* must unlock before calling vm_map_deallocate() */
1353                                 vm_map_deallocate( my_map );
1354                         }
1355                         else {
1356                                 AIO_UNLOCK;
1357                         }
1358
1359                         do_aio_completion( entryp );
1360                         (void) thread_funnel_set( kernel_flock, funnel_state );
1361
1362                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END,
1363                                                   (int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval,
1364                                                   entryp->returnval, 0 );
1365
1366                         AIO_LOCK;
1367                         entryp->flags &= ~AIO_COMPLETION;
1368                         if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
1369                                 vm_map_t                my_map;
1370
1371                                 my_map = entryp->aio_map;
1372                                 entryp->aio_map = VM_MAP_NULL;
1373                                 AIO_UNLOCK;
1374                                 aio_free_request( entryp, my_map );
1375                         }
1376                         else
1377                                 AIO_UNLOCK;
1378                 }
1379         } /* for ( ;; ) */
1380
1381         /* NOT REACHED */
1382
1383 } /* aio_work_thread */
1384
1385
1386 /*
1387  * aio_get_some_work - get the next async IO request that is ready to be executed.
1388  * aio_fsync complicates matters a bit since we cannot do the fsync until all async
1389  * IO requests at the time the aio_fsync call came in have completed.
1390  * NOTE - AIO_LOCK must be held by caller
1391  */
1392
1393 static aio_workq_entry *
1394 aio_get_some_work( void )
1395 {
1396         aio_workq_entry                         *entryp;
1397         int                                                     skip_count = 0;
1398
1399         /* pop some work off the work queue and add to our active queue */
1400         for ( entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
1401                   entryp != NULL;
1402                   entryp = TAILQ_NEXT( entryp, aio_workq_link ) ) {
1403
1404                 if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1405                         /* leave aio_fsync calls on the work queue if there are IO */
1406                         /* requests on the active queue for the same file descriptor. */
1407                         if ( aio_delay_fsync_request( entryp ) ) {
1408
1409                                 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) | DBG_FUNC_NONE,
1410                                                           (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1411                                 continue;
1412                         }
1413                 }
1414                 break;
1415         }
1416
1417         if ( entryp != NULL ) {
1418                 TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1419                 aio_anchor.aio_async_workq_count--;
1420                 TAILQ_INSERT_TAIL( &entryp->procp->aio_activeq, entryp, aio_workq_link );
1421                 aio_anchor.aio_active_count++;
1422                 entryp->procp->aio_active_count++;
1423         }
1424
1425         return( entryp );
1426
1427 } /* aio_get_some_work */
1428
1429
1430 /*
1431  * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed at
1432  * this time.  Delay will happen when there are any active IOs for the same file
1433  * descriptor that were queued at time the aio_sync call was queued.
1434  * NOTE - AIO_LOCK must be held by caller
1435  */
1436 static boolean_t
1437 aio_delay_fsync_request( aio_workq_entry *entryp )
1438 {
1439         aio_workq_entry                 *my_entryp;
1440
1441         TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1442                 if ( my_entryp->fsyncp != NULL &&
1443                          entryp->uaiocbp == my_entryp->fsyncp &&
1444                          entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1445                         return( TRUE );
1446                 }
1447         }
1448
1449         return( FALSE );
1450
1451 } /* aio_delay_fsync_request */
1452
1453
1454 /*
1455  * aio_queue_async_request - queue up an async IO request on our work queue then
1456  * wake up one of our worker threads to do the actual work.  We get a reference
1457  * to our caller's user land map in order to keep it around while we are
1458  * processing the request.
1459  */
1460
1461 static int
1462 aio_queue_async_request( struct proc *procp, struct aiocb *aiocbp, int kindOfIO )
1463 {
1464         aio_workq_entry                 *entryp;
1465         int                                             result;
1466
1467         entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1468         if ( entryp == NULL ) {
1469                 result = EAGAIN;
1470                 goto error_exit;
1471         }
1472         bzero( entryp, sizeof(*entryp) );
1473
1474         /* fill in the rest of the aio_workq_entry */
1475         entryp->procp = procp;
1476         entryp->uaiocbp = aiocbp;
1477         entryp->flags |= kindOfIO;
1478         entryp->aio_map = VM_MAP_NULL;
1479         result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1480         if ( result != 0 ) {
1481                 result = EAGAIN;
1482                 goto error_exit;
1483         }
1484
1485         /* do some more validation on the aiocb and embedded file descriptor */
1486         result = aio_validate( entryp );
1487         if ( result != 0 )
1488                 goto error_exit;
1489
1490         /* get a reference to the user land map in order to keep it around */
1491         entryp->aio_map = get_task_map( procp->task );
1492         vm_map_reference( entryp->aio_map );
1493
1494         AIO_LOCK;
1495
1496         if ( is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
1497                 AIO_UNLOCK;
1498                 result = EAGAIN;
1499                 goto error_exit;
1500         }
1501
1502         /* check our aio limits to throttle bad or rude user land behavior */
1503         if ( aio_get_all_queues_count( ) >= aio_max_requests ||
1504                  aio_get_process_count( procp ) >= aio_max_requests_per_process ) {
1505                 AIO_UNLOCK;
1506                 result = EAGAIN;
1507                 goto error_exit;
1508         }
1509
1510         /*
1511          * aio_fsync calls sync up all async IO requests queued at the time
1512          * the aio_fsync call was made.  So we mark each currently queued async
1513          * IO with a matching file descriptor as must complete before we do the
1514          * fsync.  We set the fsyncp field of each matching async IO
1515          * request with the aiocb pointer passed in on the aio_fsync call to
1516          * know which IOs must complete before we process the aio_fsync call.
1517          */
1518         if ( (kindOfIO & AIO_FSYNC) != 0 )
1519                 aio_mark_requests( entryp );
1520
1521         /* queue up on our aio asynchronous work queue */
1522         TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1523         aio_anchor.aio_async_workq_count++;
1524
1525         wakeup_one( &aio_anchor.aio_async_workq );
1526         AIO_UNLOCK;
1527
1528         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
1529                           (int)procp, (int)aiocbp, 0, 0, 0 );
1530
1531         return( 0 );
1532
1533 error_exit:
1534         if ( entryp != NULL ) {
1535                 /* this entry has not been queued up so no worries about unlocked */
1536                 /* state and aio_map */
1537                 aio_free_request( entryp, entryp->aio_map );
1538         }
1539
1540         return( result );
1541
1542 } /* aio_queue_async_request */
1543
1544
1545 /*
1546  * lio_create_async_entry - allocate an aio_workq_entry and fill it in.
1547  * If all goes well return 0 and pass the aio_workq_entry pointer back to
1548  * our caller.  We get a reference to our caller's user land map in order to keep
1549  * it around while we are processing the request.
1550  * lio_listio calls behave differently at completion they do completion notification
1551  * when all async IO requests have completed.  We use group_tag to tag IO requests
1552  * that behave in the delay notification manner.
1553  */
1554
1555 static int
1556 lio_create_async_entry( struct proc *procp, struct aiocb *aiocbp,
1557                                                  struct sigevent *sigp, long group_tag,
1558                                                  aio_workq_entry **entrypp )
1559 {
1560         aio_workq_entry                         *entryp;
1561         int                                                     result;
1562
1563         entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1564         if ( entryp == NULL ) {
1565                 result = EAGAIN;
1566                 goto error_exit;
1567         }
1568         bzero( entryp, sizeof(*entryp) );
1569
1570         /* fill in the rest of the aio_workq_entry */
1571         entryp->procp = procp;
1572         entryp->uaiocbp = aiocbp;
1573         entryp->flags |= AIO_LIO;
1574         entryp->group_tag = group_tag;
1575         entryp->aio_map = VM_MAP_NULL;
1576         result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1577         if ( result != 0 ) {
1578                 result = EAGAIN;
1579                 goto error_exit;
1580         }
1581
1582         /* look for lio_listio LIO_NOP requests and ignore them. */
1583         /* Not really an error, but we need to free our aio_workq_entry.  */
1584         if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
1585                 result = 0;
1586                 goto error_exit;
1587         }
1588
1589         /* use sigevent passed in to lio_listio for each of our calls, but only */
1590         /* do completion notification after the last request completes. */
1591         if ( sigp != NULL ) {
1592                 result = copyin( sigp, &entryp->aiocb.aio_sigevent, sizeof(entryp->aiocb.aio_sigevent) );
1593                 if ( result != 0 ) {
1594                         result = EAGAIN;
1595                         goto error_exit;
1596                 }
1597         }
1598
1599         /* do some more validation on the aiocb and embedded file descriptor */
1600         result = aio_validate( entryp );
1601         if ( result != 0 )
1602                 goto error_exit;
1603
1604         /* get a reference to the user land map in order to keep it around */
1605         entryp->aio_map = get_task_map( procp->task );
1606         vm_map_reference( entryp->aio_map );
1607
1608         *entrypp = entryp;
1609         return( 0 );
1610
1611 error_exit:
1612         if ( entryp != NULL )
1613                 zfree( aio_workq_zonep, (vm_offset_t) entryp );
1614
1615         return( result );
1616
1617 } /* lio_create_async_entry */
1618
1619
1620 /*
1621  * aio_mark_requests - aio_fsync calls synchronize file data for all queued async IO
1622  * requests at the moment the aio_fsync call is queued.  We use aio_workq_entry.fsyncp
1623  * to mark each async IO that must complete before the fsync is done.  We use the uaiocbp
1624  * field from the aio_fsync call as the aio_workq_entry.fsyncp in marked requests.
1625  * NOTE - AIO_LOCK must be held by caller
1626  */
1627
1628 static void
1629 aio_mark_requests( aio_workq_entry *entryp )
1630 {
1631         aio_workq_entry                 *my_entryp;
1632
1633         TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1634                 if ( entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1635                         my_entryp->fsyncp = entryp->uaiocbp;
1636                 }
1637         }
1638
1639         TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
1640                 if ( entryp->procp == my_entryp->procp &&
1641                          entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1642                         my_entryp->fsyncp = entryp->uaiocbp;
1643                 }
1644         }
1645
1646 } /* aio_mark_requests */
1647
1648
1649 /*
1650  * lio_create_sync_entry - allocate an aio_workq_entry and fill it in.
1651  * If all goes well return 0 and pass the aio_workq_entry pointer back to
1652  * our caller.
1653  * lio_listio calls behave differently at completion they do completion notification
1654  * when all async IO requests have completed.  We use group_tag to tag IO requests
1655  * that behave in the delay notification manner.
1656  */
1657
1658 static int
1659 lio_create_sync_entry( struct proc *procp, struct aiocb *aiocbp,
1660                                                 long group_tag, aio_workq_entry **entrypp )
1661 {
1662         aio_workq_entry                         *entryp;
1663         int                                                     result;
1664
1665         entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1666         if ( entryp == NULL ) {
1667                 result = EAGAIN;
1668                 goto error_exit;
1669         }
1670         bzero( entryp, sizeof(*entryp) );
1671
1672         /* fill in the rest of the aio_workq_entry */
1673         entryp->procp = procp;
1674         entryp->uaiocbp = aiocbp;
1675         entryp->flags |= AIO_LIO;
1676         entryp->group_tag = group_tag;
1677         entryp->aio_map = VM_MAP_NULL;
1678         result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1679         if ( result != 0 ) {
1680                 result = EAGAIN;
1681                 goto error_exit;
1682         }
1683
1684         /* look for lio_listio LIO_NOP requests and ignore them. */
1685         /* Not really an error, but we need to free our aio_workq_entry.  */
1686         if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
1687                 result = 0;
1688                 goto error_exit;
1689         }
1690
1691         result = aio_validate( entryp );
1692         if ( result != 0 ) {
1693                 goto error_exit;
1694         }
1695
1696         *entrypp = entryp;
1697         return( 0 );
1698
1699 error_exit:
1700         if ( entryp != NULL )
1701                 zfree( aio_workq_zonep, (vm_offset_t) entryp );
1702
1703         return( result );
1704
1705 } /* lio_create_sync_entry */
1706
1707
1708 /*
1709  * aio_free_request - remove our reference on the user land map and
1710  * free the work queue entry resources.
1711  * We are not holding the lock here thus aio_map is passed in and
1712  * zeroed while we did have the lock.
1713  */
1714
1715 static int
1716 aio_free_request( aio_workq_entry *entryp, vm_map_t the_map )
1717 {
1718         /* remove our reference to the user land map. */
1719         if ( VM_MAP_NULL != the_map ) {
1720                 vm_map_deallocate( the_map );
1721         }
1722
1723         zfree( aio_workq_zonep, (vm_offset_t) entryp );
1724
1725         return( 0 );
1726
1727 } /* aio_free_request */
1728
1729
1730 /* aio_validate - validate the aiocb passed in by one of the aio syscalls.
1731  */
1732
1733 static int
1734 aio_validate( aio_workq_entry *entryp )
1735 {
1736         boolean_t                                       funnel_state;
1737         struct file                             *fp;
1738         int                                                     flag;
1739         int                                                     result;
1740
1741         result = 0;
1742
1743         if ( (entryp->flags & AIO_LIO) != 0 ) {
1744                 if ( entryp->aiocb.aio_lio_opcode == LIO_READ )
1745                         entryp->flags |= AIO_READ;
1746                 else if ( entryp->aiocb.aio_lio_opcode == LIO_WRITE )
1747                         entryp->flags |= AIO_WRITE;
1748                 else if ( entryp->aiocb.aio_lio_opcode == LIO_NOP )
1749                         return( 0 );
1750                 else
1751                         return( EINVAL );
1752         }
1753
1754         flag = FREAD;
1755         if ( (entryp->flags & (AIO_WRITE | AIO_FSYNC)) != 0 ) {
1756                 flag = FWRITE;
1757         }
1758
1759         if ( (entryp->flags & (AIO_READ | AIO_WRITE)) != 0 ) {
1760                 if ( entryp->aiocb.aio_offset < 0                       ||
1761                          entryp->aiocb.aio_nbytes < 0                   ||
1762                          entryp->aiocb.aio_nbytes > INT_MAX     ||
1763                          entryp->aiocb.aio_buf == NULL )
1764                         return( EINVAL );
1765         }
1766
1767         /* validate aiocb.aio_sigevent.  at this point we only support sigev_notify
1768          * equal to SIGEV_SIGNAL or SIGEV_NONE.  this means sigev_value,
1769          * sigev_notify_function, and sigev_notify_attributes are ignored.
1770          */
1771         if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ) {
1772                 int             signum;
1773                 /* make sure we have a valid signal number */
1774                 signum = entryp->aiocb.aio_sigevent.sigev_signo;
1775                 if ( signum <= 0 || signum >= NSIG ||
1776                          signum == SIGKILL || signum == SIGSTOP )
1777                         return (EINVAL);
1778         }
1779         else if ( entryp->aiocb.aio_sigevent.sigev_notify != SIGEV_NONE )
1780                 return (EINVAL);
1781
1782         /* validate the file descriptor and that the file was opened
1783          * for the appropriate read / write access.  This section requires
1784          * kernel funnel lock.
1785          */
1786         funnel_state = thread_funnel_set( kernel_flock, TRUE );
1787
1788         result = fdgetf( entryp->procp, entryp->aiocb.aio_fildes, &fp );
1789         if ( result == 0 ) {
1790                 if ( (fp->f_flag & flag) == 0 ) {
1791                         /* we don't have read or write access */
1792                         result = EBADF;
1793                 }
1794                 else if ( fp->f_type != DTYPE_VNODE ) {
1795                         /* this is not a file */
1796                         result = ESPIPE;
1797                 }
1798         }
1799         else {
1800                 result = EBADF;
1801         }
1802
1803         (void) thread_funnel_set( kernel_flock, funnel_state );
1804
1805         return( result );
1806
1807 } /* aio_validate */
1808
1809
1810 /*
1811  * aio_get_process_count - runs through our queues that hold outstanding
1812  * async IO reqests and totals up number of requests for the given
1813  * process.
1814  * NOTE - caller must hold aio lock!
1815  */
1816
1817 static int
1818 aio_get_process_count( struct proc *procp )
1819 {
1820         aio_workq_entry                         *entryp;
1821         int                                                     error;
1822         int                                                     count;
1823
1824         /* begin with count of completed async IO requests for this process */
1825         count = procp->aio_done_count;
1826
1827         /* add in count of active async IO requests for this process */
1828         count += procp->aio_active_count;
1829
1830         /* look for matches on our queue of asynchronous todo work */
1831         TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
1832                 if ( procp == entryp->procp ) {
1833                         count++;
1834                 }
1835         }
1836
1837         /* look for matches on our queue of synchronous todo work */
1838         TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
1839                 if ( procp == entryp->procp ) {
1840                         count++;
1841                 }
1842         }
1843
1844         return( count );
1845
1846 } /* aio_get_process_count */
1847
1848
1849 /*
1850  * aio_get_all_queues_count - get total number of entries on all aio work queues.
1851  * NOTE - caller must hold aio lock!
1852  */
1853
1854 static int
1855 aio_get_all_queues_count( void )
1856 {
1857         int                                                     count;
1858
1859         count = aio_anchor.aio_async_workq_count;
1860         count += aio_anchor.lio_sync_workq_count;
1861         count += aio_anchor.aio_active_count;
1862         count += aio_anchor.aio_done_count;
1863
1864         return( count );
1865
1866 } /* aio_get_all_queues_count */
1867
1868
1869 /*
1870  * do_aio_completion.  Handle async IO completion.
1871  */
1872
1873 static void
1874 do_aio_completion( aio_workq_entry *entryp )
1875 {
1876         /* signal user land process if appropriate */
1877         if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
1878                  (entryp->flags & AIO_DISABLE) == 0 ) {
1879
1880                 /*
1881                  * if group_tag is non zero then make sure this is the last IO request
1882                  * in the group before we signal.
1883                  */
1884                 if ( entryp->group_tag == 0 ||
1885                          (entryp->group_tag != 0 && aio_last_group_io( entryp )) ) {
1886                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) | DBG_FUNC_NONE,
1887                                                   (int)entryp->procp, (int)entryp->uaiocbp,
1888                                                   entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 );
1889
1890                         psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );
1891                         return;
1892                 }
1893         }
1894
1895         /*
1896          * need to handle case where a process is trying to exit, exec, or close
1897          * and is currently waiting for active aio requests to complete.  If
1898          * AIO_WAITING is set then we need to look to see if there are any
1899          * other requests in the active queue for this process.  If there are
1900          * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel.  If
1901          * there are some still active then do nothing - we only want to wakeup
1902          * when all active aio requests for the process are complete.
1903          */
1904         if ( (entryp->flags & AIO_WAITING) != 0 ) {
1905                 int             active_requests;
1906
1907                 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
1908                                           (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1909
1910                 AIO_LOCK;
1911                 active_requests = aio_active_requests_for_process( entryp->procp );
1912                 //AIO_UNLOCK;
1913                 if ( active_requests < 1 ) {
1914                         /* no active aio requests for this process, continue exiting */
1915                         wakeup_one( &entryp->procp->AIO_CLEANUP_SLEEP_CHAN );
1916
1917                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
1918                                                   (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1919                 }
1920                 AIO_UNLOCK;
1921                 return;
1922         }
1923
1924         /*
1925          * aio_suspend case when a signal was not requested.  In that scenario we
1926          * are sleeping on the AIO_SUSPEND_SLEEP_CHAN channel.
1927          * NOTE - the assumption here is that this wakeup call is inexpensive.
1928          * we really only need to do this when an aio_suspend call is pending.
1929          * If we find the wakeup call should be avoided we could mark the
1930          * async IO requests given in the list provided by aio_suspend and only
1931          * call wakeup for them.  If we do mark them we should unmark them after
1932          * the aio_suspend wakes up.
1933          */
1934         AIO_LOCK;
1935         wakeup_one( &entryp->procp->AIO_SUSPEND_SLEEP_CHAN );
1936         AIO_UNLOCK;
1937
1938         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE,
1939                                   (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1940
1941         return;
1942
1943 } /* do_aio_completion */
1944
1945
1946 /*
1947  * aio_last_group_io - checks to see if this is the last unfinished IO request
1948  * for the given group_tag.  Returns TRUE if there are no other active IO
1949  * requests for this group or FALSE if the are active IO requests
1950  * NOTE - AIO_LOCK must be held by caller
1951  */
1952
1953 static boolean_t
1954 aio_last_group_io( aio_workq_entry *entryp )
1955 {
1956         aio_workq_entry                         *my_entryp;
1957
1958         /* look for matches on our queue of active async IO requests */
1959         TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1960                 if ( my_entryp->group_tag == entryp->group_tag )
1961                         return( FALSE );
1962         }
1963
1964         /* look for matches on our queue of asynchronous todo work */
1965         TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
1966                 if ( my_entryp->group_tag == entryp->group_tag )
1967                         return( FALSE );
1968         }
1969
1970         /* look for matches on our queue of synchronous todo work */
1971         TAILQ_FOREACH( my_entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
1972                 if ( my_entryp->group_tag == entryp->group_tag )
1973                         return( FALSE );
1974         }
1975
1976         return( TRUE );
1977
1978 } /* aio_last_group_io */
1979
1980
1981 /*
1982  * do_aio_read
1983  */
1984 static int
1985 do_aio_read( aio_workq_entry *entryp )
1986 {
1987         struct file                     *fp;
1988         int                                             error;
1989
1990         fp = holdfp( entryp->procp->p_fd, entryp->aiocb.aio_fildes, FREAD );
1991         if ( fp != NULL ) {
1992                 error = dofileread( entryp->procp, fp, entryp->aiocb.aio_fildes,
1993                                                         (void *)entryp->aiocb.aio_buf,
1994                                                         entryp->aiocb.aio_nbytes,
1995                                                         entryp->aiocb.aio_offset, FOF_OFFSET,
1996                                                         &entryp->returnval );
1997                 frele( fp );
1998         }
1999         else
2000                 error = EBADF;
2001
2002         return( error );
2003
2004 } /* do_aio_read */
2005
2006
2007 /*
2008  * do_aio_write
2009  */
2010 static int
2011 do_aio_write( aio_workq_entry *entryp )
2012 {
2013         struct file                     *fp;
2014         int                                             error;
2015
2016         fp = holdfp( entryp->procp->p_fd, entryp->aiocb.aio_fildes, FWRITE );
2017         if ( fp != NULL ) {
2018                 error = dofilewrite( entryp->procp, fp, entryp->aiocb.aio_fildes,
2019                                                          (const void *)entryp->aiocb.aio_buf,
2020                                                          entryp->aiocb.aio_nbytes,
2021                                                          entryp->aiocb.aio_offset, FOF_OFFSET,
2022                                                          &entryp->returnval );
2023                 frele( fp );
2024         }
2025         else
2026                 error = EBADF;
2027
2028         return( error );
2029
2030 } /* do_aio_write */
2031
2032
2033 /*
2034  * aio_active_requests_for_process - return number of active async IO
2035  * requests for the given process.
2036  * NOTE - caller must hold aio lock!
2037  */
2038
2039 static int
2040 aio_active_requests_for_process( struct proc *procp )
2041 {
2042
2043         return( procp->aio_active_count );
2044
2045 } /* aio_active_requests_for_process */
2046
2047
2048 /*
2049  * do_aio_fsync
2050  */
2051 static int
2052 do_aio_fsync( aio_workq_entry *entryp )
2053 {
2054         register struct vnode   *vp;
2055         struct file                     *fp;
2056         int                                             error;
2057
2058         /*
2059          * NOTE - we will not support AIO_DSYNC until fdatasync() is supported.
2060          * AIO_DSYNC is caught before we queue up a request and flagged as an error.
2061          * The following was shamelessly extracted from fsync() implementation.
2062          */
2063         error = getvnode( entryp->procp, entryp->aiocb.aio_fildes, &fp );
2064         if ( error == 0 ) {
2065                 vp = (struct vnode *)fp->f_data;
2066                 vn_lock( vp, LK_EXCLUSIVE | LK_RETRY, entryp->procp );
2067                 error = VOP_FSYNC( vp, fp->f_cred, MNT_WAIT, entryp->procp );
2068                 VOP_UNLOCK( vp, 0, entryp->procp );
2069         }
2070         if ( error != 0 )
2071                 entryp->returnval = -1;
2072
2073         return( error );
2074
2075 } /* do_aio_fsync */
2076
2077
2078 /*
2079  * is_already_queued - runs through our queues to see if the given
2080  * aiocbp / process is there.  Returns TRUE if there is a match
2081  * on any of our aio queues.
2082  * NOTE - callers must hold aio lock!
2083  */
2084
2085 static boolean_t
2086 is_already_queued(      struct proc *procp,
2087                                         struct aiocb *aiocbp )
2088 {
2089         aio_workq_entry                 *entryp;
2090         boolean_t                               result;
2091
2092         result = FALSE;
2093
2094         /* look for matches on our queue of async IO requests that have completed */
2095         TAILQ_FOREACH( entryp, &procp->aio_doneq, aio_workq_link ) {
2096                 if ( aiocbp == entryp->uaiocbp ) {
2097                         result = TRUE;
2098                         goto ExitThisRoutine;
2099                 }
2100         }
2101
2102         /* look for matches on our queue of active async IO requests */
2103         TAILQ_FOREACH( entryp, &procp->aio_activeq, aio_workq_link ) {
2104                 if ( aiocbp == entryp->uaiocbp ) {
2105                         result = TRUE;
2106                         goto ExitThisRoutine;
2107                 }
2108         }
2109
2110         /* look for matches on our queue of asynchronous todo work */
2111         TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
2112                 if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
2113                         result = TRUE;
2114                         goto ExitThisRoutine;
2115                 }
2116         }
2117
2118         /* look for matches on our queue of synchronous todo work */
2119         TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
2120                 if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
2121                         result = TRUE;
2122                         goto ExitThisRoutine;
2123                 }
2124         }
2125
2126 ExitThisRoutine:
2127         return( result );
2128
2129 } /* is_already_queued */
2130
2131
2132 /*
2133  * aio initialization
2134  */
2135 __private_extern__ void
2136 aio_init( void )
2137 {
2138         int                     i;
2139
2140         simple_lock_init( &aio_lock );
2141
2142         AIO_LOCK;
2143         TAILQ_INIT( &aio_anchor.aio_async_workq );
2144         TAILQ_INIT( &aio_anchor.lio_sync_workq );
2145         aio_anchor.aio_async_workq_count = 0;
2146         aio_anchor.lio_sync_workq_count = 0;
2147         aio_anchor.aio_active_count = 0;
2148         aio_anchor.aio_done_count = 0;
2149         AIO_UNLOCK;
2150
2151         i = sizeof( aio_workq_entry );
2152         aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );
2153
2154         _aio_create_worker_threads( aio_worker_threads );
2155
2156         return;
2157
2158 } /* aio_init */
2159
2160
2161 /*
2162  * aio worker threads created here.
2163  */
2164 __private_extern__ void
2165 _aio_create_worker_threads( int num )
2166 {
2167         int                     i;
2168
2169         /* create some worker threads to handle the async IO requests */
2170         for ( i = 0; i < num; i++ ) {
2171                 thread_t                myThread;
2172
2173                 myThread = kernel_thread( kernel_task, aio_work_thread );
2174                 if ( THREAD_NULL == myThread ) {
2175                         printf( "%s - failed to create a work thread \n", __FUNCTION__ );
2176                 }
2177         }
2178
2179         return;
2180
2181 } /* _aio_create_worker_threads */
2182
2183 /*
2184  * Return the current activation utask
2185  */
2186 task_t
2187 get_aiotask(void)
2188 {
2189         return  ((struct uthread *)get_bsdthread_info(current_act()))->uu_aio_task;
2190 }