/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

/*
 * todo:
 * 1) ramesh is looking into how to replace taking a reference on
 *    the user's map (vm_map_reference()) since it is believed that
 *    would not hold the process for us.
 * 2) david is looking into a way for us to set the priority of the
 *    worker threads to match that of the user's thread when the
 *    async IO was queued.
 */

/*
 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
 */
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/param.h>
#include <sys/sysctl.h>
#include <sys/unistd.h>
#include <sys/aio_kern.h>

#include <machine/limits.h>
#include <kern/zalloc.h>
#include <kern/task.h>

#include <sys/kdebug.h>
#define AIO_work_queued                 1
#define AIO_worker_wake                 2
#define AIO_completion_sig              3
#define AIO_completion_cleanup_wait     4
#define AIO_completion_cleanup_wake     5
#define AIO_completion_suspend_wake     6
#define AIO_fsync_delay                 7
#define AIO_cancel_async_workq          11
#define AIO_cancel_sync_workq           12
#define AIO_cancel_activeq              13
#define AIO_cancel_doneq                14
#define AIO_error_val                   61
#define AIO_error_activeq               62
#define AIO_error_workq                 63
#define AIO_return_val                  71
#define AIO_return_activeq              72
#define AIO_return_workq                73
#define AIO_exit_sleep                  91
#define AIO_close_sleep                 101
#define AIO_suspend                     110
#define AIO_suspend_sleep               111
#define AIO_worker_thread               120

#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
/*
 * aio requests queue up on the aio_async_workq or lio_sync_workq (for
 * lio_listio LIO_WAIT).  Requests then move to the per-process aio_activeq
 * (proc.aio_activeq) when one of our worker threads starts the IO.
 * Finally, requests move to the per-process aio_doneq (proc.aio_doneq)
 * when the IO request completes.  The request remains on aio_doneq until
 * the user process calls aio_return or the process exits; either way, that is
 * our trigger to release aio resources.
 */

struct aio_anchor_cb
{
    int                                 aio_async_workq_count;  /* entries on aio_async_workq */
    int                                 lio_sync_workq_count;   /* entries on lio_sync_workq */
    int                                 aio_active_count;       /* entries on all active queues (proc.aio_activeq) */
    int                                 aio_done_count;         /* entries on all done queues (proc.aio_doneq) */
    TAILQ_HEAD( , aio_workq_entry )     aio_async_workq;
    TAILQ_HEAD( , aio_workq_entry )     lio_sync_workq;
};
typedef struct aio_anchor_cb aio_anchor_cb;
/*
 * Notes on aio sleep / wake channels.
 * We currently pick a couple of fields within the proc structure to use as
 * sleep channels that do not collide with any other kernel routines.
 * At this time, for binary compatibility reasons, we cannot create new proc fields.
 */
#define AIO_SUSPEND_SLEEP_CHAN  p_estcpu
#define AIO_CLEANUP_SLEEP_CHAN  p_pctcpu

/*
 * async IO locking macros used to protect critical sections.
 */
#define AIO_LOCK    usimple_lock( &aio_lock )
#define AIO_UNLOCK  usimple_unlock( &aio_lock )
/*
 * LOCAL PROTOTYPES
 */
static int          aio_active_requests_for_process( struct proc *procp );
static boolean_t    aio_delay_fsync_request( aio_workq_entry *entryp );
static int          aio_free_request( aio_workq_entry *entryp, vm_map_t the_map );
static int          aio_get_all_queues_count( void );
static int          aio_get_process_count( struct proc *procp );
static aio_workq_entry *  aio_get_some_work( void );
static boolean_t    aio_last_group_io( aio_workq_entry *entryp );
static void         aio_mark_requests( aio_workq_entry *entryp );
static int          aio_queue_async_request( struct proc *procp,
                                             struct aiocb *aiocbp,
                                             int kindOfIO );
static int          aio_validate( aio_workq_entry *entryp );
static void         aio_work_thread( void );
static int          do_aio_cancel( struct proc *p,
                                   int fd,
                                   struct aiocb *aiocbp,
                                   boolean_t wait_for_completion,
                                   boolean_t disable_notification );
static void         do_aio_completion( aio_workq_entry *entryp );
static int          do_aio_fsync( aio_workq_entry *entryp );
static int          do_aio_read( aio_workq_entry *entryp );
static int          do_aio_write( aio_workq_entry *entryp );
static boolean_t    is_already_queued( struct proc *procp,
                                       struct aiocb *aiocbp );
static int          lio_create_async_entry( struct proc *procp,
                                            struct aiocb *aiocbp,
                                            struct sigevent *sigp,
                                            long group_tag,
                                            aio_workq_entry **entrypp );
static int          lio_create_sync_entry( struct proc *procp,
                                           struct aiocb *aiocbp,
                                           long group_tag,
                                           aio_workq_entry **entrypp );
/*
 * EXTERNAL PROTOTYPES
 */

/* in ...bsd/kern/sys_generic.c */
extern struct file *  holdfp( struct filedesc *fdp, int fd, int flag );
extern int   dofileread( struct proc *p, struct file *fp, int fd,
                         void *buf, size_t nbyte, off_t offset,
                         int flags, int *retval );
extern int   dofilewrite( struct proc *p, struct file *fp, int fd,
                          const void *buf, size_t nbyte, off_t offset,
                          int flags, int *retval );
extern vm_map_t  vm_map_switch( vm_map_t map );

/*
 * aio external global variables.
 */
extern int aio_max_requests;              /* AIO_MAX - configurable */
extern int aio_max_requests_per_process;  /* AIO_PROCESS_MAX - configurable */
extern int aio_worker_threads;            /* AIO_THREAD_COUNT - configurable */

/*
 * aio static variables.
 */
static aio_anchor_cb        aio_anchor;
static simple_lock_data_t   aio_lock;
static struct zone          *aio_workq_zonep;
/*
 * syscall input parameters
 */
#ifndef _SYS_SYSPROTO_H_

struct aio_cancel_args {
    int              fd;
    struct aiocb     *aiocbp;
};

struct aio_error_args {
    struct aiocb     *aiocbp;
};

struct aio_fsync_args {
    int              op;
    struct aiocb     *aiocbp;
};

struct aio_read_args {
    struct aiocb     *aiocbp;
};

struct aio_return_args {
    struct aiocb     *aiocbp;
};

struct aio_suspend_args {
    struct aiocb *const      *aiocblist;
    int                      nent;
    const struct timespec    *timeoutp;
};

struct aio_write_args {
    struct aiocb     *aiocbp;
};

struct lio_listio_args {
    int                      mode;
    struct aiocb *const      *aiocblist;
    int                      nent;
    struct sigevent          *sigp;
};

#endif /* _SYS_SYSPROTO_H_ */
/*
 * aio_cancel - attempt to cancel one or more async IO requests currently
 * outstanding against file descriptor uap->fd.  If uap->aiocbp is not
 * NULL then only one specific IO is cancelled (if possible).  If uap->aiocbp
 * is NULL then all outstanding async IO requests for the given file
 * descriptor are cancelled (if possible).
 */
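/*
 * Illustrative user-land sketch (not part of this file): how aio_cancel(2)
 * is typically called and how its return values map onto the results this
 * routine produces.  The file descriptor and messages below are hypothetical.
 */
#if 0   /* user-land example, never compiled into the kernel */
#include <aio.h>
#include <stdio.h>

static void
cancel_all_io_on_fd( int fd )
{
    /* a NULL aiocbp means: try to cancel every outstanding request on fd */
    switch ( aio_cancel( fd, NULL ) ) {
    case AIO_CANCELED:
        printf( "all requests cancelled\n" );
        break;
    case AIO_NOTCANCELED:
        printf( "some requests are already in progress\n" );
        break;
    case AIO_ALLDONE:
        printf( "all requests had already completed\n" );
        break;
    default:
        perror( "aio_cancel" );
    }
}
#endif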
aio_cancel( struct proc *p, struct aio_cancel_args *uap, int *retval )
    struct aiocb        my_aiocb;
    boolean_t           funnel_state;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START,
                  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    /* quick check to see if there are any async IO requests queued up */
    result = aio_get_all_queues_count( );

    if ( uap->aiocbp != NULL ) {
        result = copyin( uap->aiocbp, &my_aiocb, sizeof(my_aiocb) );

        /* NOTE - POSIX standard says a mismatch between the file */
        /* descriptor passed in and the file descriptor embedded in */
        /* the aiocb causes unspecified results.  We return EBADF in */
        /* that situation. */
        if ( uap->fd != my_aiocb.aio_fildes ) {

    /* current BSD code assumes funnel lock is held */
    funnel_state = thread_funnel_set( kernel_flock, TRUE );
    result = do_aio_cancel( p, uap->fd, uap->aiocbp, FALSE, FALSE );
    (void) thread_funnel_set( kernel_flock, funnel_state );

    if ( result != -1 ) {

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END,
                  (int)p, (int)uap->aiocbp, result, 0, 0 );
/*
 * _aio_close - internal function used to clean up async IO requests for
 * a file descriptor that is closing.
 * NOTE - kernel funnel lock is held when we get called.
 */
__private_extern__ void
_aio_close( struct proc *p, int fd )
    /* quick check to see if there are any async IO requests queued up */
    count = aio_get_all_queues_count( );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_START,
                  (int)p, fd, 0, 0, 0 );

    /* cancel all async IO requests on our todo queues for this file descriptor */
    error = do_aio_cancel( p, fd, NULL, TRUE, FALSE );
    if ( error == AIO_NOTCANCELED ) {
        /*
         * AIO_NOTCANCELED is returned when we find an aio request for this process
         * and file descriptor on the active async IO queue.  Active requests cannot
         * be cancelled so we must wait for them to complete.  We will get a special
         * wake up call on our channel used to sleep for ALL active requests to
         * complete.  This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
         * when we must wait for all active aio requests.
         */
        KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) | DBG_FUNC_NONE,
                      (int)p, fd, 0, 0, 0 );

        tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_close", 0 );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_END,
                  (int)p, fd, 0, 0, 0 );
/*
 * aio_error - return the error status associated with the async IO
 * request referred to by uap->aiocbp.  The error status is the errno
 * value that would be set by the corresponding IO request (read, write,
 * fdatasync, or sync).
 */
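/*
 * Illustrative user-land sketch (not part of this file): polling a previously
 * submitted request with aio_error(2).  EINPROGRESS means the request is still
 * on the work or active queue; any other value is the errno the IO would have
 * set.  The aiocb is assumed to have been submitted with aio_read/aio_write.
 */
#if 0   /* user-land example, never compiled into the kernel */
#include <aio.h>
#include <errno.h>
#include <unistd.h>

static int
wait_for_aio( struct aiocb *cb )
{
    int err;

    while ( (err = aio_error( cb )) == EINPROGRESS )
        usleep( 1000 );     /* simple poll; aio_suspend() avoids the spin */
    return err;             /* 0 on success, else the errno value */
}
#endif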
aio_error( struct proc *p, struct aio_error_args *uap, int *retval )
    aio_workq_entry     *entryp;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_START,
                  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    /* quick check to see if there are any async IO requests queued up */
    if ( aio_get_all_queues_count( ) < 1 ) {

    /* look for a match on our queue of async IO requests that have completed */
    TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
        if ( entryp->uaiocbp == uap->aiocbp ) {
            *retval = entryp->errorval;

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) | DBG_FUNC_NONE,
                          (int)p, (int)uap->aiocbp, *retval, 0, 0 );

    /* look for a match on our queue of active async IO requests */
    TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
        if ( entryp->uaiocbp == uap->aiocbp ) {
            *retval = EINPROGRESS;

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) | DBG_FUNC_NONE,
                          (int)p, (int)uap->aiocbp, *retval, 0, 0 );

    /* look for a match on our queue of todo work */
    TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
        if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
            *retval = EINPROGRESS;

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_workq)) | DBG_FUNC_NONE,
                          (int)p, (int)uap->aiocbp, *retval, 0, 0 );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_END,
                  (int)p, (int)uap->aiocbp, error, 0, 0 );
/*
 * aio_fsync - asynchronously force all IO operations associated
 * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
 * queued at the time of the call to the synchronized completion state.
 * NOTE - we do not support op O_DSYNC at this point since we do not support the
 * fdatasync() call.
 */
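/*
 * Illustrative user-land sketch (not part of this file): queueing an
 * asynchronous fsync with aio_fsync(2).  Only O_SYNC is shown since
 * O_DSYNC/fdatasync() is not supported here; the aiocb only needs
 * aio_fildes (and optionally aio_sigevent) filled in.  Names are hypothetical.
 */
#if 0   /* user-land example, never compiled into the kernel */
#include <aio.h>
#include <fcntl.h>
#include <string.h>

static int
queue_fsync( int fd, struct aiocb *cb )
{
    memset( cb, 0, sizeof(*cb) );
    cb->aio_fildes = fd;
    return aio_fsync( O_SYNC, cb );   /* 0 if queued, -1 + errno otherwise */
}
#endif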
aio_fsync( struct proc *p, struct aio_fsync_args *uap, int *retval )
    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_START,
                  (int)p, (int)uap->aiocbp, uap->op, 0, 0 );

    if ( uap->op == O_SYNC )
        fsync_kind = AIO_FSYNC;
#if 0 // we don't support fdatasync() call yet
    else if ( uap->op == O_DSYNC )
        fsync_kind = AIO_DSYNC;

    error = aio_queue_async_request( p, uap->aiocbp, fsync_kind );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_END,
                  (int)p, (int)uap->aiocbp, error, 0, 0 );
/*
 * aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
 * file descriptor (uap->aiocbp->aio_fildes) into the buffer
 * (uap->aiocbp->aio_buf).
 */
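/*
 * Illustrative user-land sketch (not part of this file): submitting an
 * asynchronous read with aio_read(2).  aio_fildes, aio_buf, aio_nbytes and
 * aio_offset are the fields checked by aio_validate() below; the helper name
 * and buffer are hypothetical.
 */
#if 0   /* user-land example, never compiled into the kernel */
#include <aio.h>
#include <string.h>
#include <sys/types.h>

static int
submit_read( int fd, void *buf, size_t len, off_t offset, struct aiocb *cb )
{
    memset( cb, 0, sizeof(*cb) );
    cb->aio_fildes = fd;
    cb->aio_buf    = buf;
    cb->aio_nbytes = len;
    cb->aio_offset = offset;
    return aio_read( cb );      /* 0 if queued, -1 + errno otherwise */
}
#endif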
aio_read( struct proc *p, struct aio_read_args *uap, int *retval )
    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_START,
                  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    error = aio_queue_async_request( p, uap->aiocbp, AIO_READ );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_END,
                  (int)p, (int)uap->aiocbp, error, 0, 0 );
/*
 * aio_return - return the return status associated with the async IO
 * request referred to by uap->aiocbp.  The return status is the value
 * that would be returned by the corresponding IO request (read, write,
 * fdatasync, or sync).  This is where we release kernel resources
 * held for the async IO call associated with the given aiocb pointer.
 */
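/*
 * Illustrative user-land sketch (not part of this file): reaping a completed
 * request.  aio_return(2) should be called exactly once per completed aiocb -
 * that call is what lets the kernel release the aio_workq_entry kept on the
 * done queue.  The aiocb is assumed to have been submitted earlier.
 */
#if 0   /* user-land example, never compiled into the kernel */
#include <aio.h>
#include <errno.h>

static ssize_t
reap_aio( struct aiocb *cb )
{
    if ( aio_error( cb ) == EINPROGRESS )
        return -1;              /* not done yet; calling aio_return now is invalid */
    return aio_return( cb );    /* bytes transferred, or -1 with errno set */
}
#endif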
aio_return( struct proc *p, struct aio_return_args *uap, register_t *retval )
    aio_workq_entry     *entryp;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_START,
                  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    /* quick check to see if there are any async IO requests queued up */
    if ( aio_get_all_queues_count( ) < 1 ) {

    /* look for a match on our queue of async IO requests that have completed */
    TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
        if ( entryp->uaiocbp == uap->aiocbp ) {
            TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
            aio_anchor.aio_done_count--;

            *retval = entryp->returnval;

            /* we cannot free requests that are still completing */
            if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
                my_map = entryp->aio_map;
                entryp->aio_map = VM_MAP_NULL;
                aio_free_request( entryp, my_map );

            /* tell completion code to free this request */
            entryp->flags |= AIO_DO_FREE;

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) | DBG_FUNC_NONE,
                          (int)p, (int)uap->aiocbp, *retval, 0, 0 );

    /* look for a match on our queue of active async IO requests */
    TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
        if ( entryp->uaiocbp == uap->aiocbp ) {
            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) | DBG_FUNC_NONE,
                          (int)p, (int)uap->aiocbp, *retval, 0, 0 );

    /* look for a match on our queue of todo work */
    TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
        if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_workq)) | DBG_FUNC_NONE,
                          (int)p, (int)uap->aiocbp, *retval, 0, 0 );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_END,
                  (int)p, (int)uap->aiocbp, error, 0, 0 );
/*
 * _aio_exec - internal function used to clean up async IO requests for
 * a process that is going away due to exec().  We cancel any async IOs
 * we can and wait for those already active.  We also disable signaling
 * for cancelled or active aio requests that complete.
 * NOTE - kernel funnel lock is held when we get called.
 * This routine MAY block!
 */
__private_extern__ void
_aio_exec( struct proc *p )
    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_START,
                  (int)p, 0, 0, 0, 0 );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_END,
                  (int)p, 0, 0, 0, 0 );

/*
 * _aio_exit - internal function used to clean up async IO requests for
 * a process that is terminating (via exit() or exec()).  We cancel any async IOs
 * we can and wait for those already active.  We also disable signaling
 * for cancelled or active aio requests that complete.  This routine MAY block!
 * NOTE - kernel funnel lock is held when we get called.
 */
__private_extern__ void
_aio_exit( struct proc *p )
    aio_workq_entry     *entryp;

    /* quick check to see if there are any async IO requests queued up */
    count = aio_get_all_queues_count( );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_START,
                  (int)p, 0, 0, 0, 0 );

    /*
     * cancel async IO requests on the todo work queue and wait for those
     * already active to complete.
     */
    error = do_aio_cancel( p, 0, NULL, TRUE, TRUE );
    if ( error == AIO_NOTCANCELED ) {
        /*
         * AIO_NOTCANCELED is returned when we find an aio request for this process
         * on the active async IO queue.  Active requests cannot be cancelled so we
         * must wait for them to complete.  We will get a special wake up call on
         * our channel used to sleep for ALL active requests to complete.  This sleep
         * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
         * active aio requests.
         */
        KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) | DBG_FUNC_NONE,
                      (int)p, 0, 0, 0, 0 );

        tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_exit", 0 );

    /* release all aio resources used by this process */
    entryp = TAILQ_FIRST( &p->aio_doneq );
    while ( entryp != NULL ) {
        aio_workq_entry     *next_entryp;

        next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
        TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
        aio_anchor.aio_done_count--;

        /* we cannot free requests that are still completing */
        if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
            my_map = entryp->aio_map;
            entryp->aio_map = VM_MAP_NULL;
            aio_free_request( entryp, my_map );

            /* need to start over since aio_doneq may have been */
            /* changed while we were away. */
            entryp = TAILQ_FIRST( &p->aio_doneq );

        /* tell completion code to free this request */
        entryp->flags |= AIO_DO_FREE;
        entryp = next_entryp;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END,
                  (int)p, 0, 0, 0, 0 );
/*
 * do_aio_cancel - cancel async IO requests (if possible).  We get called by
 * aio_cancel, close, and at exit.
 * There are three modes of operation: 1) cancel all async IOs for a process -
 * fd is 0 and aiocbp is NULL 2) cancel all async IOs for file descriptor - fd
 * is > 0 and aiocbp is NULL 3) cancel one async IO associated with the given
 * aiocbp.
 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
 * target async IO requests, and AIO_ALLDONE if all target async IO requests
 * were already complete.
 * WARNING - do not dereference aiocbp in this routine, it may point to user
 * land data that has not been copied in (when called from aio_cancel())
 * NOTE - kernel funnel lock is held when we get called.
 */
do_aio_cancel( struct proc *p, int fd, struct aiocb *aiocbp,
               boolean_t wait_for_completion, boolean_t disable_notification )
    aio_workq_entry     *entryp;

    /* look for a match on our queue of async todo work. */
    entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
    while ( entryp != NULL ) {
        aio_workq_entry     *next_entryp;

        next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
        if ( p == entryp->procp ) {
            if ( (aiocbp == NULL && fd == 0) ||
                 (aiocbp != NULL && entryp->uaiocbp == aiocbp) ||
                 (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) {
                /* we found a match so we remove the entry from the */
                /* todo work queue and place it on the done queue */
                TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
                aio_anchor.aio_async_workq_count--;
                entryp->errorval = ECANCELED;
                entryp->returnval = -1;
                if ( disable_notification )
                    entryp->flags |= AIO_DISABLE;   /* flag for special completion processing */
                result = AIO_CANCELED;

                KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) | DBG_FUNC_NONE,
                              (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

                TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
                aio_anchor.aio_done_count++;

                entryp->flags |= AIO_COMPLETION;

                /* do completion processing for this request */
                do_aio_completion( entryp );

                entryp->flags &= ~AIO_COMPLETION;
                if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
                    my_map = entryp->aio_map;
                    entryp->aio_map = VM_MAP_NULL;
                    aio_free_request( entryp, my_map );

                if ( aiocbp != NULL ) {

                /* need to start over since aio_async_workq may have been */
                /* changed while we were away doing completion processing. */
                entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );

        entryp = next_entryp;

    /*
     * look for a match on our queue of synchronous todo work.  This will
     * be a rare occurrence but could happen if a process is terminated while
     * processing a lio_listio call.
     */
    entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
    while ( entryp != NULL ) {
        aio_workq_entry     *next_entryp;

        next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
        if ( p == entryp->procp ) {
            if ( (aiocbp == NULL && fd == 0) ||
                 (aiocbp != NULL && entryp->uaiocbp == aiocbp) ||
                 (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) {
                /* we found a match so we remove the entry from the */
                /* todo work queue and place it on the done queue */
                TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
                aio_anchor.lio_sync_workq_count--;
                entryp->errorval = ECANCELED;
                entryp->returnval = -1;
                if ( disable_notification )
                    entryp->flags |= AIO_DISABLE;   /* flag for special completion processing */
                result = AIO_CANCELED;

                KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_sync_workq)) | DBG_FUNC_NONE,
                              (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

                TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
                aio_anchor.aio_done_count++;

                if ( aiocbp != NULL ) {

        entryp = next_entryp;

    /*
     * look for a match on our queue of active async IO requests and
     * return AIO_NOTCANCELED result.
     */
    TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
        if ( (aiocbp == NULL && fd == 0) ||
             (aiocbp != NULL && entryp->uaiocbp == aiocbp) ||
             (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) {
            result = AIO_NOTCANCELED;

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE,
                          (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

            if ( wait_for_completion )
                entryp->flags |= AIO_WAITING;   /* flag for special completion processing */
            if ( disable_notification )
                entryp->flags |= AIO_DISABLE;   /* flag for special completion processing */
            if ( aiocbp != NULL ) {

    /*
     * if we didn't find any matches on the todo or active queues then look for a
     * match on our queue of async IO requests that have completed and if found
     * return AIO_ALLDONE result.
     */
    if ( result == -1 ) {
        TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
            if ( (aiocbp == NULL && fd == 0) ||
                 (aiocbp != NULL && entryp->uaiocbp == aiocbp) ||
                 (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) {
                result = AIO_ALLDONE;

                KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE,
                              (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

                if ( aiocbp != NULL ) {

} /* do_aio_cancel */
/*
 * aio_suspend - suspend the calling thread until at least one of the async
 * IO operations referenced by uap->aiocblist has completed, until a signal
 * interrupts the function, or uap->timeoutp time interval (optional) has
 * elapsed.
 * Returns 0 if one or more async IOs have completed else -1 and errno is
 * set appropriately - EAGAIN if timeout elapses or EINTR if an interrupt
 * occurs.
 */
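/*
 * Illustrative user-land sketch (not part of this file): waiting for one of
 * several outstanding requests with aio_suspend(2), using the optional
 * timeout described above.  The two-element list and 5 second timeout are
 * hypothetical.
 */
#if 0   /* user-land example, never compiled into the kernel */
#include <aio.h>
#include <errno.h>
#include <time.h>

static int
wait_for_either( struct aiocb *a, struct aiocb *b )
{
    const struct aiocb  *list[2] = { a, b };
    struct timespec     ts = { 5, 0 };      /* give up after 5 seconds */

    if ( aio_suspend( list, 2, &ts ) == -1 )
        return errno;       /* EAGAIN on timeout, EINTR on signal */
    return 0;               /* at least one request has completed */
}
#endif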
aio_suspend( struct proc *p, struct aio_suspend_args *uap, int *retval )
    aio_workq_entry     *entryp;
    struct aiocb *      *aiocbpp;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START,
                  (int)p, uap->nent, 0, 0, 0 );

    /* quick check to see if there are any async IO requests queued up */
    count = aio_get_all_queues_count( );
        goto ExitThisRoutine;

    if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
        goto ExitThisRoutine;

    if ( uap->timeoutp != NULL ) {
        error = copyin( (void *)uap->timeoutp, &ts, sizeof(ts) );
            goto ExitThisRoutine;

        if ( ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000 ) {
            goto ExitThisRoutine;

        nanoseconds_to_absolutetime( (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
                                     &abstime );
        clock_absolutetime_interval_to_deadline( abstime, &abstime );

    MALLOC( aiocbpp, void *, (uap->nent * sizeof(struct aiocb *)), M_TEMP, M_WAITOK );
    if ( aiocbpp == NULL ) {
        goto ExitThisRoutine;

    /* check list of aio requests to see if any have completed */
    for ( i = 0; i < uap->nent; i++ ) {
        struct aiocb    *aiocbp;

        /* copyin in aiocb pointer from list */
        error = copyin( (void *)(uap->aiocblist + i), (aiocbpp + i), sizeof(aiocbp) );
            goto ExitThisRoutine;

        /* NULL elements are legal so check for 'em */
        aiocbp = *(aiocbpp + i);
        if ( aiocbp == NULL )

        /* return immediately if any aio request in the list is done */
        TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
            if ( entryp->uaiocbp == aiocbp ) {
                goto ExitThisRoutine;
    } /* for ( ; i < uap->nent; ) */

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE,
                  (int)p, uap->nent, 0, 0, 0 );

    /*
     * wait for an async IO to complete or a signal fires or timeout expires.
     * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
     * interrupts us.  If an async IO completes before a signal fires or our
     * timeout expires, we get a wakeup call from aio_work_thread().  We do not
     * use tsleep() here in order to avoid getting kernel funnel lock.
     */
    assert_wait( (event_t) &p->AIO_SUSPEND_SLEEP_CHAN, THREAD_ABORTSAFE );
        thread_set_timer_deadline( abstime );
    error = thread_block( THREAD_CONTINUE_NULL );
    if ( error == THREAD_AWAKENED ) {
        /* got our wakeup call from aio_work_thread() */
            thread_cancel_timer();
    else if ( error == THREAD_TIMED_OUT ) {
        /* our timeout expired */
        /* we were interrupted */
        if ( abstime > 0 ) {
            thread_cancel_timer();

    if ( aiocbpp != NULL )
        FREE( aiocbpp, M_TEMP );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_END,
                  (int)p, uap->nent, error, 0, 0 );
/*
 * aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
 * file descriptor (uap->aiocbp->aio_fildes) from the buffer
 * (uap->aiocbp->aio_buf).
 */
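/*
 * Illustrative user-land sketch (not part of this file): submitting an
 * asynchronous write with aio_write(2), mirroring the aio_read example
 * earlier.  The helper name, buffer and offset are hypothetical.
 */
#if 0   /* user-land example, never compiled into the kernel */
#include <aio.h>
#include <string.h>
#include <sys/types.h>

static int
submit_write( int fd, void *buf, size_t len, off_t offset, struct aiocb *cb )
{
    memset( cb, 0, sizeof(*cb) );
    cb->aio_fildes = fd;
    cb->aio_buf    = buf;
    cb->aio_nbytes = len;
    cb->aio_offset = offset;
    return aio_write( cb );     /* 0 if queued, -1 + errno otherwise */
}
#endif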
aio_write( struct proc *p, struct aio_write_args *uap, int *retval )
    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_START,
                  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    error = aio_queue_async_request( p, uap->aiocbp, AIO_WRITE );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_END,
                  (int)p, (int)uap->aiocbp, error, 0, 0 );
/*
 * lio_listio - initiate a list of IO requests.  We process the list of aiocbs
 * either synchronously (mode == LIO_WAIT) or asynchronously (mode == LIO_NOWAIT).
 * The caller gets error and return status for each aiocb in the list via aio_error
 * and aio_return.  We must keep completed requests until released by the
 * aio_return call.
 */
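/*
 * Illustrative user-land sketch (not part of this file): batching two reads
 * with lio_listio(2) in LIO_WAIT mode, which corresponds to the lio_sync_workq
 * path below.  Buffers and sizes are hypothetical; per-request status is still
 * collected through aio_error/aio_return afterwards.
 */
#if 0   /* user-land example, never compiled into the kernel */
#include <aio.h>
#include <string.h>

static int
read_two_blocks( int fd, char *buf0, char *buf1, size_t len )
{
    struct aiocb    cb[2];
    struct aiocb    *list[2] = { &cb[0], &cb[1] };
    int             i;

    memset( cb, 0, sizeof(cb) );
    for ( i = 0; i < 2; i++ ) {
        cb[i].aio_fildes     = fd;
        cb[i].aio_nbytes     = len;
        cb[i].aio_lio_opcode = LIO_READ;
    }
    cb[0].aio_buf = buf0;   cb[0].aio_offset = 0;
    cb[1].aio_buf = buf1;   cb[1].aio_offset = len;

    /* blocks until both requests are done (or fails with -1 + errno) */
    return lio_listio( LIO_WAIT, list, 2, NULL );
}
#endif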
lio_listio( struct proc *p, struct lio_listio_args *uap, int *retval )
    aio_workq_entry *   *entryp_listp;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START,
                  (int)p, uap->nent, uap->mode, 0, 0 );

    entryp_listp = NULL;

    if ( !(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT) ) {
        call_result = EINVAL;

    if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
        call_result = EINVAL;

    /*
     * we use group_tag to mark IO requests for delayed completion processing
     * which means we wait until all IO requests in the group have completed
     * before we either return to the caller when mode is LIO_WAIT or signal
     * user when mode is LIO_NOWAIT.
     */
    group_tag = random();

    /*
     * allocate a list of aio_workq_entry pointers that we will use to queue
     * up all our requests at once while holding our lock.
     */
    MALLOC( entryp_listp, void *, (uap->nent * sizeof(struct aiocb *)), M_TEMP, M_WAITOK );
    if ( entryp_listp == NULL ) {
        call_result = EAGAIN;

    /* process list of aio requests */
    for ( i = 0; i < uap->nent; i++ ) {
        struct aiocb    *my_aiocbp;

        *(entryp_listp + i) = NULL;

        /* copyin in aiocb pointer from list */
        result = copyin( (void *)(uap->aiocblist + i), &my_aiocbp, sizeof(my_aiocbp) );
        if ( result != 0 ) {
            call_result = EAGAIN;

        /* NULL elements are legal so check for 'em */
        if ( my_aiocbp == NULL )

        if ( uap->mode == LIO_NOWAIT )
            result = lio_create_async_entry( p, my_aiocbp, uap->sigp,
                                             group_tag, (entryp_listp + i) );
            result = lio_create_sync_entry( p, my_aiocbp, group_tag,
                                            (entryp_listp + i) );

        if ( result != 0 && call_result == -1 )
            call_result = result;

    /*
     * we need to protect this section since we do not want any of these grouped
     * IO requests to begin until we have them all on the queue.
     */
    for ( i = 0; i < uap->nent; i++ ) {
        aio_workq_entry     *entryp;

        /* NULL elements are legal so check for 'em */
        entryp = *(entryp_listp + i);
        if ( entryp == NULL )

        /* check our aio limits to throttle bad or rude user land behavior */
        if ( aio_get_all_queues_count( ) >= aio_max_requests ||
             aio_get_process_count( entryp->procp ) >= aio_max_requests_per_process ||
             is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
            my_map = entryp->aio_map;
            entryp->aio_map = VM_MAP_NULL;
            aio_free_request( entryp, my_map );

        /* place the request on the appropriate queue */
        if ( uap->mode == LIO_NOWAIT ) {
            TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
            aio_anchor.aio_async_workq_count++;

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
                          (int)p, (int)entryp->uaiocbp, 0, 0, 0 );

            TAILQ_INSERT_TAIL( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
            aio_anchor.lio_sync_workq_count++;

    if ( uap->mode == LIO_NOWAIT )
        /* caller does not want to wait so we'll fire off a worker thread and return */
        wakeup_one( &aio_anchor.aio_async_workq );

        aio_workq_entry     *entryp;

        /*
         * mode is LIO_WAIT - handle the IO requests now.
         */
        entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
        while ( entryp != NULL ) {
            if ( p == entryp->procp && group_tag == entryp->group_tag ) {
                boolean_t   funnel_state;

                TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
                aio_anchor.lio_sync_workq_count--;

                // file system IO code path requires kernel funnel lock
                funnel_state = thread_funnel_set( kernel_flock, TRUE );
                if ( (entryp->flags & AIO_READ) != 0 ) {
                    error = do_aio_read( entryp );
                else if ( (entryp->flags & AIO_WRITE) != 0 ) {
                    error = do_aio_write( entryp );
                else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
                    error = do_aio_fsync( entryp );
                    printf( "%s - unknown aio request - flags 0x%02X \n",
                            __FUNCTION__, entryp->flags );

                entryp->errorval = error;
                if ( error != 0 && call_result == -1 )
                (void) thread_funnel_set( kernel_flock, funnel_state );

                /* we're done with the IO request so move it on the done queue */
                TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
                aio_anchor.aio_done_count++;
                p->aio_done_count++;

                /* need to start over since lio_sync_workq may have been changed while we */
                /* were away doing the IO. */
                entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
            } /* p == entryp->procp */

            entryp = TAILQ_NEXT( entryp, aio_workq_link );
        } /* while ( entryp != NULL ) */
    } /* uap->mode == LIO_WAIT */

    /* call_result == -1 means we had no trouble queueing up requests */
    if ( call_result == -1 ) {

    if ( entryp_listp != NULL )
        FREE( entryp_listp, M_TEMP );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END,
                  (int)p, call_result, 0, 0, 0 );

    return( call_result );
/*
 * aio worker thread.  this is where all the real work gets done.
 * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
 * after new work is queued up.
 */
aio_work_thread( void )
    aio_workq_entry     *entryp;
    struct uthread      *uthread = (struct uthread *)get_bsdthread_info(current_act());

        entryp = aio_get_some_work();
        if ( entryp == NULL ) {
            /*
             * aio worker threads wait for some work to get queued up
             * by aio_queue_async_request.  Once some work gets queued
             * it will wake up one of these worker threads just before
             * returning to our caller in user land.  We do not use
             * tsleep() here in order to avoid getting kernel funnel lock.
             */
            assert_wait( (event_t) &aio_anchor.aio_async_workq, THREAD_UNINT );
            thread_block( THREAD_CONTINUE_NULL );

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_wake)) | DBG_FUNC_NONE,
                          0, 0, 0, 0, 0 );

            boolean_t   funnel_state;
            vm_map_t    currentmap;
            vm_map_t    oldmap = VM_MAP_NULL;
            task_t      oldaiotask = TASK_NULL;

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START,
                          (int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0 );

            /*
             * Assume the target's address space identity for the duration
             * of the IO.
             */
            funnel_state = thread_funnel_set( kernel_flock, TRUE );

            currentmap = get_task_map( (current_proc())->task );
            if ( currentmap != entryp->aio_map ) {
                oldaiotask = uthread->uu_aio_task;
                uthread->uu_aio_task = entryp->procp->task;
                oldmap = vm_map_switch( entryp->aio_map );

            if ( (entryp->flags & AIO_READ) != 0 ) {
                error = do_aio_read( entryp );
            else if ( (entryp->flags & AIO_WRITE) != 0 ) {
                error = do_aio_write( entryp );
            else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
                error = do_aio_fsync( entryp );
                printf( "%s - unknown aio request - flags 0x%02X \n",
                        __FUNCTION__, entryp->flags );

            entryp->errorval = error;
            if ( currentmap != entryp->aio_map ) {
                (void) vm_map_switch( oldmap );
                uthread->uu_aio_task = oldaiotask;

            /* we're done with the IO request so pop it off the active queue and */
            /* push it on the done queue */
            TAILQ_REMOVE( &entryp->procp->aio_activeq, entryp, aio_workq_link );
            aio_anchor.aio_active_count--;
            entryp->procp->aio_active_count--;
            TAILQ_INSERT_TAIL( &entryp->procp->aio_doneq, entryp, aio_workq_link );
            aio_anchor.aio_done_count++;
            entryp->procp->aio_done_count++;
            entryp->flags |= AIO_COMPLETION;

            /* remove our reference to the user land map. */
            if ( VM_MAP_NULL != entryp->aio_map ) {
                my_map = entryp->aio_map;
                entryp->aio_map = VM_MAP_NULL;
                AIO_UNLOCK;     /* must unlock before calling vm_map_deallocate() */
                vm_map_deallocate( my_map );

            do_aio_completion( entryp );
            (void) thread_funnel_set( kernel_flock, funnel_state );

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END,
                          (int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval,
                          entryp->returnval, 0 );

            entryp->flags &= ~AIO_COMPLETION;
            if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
                my_map = entryp->aio_map;
                entryp->aio_map = VM_MAP_NULL;
                aio_free_request( entryp, my_map );

} /* aio_work_thread */
/*
 * aio_get_some_work - get the next async IO request that is ready to be executed.
 * aio_fsync complicates matters a bit since we cannot do the fsync until all async
 * IO requests at the time the aio_fsync call came in have completed.
 */
static aio_workq_entry *
aio_get_some_work( void )
    aio_workq_entry     *entryp;

    /* pop some work off the work queue and add to our active queue */
    for ( entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
          entryp != NULL;
          entryp = TAILQ_NEXT( entryp, aio_workq_link ) ) {

        if ( (entryp->flags & AIO_FSYNC) != 0 ) {
            /* leave aio_fsync calls on the work queue if there are IO */
            /* requests on the active queue for the same file descriptor. */
            if ( aio_delay_fsync_request( entryp ) ) {
                KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) | DBG_FUNC_NONE,
                              (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

    if ( entryp != NULL ) {
        TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
        aio_anchor.aio_async_workq_count--;
        TAILQ_INSERT_TAIL( &entryp->procp->aio_activeq, entryp, aio_workq_link );
        aio_anchor.aio_active_count++;
        entryp->procp->aio_active_count++;

} /* aio_get_some_work */
/*
 * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed at
 * this time.  Delay will happen when there are any active IOs for the same file
 * descriptor that were queued at the time the aio_fsync call was queued.
 * NOTE - AIO_LOCK must be held by caller
 */
aio_delay_fsync_request( aio_workq_entry *entryp )
    aio_workq_entry     *my_entryp;

    TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
        if ( my_entryp->fsyncp != NULL &&
             entryp->uaiocbp == my_entryp->fsyncp &&
             entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {

} /* aio_delay_fsync_request */
/*
 * aio_queue_async_request - queue up an async IO request on our work queue then
 * wake up one of our worker threads to do the actual work.  We get a reference
 * to our caller's user land map in order to keep it around while we are
 * processing the request.
 */
aio_queue_async_request( struct proc *procp, struct aiocb *aiocbp, int kindOfIO )
    aio_workq_entry     *entryp;

    entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
    if ( entryp == NULL ) {

    bzero( entryp, sizeof(*entryp) );

    /* fill in the rest of the aio_workq_entry */
    entryp->procp = procp;
    entryp->uaiocbp = aiocbp;
    entryp->flags |= kindOfIO;
    entryp->aio_map = VM_MAP_NULL;
    result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
    if ( result != 0 ) {

    /* do some more validation on the aiocb and embedded file descriptor */
    result = aio_validate( entryp );

    /* get a reference to the user land map in order to keep it around */
    entryp->aio_map = get_task_map( procp->task );
    vm_map_reference( entryp->aio_map );

    if ( is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {

    /* check our aio limits to throttle bad or rude user land behavior */
    if ( aio_get_all_queues_count( ) >= aio_max_requests ||
         aio_get_process_count( procp ) >= aio_max_requests_per_process ) {

    /*
     * aio_fsync calls sync up all async IO requests queued at the time
     * the aio_fsync call was made.  So we mark each currently queued async
     * IO with a matching file descriptor as must complete before we do the
     * fsync.  We set the fsyncp field of each matching async IO
     * request with the aiocb pointer passed in on the aio_fsync call to
     * know which IOs must complete before we process the aio_fsync call.
     */
    if ( (kindOfIO & AIO_FSYNC) != 0 )
        aio_mark_requests( entryp );

    /* queue up on our aio asynchronous work queue */
    TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
    aio_anchor.aio_async_workq_count++;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
                  (int)procp, (int)aiocbp, 0, 0, 0 );

    wakeup_one( &aio_anchor.aio_async_workq );

    if ( entryp != NULL ) {
        /* this entry has not been queued up so no worries about unlocked */
        /* state and aio_map */
        aio_free_request( entryp, entryp->aio_map );

} /* aio_queue_async_request */
/*
 * lio_create_async_entry - allocate an aio_workq_entry and fill it in.
 * If all goes well return 0 and pass the aio_workq_entry pointer back to
 * our caller.  We get a reference to our caller's user land map in order to keep
 * it around while we are processing the request.
 * lio_listio calls behave differently at completion: they do completion notification
 * when all async IO requests have completed.  We use group_tag to tag IO requests
 * that behave in the delay notification manner.
 */
lio_create_async_entry( struct proc *procp, struct aiocb *aiocbp,
                        struct sigevent *sigp, long group_tag,
                        aio_workq_entry **entrypp )
    aio_workq_entry     *entryp;

    entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
    if ( entryp == NULL ) {

    bzero( entryp, sizeof(*entryp) );

    /* fill in the rest of the aio_workq_entry */
    entryp->procp = procp;
    entryp->uaiocbp = aiocbp;
    entryp->flags |= AIO_LIO;
    entryp->group_tag = group_tag;
    entryp->aio_map = VM_MAP_NULL;
    result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
    if ( result != 0 ) {

    /* look for lio_listio LIO_NOP requests and ignore them. */
    /* Not really an error, but we need to free our aio_workq_entry. */
    if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {

    /* use sigevent passed in to lio_listio for each of our calls, but only */
    /* do completion notification after the last request completes. */
    if ( sigp != NULL ) {
        result = copyin( sigp, &entryp->aiocb.aio_sigevent, sizeof(entryp->aiocb.aio_sigevent) );
        if ( result != 0 ) {

    /* do some more validation on the aiocb and embedded file descriptor */
    result = aio_validate( entryp );

    /* get a reference to the user land map in order to keep it around */
    entryp->aio_map = get_task_map( procp->task );
    vm_map_reference( entryp->aio_map );

    if ( entryp != NULL )
        zfree( aio_workq_zonep, (vm_offset_t) entryp );

} /* lio_create_async_entry */
/*
 * aio_mark_requests - aio_fsync calls synchronize file data for all queued async IO
 * requests at the moment the aio_fsync call is queued.  We use aio_workq_entry.fsyncp
 * to mark each async IO that must complete before the fsync is done.  We use the uaiocbp
 * field from the aio_fsync call as the aio_workq_entry.fsyncp in marked requests.
 * NOTE - AIO_LOCK must be held by caller
 */
aio_mark_requests( aio_workq_entry *entryp )
    aio_workq_entry     *my_entryp;

    TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
        if ( entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
            my_entryp->fsyncp = entryp->uaiocbp;

    TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
        if ( entryp->procp == my_entryp->procp &&
             entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
            my_entryp->fsyncp = entryp->uaiocbp;

} /* aio_mark_requests */
/*
 * lio_create_sync_entry - allocate an aio_workq_entry and fill it in.
 * If all goes well return 0 and pass the aio_workq_entry pointer back to
 * our caller.
 * lio_listio calls behave differently at completion: they do completion notification
 * when all async IO requests have completed.  We use group_tag to tag IO requests
 * that behave in the delay notification manner.
 */
lio_create_sync_entry( struct proc *procp, struct aiocb *aiocbp,
                       long group_tag, aio_workq_entry **entrypp )
    aio_workq_entry     *entryp;

    entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
    if ( entryp == NULL ) {

    bzero( entryp, sizeof(*entryp) );

    /* fill in the rest of the aio_workq_entry */
    entryp->procp = procp;
    entryp->uaiocbp = aiocbp;
    entryp->flags |= AIO_LIO;
    entryp->group_tag = group_tag;
    entryp->aio_map = VM_MAP_NULL;
    result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
    if ( result != 0 ) {

    /* look for lio_listio LIO_NOP requests and ignore them. */
    /* Not really an error, but we need to free our aio_workq_entry. */
    if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {

    result = aio_validate( entryp );
    if ( result != 0 ) {

    if ( entryp != NULL )
        zfree( aio_workq_zonep, (vm_offset_t) entryp );

} /* lio_create_sync_entry */
/*
 * aio_free_request - remove our reference on the user land map and
 * free the work queue entry resources.
 * We are not holding the lock here; thus aio_map is passed in and was
 * zeroed while we did have the lock.
 */
aio_free_request( aio_workq_entry *entryp, vm_map_t the_map )
    /* remove our reference to the user land map. */
    if ( VM_MAP_NULL != the_map ) {
        vm_map_deallocate( the_map );

    zfree( aio_workq_zonep, (vm_offset_t) entryp );

} /* aio_free_request */
/*
 * aio_validate - validate the aiocb passed in by one of the aio syscalls.
 */
aio_validate( aio_workq_entry *entryp )
    boolean_t           funnel_state;

    if ( (entryp->flags & AIO_LIO) != 0 ) {
        if ( entryp->aiocb.aio_lio_opcode == LIO_READ )
            entryp->flags |= AIO_READ;
        else if ( entryp->aiocb.aio_lio_opcode == LIO_WRITE )
            entryp->flags |= AIO_WRITE;
        else if ( entryp->aiocb.aio_lio_opcode == LIO_NOP )

    if ( (entryp->flags & (AIO_WRITE | AIO_FSYNC)) != 0 ) {

    if ( (entryp->flags & (AIO_READ | AIO_WRITE)) != 0 ) {
        if ( entryp->aiocb.aio_offset < 0 ||
             entryp->aiocb.aio_nbytes < 0 ||
             entryp->aiocb.aio_nbytes > INT_MAX ||
             entryp->aiocb.aio_buf == NULL )

    /* validate aiocb.aio_sigevent.  at this point we only support sigev_notify
     * equal to SIGEV_SIGNAL or SIGEV_NONE.  this means sigev_value,
     * sigev_notify_function, and sigev_notify_attributes are ignored.
     */
    if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ) {
        /* make sure we have a valid signal number */
        signum = entryp->aiocb.aio_sigevent.sigev_signo;
        if ( signum <= 0 || signum >= NSIG ||
             signum == SIGKILL || signum == SIGSTOP )
    else if ( entryp->aiocb.aio_sigevent.sigev_notify != SIGEV_NONE )

    /* validate the file descriptor and that the file was opened
     * for the appropriate read / write access.  This section requires
     * kernel funnel lock.
     */
    funnel_state = thread_funnel_set( kernel_flock, TRUE );

    result = fdgetf( entryp->procp, entryp->aiocb.aio_fildes, &fp );
    if ( result == 0 ) {
        if ( (fp->f_flag & flag) == 0 ) {
            /* we don't have read or write access */
        else if ( fp->f_type != DTYPE_VNODE ) {
            /* this is not a file */

    (void) thread_funnel_set( kernel_flock, funnel_state );

} /* aio_validate */
/*
 * aio_get_process_count - runs through our queues that hold outstanding
 * async IO requests and totals up the number of requests for the given
 * process.
 * NOTE - caller must hold aio lock!
 */
aio_get_process_count( struct proc *procp )
    aio_workq_entry     *entryp;

    /* begin with count of completed async IO requests for this process */
    count = procp->aio_done_count;

    /* add in count of active async IO requests for this process */
    count += procp->aio_active_count;

    /* look for matches on our queue of asynchronous todo work */
    TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
        if ( procp == entryp->procp ) {

    /* look for matches on our queue of synchronous todo work */
    TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
        if ( procp == entryp->procp ) {

} /* aio_get_process_count */

/*
 * aio_get_all_queues_count - get total number of entries on all aio work queues.
 * NOTE - caller must hold aio lock!
 */
aio_get_all_queues_count( void )
    count = aio_anchor.aio_async_workq_count;
    count += aio_anchor.lio_sync_workq_count;
    count += aio_anchor.aio_active_count;
    count += aio_anchor.aio_done_count;

} /* aio_get_all_queues_count */
/*
 * do_aio_completion.  Handle async IO completion.
 */
do_aio_completion( aio_workq_entry *entryp )
    /* signal user land process if appropriate */
    if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
         (entryp->flags & AIO_DISABLE) == 0 ) {
        /*
         * if group_tag is non zero then make sure this is the last IO request
         * in the group before we signal.
         */
        if ( entryp->group_tag == 0 ||
             (entryp->group_tag != 0 && aio_last_group_io( entryp )) ) {
            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) | DBG_FUNC_NONE,
                          (int)entryp->procp, (int)entryp->uaiocbp,
                          entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 );

            psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );

    /*
     * need to handle case where a process is trying to exit, exec, or close
     * and is currently waiting for active aio requests to complete.  If
     * AIO_WAITING is set then we need to look to see if there are any
     * other requests in the active queue for this process.  If there are
     * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel.  If
     * there are some still active then do nothing - we only want to wakeup
     * when all active aio requests for the process are complete.
     */
    if ( (entryp->flags & AIO_WAITING) != 0 ) {
        int     active_requests;

        KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
                      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

        active_requests = aio_active_requests_for_process( entryp->procp );
        if ( active_requests < 1 ) {
            /* no active aio requests for this process, continue exiting */
            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
                          (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

            wakeup_one( &entryp->procp->AIO_CLEANUP_SLEEP_CHAN );

    /*
     * This wakeup covers the aio_suspend case when a signal was not requested.
     * In that scenario we are sleeping on the AIO_SUSPEND_SLEEP_CHAN channel.
     * NOTE - the assumption here is that this wakeup call is inexpensive.
     * we really only need to do this when an aio_suspend call is pending.
     * If we find the wakeup call should be avoided we could mark the
     * async IO requests given in the list provided by aio_suspend and only
     * call wakeup for them.  If we do mark them we should unmark them after
     * the aio_suspend wakes up.
     */
    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE,
                  (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

    wakeup_one( &entryp->procp->AIO_SUSPEND_SLEEP_CHAN );

} /* do_aio_completion */
/*
 * aio_last_group_io - checks to see if this is the last unfinished IO request
 * for the given group_tag.  Returns TRUE if there are no other active IO
 * requests for this group or FALSE if there are active IO requests.
 * NOTE - AIO_LOCK must be held by caller
 */
aio_last_group_io( aio_workq_entry *entryp )
    aio_workq_entry     *my_entryp;

    /* look for matches on our queue of active async IO requests */
    TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
        if ( my_entryp->group_tag == entryp->group_tag )

    /* look for matches on our queue of asynchronous todo work */
    TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
        if ( my_entryp->group_tag == entryp->group_tag )

    /* look for matches on our queue of synchronous todo work */
    TAILQ_FOREACH( my_entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
        if ( my_entryp->group_tag == entryp->group_tag )

} /* aio_last_group_io */

do_aio_read( aio_workq_entry *entryp )
    fp = holdfp( entryp->procp->p_fd, entryp->aiocb.aio_fildes, FREAD );
    error = dofileread( entryp->procp, fp, entryp->aiocb.aio_fildes,
                        (void *)entryp->aiocb.aio_buf,
                        entryp->aiocb.aio_nbytes,
                        entryp->aiocb.aio_offset, FOF_OFFSET,
                        &entryp->returnval );

do_aio_write( aio_workq_entry *entryp )
    fp = holdfp( entryp->procp->p_fd, entryp->aiocb.aio_fildes, FWRITE );
    error = dofilewrite( entryp->procp, fp, entryp->aiocb.aio_fildes,
                         (const void *)entryp->aiocb.aio_buf,
                         entryp->aiocb.aio_nbytes,
                         entryp->aiocb.aio_offset, FOF_OFFSET,
                         &entryp->returnval );

} /* do_aio_write */

/*
 * aio_active_requests_for_process - return number of active async IO
 * requests for the given process.
 * NOTE - caller must hold aio lock!
 */
aio_active_requests_for_process( struct proc *procp )
    return( procp->aio_active_count );

} /* aio_active_requests_for_process */
do_aio_fsync( aio_workq_entry *entryp )
    register struct vnode   *vp;

    /*
     * NOTE - we will not support AIO_DSYNC until fdatasync() is supported.
     * AIO_DSYNC is caught before we queue up a request and flagged as an error.
     * The following was shamelessly extracted from fsync() implementation.
     */
    error = getvnode( entryp->procp, entryp->aiocb.aio_fildes, &fp );
        vp = (struct vnode *)fp->f_data;
        vn_lock( vp, LK_EXCLUSIVE | LK_RETRY, entryp->procp );
        error = VOP_FSYNC( vp, fp->f_cred, MNT_WAIT, entryp->procp );
        VOP_UNLOCK( vp, 0, entryp->procp );

        entryp->returnval = -1;

} /* do_aio_fsync */
/*
 * is_already_queued - runs through our queues to see if the given
 * aiocbp / process is there.  Returns TRUE if there is a match
 * on any of our aio queues.
 * NOTE - callers must hold aio lock!
 */
is_already_queued( struct proc *procp,
                   struct aiocb *aiocbp )
    aio_workq_entry     *entryp;

    /* look for matches on our queue of async IO requests that have completed */
    TAILQ_FOREACH( entryp, &procp->aio_doneq, aio_workq_link ) {
        if ( aiocbp == entryp->uaiocbp ) {
            goto ExitThisRoutine;

    /* look for matches on our queue of active async IO requests */
    TAILQ_FOREACH( entryp, &procp->aio_activeq, aio_workq_link ) {
        if ( aiocbp == entryp->uaiocbp ) {
            goto ExitThisRoutine;

    /* look for matches on our queue of asynchronous todo work */
    TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
        if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
            goto ExitThisRoutine;

    /* look for matches on our queue of synchronous todo work */
    TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
        if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
            goto ExitThisRoutine;

} /* is_already_queued */
/*
 * aio initialization
 */
__private_extern__ void
aio_init( void )
    simple_lock_init( &aio_lock );

    TAILQ_INIT( &aio_anchor.aio_async_workq );
    TAILQ_INIT( &aio_anchor.lio_sync_workq );
    aio_anchor.aio_async_workq_count = 0;
    aio_anchor.lio_sync_workq_count = 0;
    aio_anchor.aio_active_count = 0;
    aio_anchor.aio_done_count = 0;

    i = sizeof( aio_workq_entry );
    aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );

    _aio_create_worker_threads( aio_worker_threads );

/*
 * aio worker threads created here.
 */
__private_extern__ void
_aio_create_worker_threads( int num )
    /* create some worker threads to handle the async IO requests */
    for ( i = 0; i < num; i++ ) {
        myThread = kernel_thread( kernel_task, aio_work_thread );
        if ( THREAD_NULL == myThread ) {
            printf( "%s - failed to create a work thread \n", __FUNCTION__ );

} /* _aio_create_worker_threads */

/*
 * Return the current activation utask
 */
    return ((struct uthread *)get_bsdthread_info(current_act()))->uu_aio_task;