1 /*
2 * Copyright (c) 2003-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24
25 /*
26 * todo:
27 * 1) ramesh is looking into how to replace taking a reference on
28 * the user's map (vm_map_reference()) since it is believed that
29 * it would not hold the process for us.
30 * 2) david is looking into a way for us to set the priority of the
31 * worker threads to match that of the user's thread when the
32 * async IO was queued.
33 */
34
35
36 /*
37 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
38 */
39
40 #include <sys/systm.h>
41 #include <sys/fcntl.h>
42 #include <sys/file_internal.h>
43 #include <sys/filedesc.h>
44 #include <sys/kernel.h>
45 #include <sys/vnode_internal.h>
46 #include <sys/malloc.h>
47 #include <sys/mount_internal.h>
48 #include <sys/param.h>
49 #include <sys/proc_internal.h>
50 #include <sys/sysctl.h>
51 #include <sys/unistd.h>
52 #include <sys/user.h>
53
54 #include <sys/aio_kern.h>
55 #include <sys/sysproto.h>
56
57 #include <machine/limits.h>
58
59 #include <mach/mach_types.h>
60 #include <kern/kern_types.h>
61 #include <kern/zalloc.h>
62 #include <kern/task.h>
63 #include <kern/sched_prim.h>
64
65 #include <vm/vm_map.h>
66
67 #include <sys/kdebug.h>
68 #define AIO_work_queued 1
69 #define AIO_worker_wake 2
70 #define AIO_completion_sig 3
71 #define AIO_completion_cleanup_wait 4
72 #define AIO_completion_cleanup_wake 5
73 #define AIO_completion_suspend_wake 6
74 #define AIO_fsync_delay 7
75 #define AIO_cancel 10
76 #define AIO_cancel_async_workq 11
77 #define AIO_cancel_sync_workq 12
78 #define AIO_cancel_activeq 13
79 #define AIO_cancel_doneq 14
80 #define AIO_fsync 20
81 #define AIO_read 30
82 #define AIO_write 40
83 #define AIO_listio 50
84 #define AIO_error 60
85 #define AIO_error_val 61
86 #define AIO_error_activeq 62
87 #define AIO_error_workq 63
88 #define AIO_return 70
89 #define AIO_return_val 71
90 #define AIO_return_activeq 72
91 #define AIO_return_workq 73
92 #define AIO_exec 80
93 #define AIO_exit 90
94 #define AIO_exit_sleep 91
95 #define AIO_close 100
96 #define AIO_close_sleep 101
97 #define AIO_suspend 110
98 #define AIO_suspend_sleep 111
99 #define AIO_worker_thread 120
100
101 #if 0
102 #undef KERNEL_DEBUG
103 #define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
104 #endif
105
106 /*
107 * aio requests queue up on the aio_async_workq or lio_sync_workq (for
108 * lio_listio LIO_WAIT). Requests then move to the per process aio_activeq
109 * (proc.aio_activeq) when one of our worker threads starts the IO.
110 * And finally, requests move to the per process aio_doneq (proc.aio_doneq)
111 * when the IO request completes. The request remains on aio_doneq until
112 * the user process calls aio_return or the process exits; either way, that is our
113 * trigger to release aio resources.
114 */
115 struct aio_anchor_cb
116 {
117 int aio_async_workq_count; /* entries on aio_async_workq */
118 int lio_sync_workq_count; /* entries on lio_sync_workq */
119 int aio_active_count; /* entries on all active queues (proc.aio_activeq) */
120 int aio_done_count; /* entries on all done queues (proc.aio_doneq) */
121 TAILQ_HEAD( , aio_workq_entry ) aio_async_workq;
122 TAILQ_HEAD( , aio_workq_entry ) lio_sync_workq;
123 };
124 typedef struct aio_anchor_cb aio_anchor_cb;
125
126
127 /*
128 * Notes on aio sleep / wake channels.
129 * We currently pick a couple of fields within the proc structure to use as
130 * sleep channels that do not collide with any other kernel routines.
131 * At this time, for binary compatibility reasons, we cannot create new proc fields.
132 */
133 #define AIO_SUSPEND_SLEEP_CHAN p_estcpu
134 #define AIO_CLEANUP_SLEEP_CHAN p_pctcpu
135
136
137 /*
138 * async IO locking macros used to protect critical sections.
139 */
140 #define AIO_LOCK lck_mtx_lock(aio_lock)
141 #define AIO_UNLOCK lck_mtx_unlock(aio_lock)
142
143
144 /*
145 * LOCAL PROTOTYPES
146 */
147 static int aio_active_requests_for_process( struct proc *procp );
148 static boolean_t aio_delay_fsync_request( aio_workq_entry *entryp );
149 static int aio_free_request( aio_workq_entry *entryp, vm_map_t the_map );
150 static int aio_get_all_queues_count( void );
151 static int aio_get_process_count( struct proc *procp );
152 static aio_workq_entry * aio_get_some_work( void );
153 static boolean_t aio_last_group_io( aio_workq_entry *entryp );
154 static void aio_mark_requests( aio_workq_entry *entryp );
155 static int aio_queue_async_request( struct proc *procp,
156 user_addr_t aiocbp,
157 int kindOfIO );
158 static int aio_validate( aio_workq_entry *entryp );
159 static void aio_work_thread( void );
160 static int do_aio_cancel( struct proc *p,
161 int fd,
162 user_addr_t aiocbp,
163 boolean_t wait_for_completion,
164 boolean_t disable_notification );
165 static void do_aio_completion( aio_workq_entry *entryp );
166 static int do_aio_fsync( aio_workq_entry *entryp );
167 static int do_aio_read( aio_workq_entry *entryp );
168 static int do_aio_write( aio_workq_entry *entryp );
169 static void do_munge_aiocb( struct aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
170 static boolean_t is_already_queued( struct proc *procp,
171 user_addr_t aiocbp );
172 static int lio_create_async_entry( struct proc *procp,
173 user_addr_t aiocbp,
174 user_addr_t sigp,
175 long group_tag,
176 aio_workq_entry **entrypp );
177 static int lio_create_sync_entry( struct proc *procp,
178 user_addr_t aiocbp,
179 long group_tag,
180 aio_workq_entry **entrypp );
181
182
183 /*
184 * EXTERNAL PROTOTYPES
185 */
186
187 /* in ...bsd/kern/sys_generic.c */
188 extern int dofileread( struct proc *p, struct fileproc *fp, int fd,
189 user_addr_t bufp, user_size_t nbyte,
190 off_t offset, int flags, user_ssize_t *retval );
191 extern int dofilewrite( struct proc *p, struct fileproc *fp, int fd,
192 user_addr_t bufp, user_size_t nbyte, off_t offset,
193 int flags, user_ssize_t *retval );
194
195 /*
196 * aio external global variables.
197 */
198 extern int aio_max_requests; /* AIO_MAX - configurable */
199 extern int aio_max_requests_per_process; /* AIO_PROCESS_MAX - configurable */
200 extern int aio_worker_threads; /* AIO_THREAD_COUNT - configurable */
201
202
203 /*
204 * aio static variables.
205 */
206 static aio_anchor_cb aio_anchor;
207 static lck_mtx_t * aio_lock;
208 static lck_grp_t * aio_lock_grp;
209 static lck_attr_t * aio_lock_attr;
210 static lck_grp_attr_t * aio_lock_grp_attr;
211 static struct zone *aio_workq_zonep;
212
213
214
215
216 /*
217 * aio_cancel - attempt to cancel one or more async IO requests currently
218 * outstanding against file descriptor uap->fd. If uap->aiocbp is not
219 * NULL then only one specific IO is cancelled (if possible). If uap->aiocbp
220 * is NULL then all outstanding async IO requests for the given file
221 * descriptor are cancelled (if possible).
222 */
223
224 int
225 aio_cancel( struct proc *p, struct aio_cancel_args *uap, int *retval )
226 {
227 struct user_aiocb my_aiocb;
228 int result;
229
230 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START,
231 (int)p, (int)uap->aiocbp, 0, 0, 0 );
232
233 /* quick check to see if there are any async IO requests queued up */
234 AIO_LOCK;
235 result = aio_get_all_queues_count( );
236 AIO_UNLOCK;
237 if ( result < 1 ) {
238 result = EBADF;
239 goto ExitRoutine;
240 }
241
242 *retval = -1;
243 if ( uap->aiocbp != USER_ADDR_NULL ) {
244 if ( !IS_64BIT_PROCESS(p) ) {
245 struct aiocb aiocb32;
246
247 result = copyin( uap->aiocbp, &aiocb32, sizeof(aiocb32) );
248 if ( result == 0 )
249 do_munge_aiocb( &aiocb32, &my_aiocb );
250 } else
251 result = copyin( uap->aiocbp, &my_aiocb, sizeof(my_aiocb) );
252
253 if ( result != 0 ) {
254 result = EAGAIN;
255 goto ExitRoutine;
256 }
257
258 /* NOTE - POSIX standard says a mismatch between the file */
259 /* descriptor passed in and the file descriptor embedded in */
260 /* the aiocb causes unspecified results. We return EBADF in */
261 /* that situation. */
262 if ( uap->fd != my_aiocb.aio_fildes ) {
263 result = EBADF;
264 goto ExitRoutine;
265 }
266 }
267 result = do_aio_cancel( p, uap->fd, uap->aiocbp, FALSE, FALSE );
268
269 if ( result != -1 ) {
270 *retval = result;
271 result = 0;
272 goto ExitRoutine;
273 }
274
275 result = EBADF;
276
277 ExitRoutine:
278 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END,
279 (int)p, (int)uap->aiocbp, result, 0, 0 );
280
281 return( result );
282
283 } /* aio_cancel */
284
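/*
 * Illustrative user-land sketch (not part of this kernel file; the #if 0
 * block is never compiled). It shows how a caller might drive aio_cancel()
 * against the semantics above and react to the three completion codes.
 * The fd, aiocb, and helper name are assumptions made only for the example.
 */
#if 0
#include <aio.h>
#include <errno.h>
#include <stdio.h>

/* attempt to cancel one outstanding request; fall back to waiting for it */
static void
cancel_or_wait( int fd, struct aiocb *cbp )
{
	switch ( aio_cancel( fd, cbp ) ) {
	case AIO_CANCELED:
		/* request was pulled off the work queue; reap it - the */
		/* error status will be ECANCELED */
		(void) aio_return( cbp );
		break;
	case AIO_NOTCANCELED: {
		/* already active in a worker thread - wait for it to finish */
		const struct aiocb *list[ 1 ] = { cbp };
		while ( aio_error( cbp ) == EINPROGRESS )
			(void) aio_suspend( list, 1, NULL );
		(void) aio_return( cbp );
		break;
	}
	case AIO_ALLDONE:
		/* request had already completed; just reap it */
		(void) aio_return( cbp );
		break;
	default:
		perror( "aio_cancel" );
	}
}
#endif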
285
286 /*
287 * _aio_close - internal function used to clean up async IO requests for
288 * a file descriptor that is closing.
289 * THIS MAY BLOCK.
290 */
291
292 __private_extern__ void
293 _aio_close( struct proc *p, int fd )
294 {
295 int error, count;
296
297 /* quick check to see if there are any async IO requests queued up */
298 AIO_LOCK;
299 count = aio_get_all_queues_count( );
300 AIO_UNLOCK;
301 if ( count < 1 )
302 return;
303
304 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_START,
305 (int)p, fd, 0, 0, 0 );
306
307 /* cancel all async IO requests on our todo queues for this file descriptor */
308 error = do_aio_cancel( p, fd, 0, TRUE, FALSE );
309 if ( error == AIO_NOTCANCELED ) {
310 /*
311 * AIO_NOTCANCELED is returned when we find an aio request for this process
312 * and file descriptor on the active async IO queue. Active requests cannot
313 * be cancelled so we must wait for them to complete. We will get a special
314 * wake up call on our channel used to sleep for ALL active requests to
315 * complete. This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
316 * when we must wait for all active aio requests.
317 */
318
319 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) | DBG_FUNC_NONE,
320 (int)p, fd, 0, 0, 0 );
321
322 tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_close", 0 );
323 }
324
325 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_END,
326 (int)p, fd, 0, 0, 0 );
327
328 return;
329
330 } /* _aio_close */
331
332
333 /*
334 * aio_error - return the error status associated with the async IO
335 * request referred to by uap->aiocbp. The error status is the errno
336 * value that would be set by the corresponding IO request (read, write,
337 * fdatasync, or fsync).
338 */
339
340 int
341 aio_error( struct proc *p, struct aio_error_args *uap, int *retval )
342 {
343 aio_workq_entry *entryp;
344 int error;
345
346 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_START,
347 (int)p, (int)uap->aiocbp, 0, 0, 0 );
348
349 AIO_LOCK;
350
351 /* quick check to see if there are any async IO requests queued up */
352 if ( aio_get_all_queues_count( ) < 1 ) {
353 error = EINVAL;
354 goto ExitRoutine;
355 }
356
357 /* look for a match on our queue of async IO requests that have completed */
358 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
359 if ( entryp->uaiocbp == uap->aiocbp ) {
360 *retval = entryp->errorval;
361 error = 0;
362 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) | DBG_FUNC_NONE,
363 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
364 goto ExitRoutine;
365 }
366 }
367
368 /* look for a match on our queue of active async IO requests */
369 TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
370 if ( entryp->uaiocbp == uap->aiocbp ) {
371 *retval = EINPROGRESS;
372 error = 0;
373 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) | DBG_FUNC_NONE,
374 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
375 goto ExitRoutine;
376 }
377 }
378
379 /* look for a match on our queue of todo work */
380 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
381 if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
382 *retval = EINPROGRESS;
383 error = 0;
384 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_workq)) | DBG_FUNC_NONE,
385 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
386 goto ExitRoutine;
387 }
388 }
389 error = EINVAL;
390
391 ExitRoutine:
392 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_END,
393 (int)p, (int)uap->aiocbp, error, 0, 0 );
394 AIO_UNLOCK;
395
396 return( error );
397
398 } /* aio_error */
399
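/*
 * Illustrative user-land sketch (not part of this kernel file; the #if 0
 * block is never compiled). It shows the aio_error / aio_return protocol
 * described above: poll aio_error() until it stops returning EINPROGRESS,
 * then call aio_return() exactly once so the kernel can release the
 * request. The helper name is an assumption made only for the example.
 */
#if 0
#include <aio.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>

/* reap one request: poll its status, then collect the result once */
static ssize_t
reap_request( struct aiocb *cbp )
{
	int err;

	/* still on the todo or active queue while this returns EINPROGRESS; */
	/* a real caller would sleep in aio_suspend() instead of spinning */
	while ( (err = aio_error( cbp )) == EINPROGRESS )
		;

	if ( err != 0 )
		fprintf( stderr, "async IO failed: %s\n", strerror( err ) );

	/* aio_return() is what lets the kernel free its aio_workq_entry */
	return( aio_return( cbp ) );
}
#endif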
400
401 /*
402 * aio_fsync - asynchronously force all IO operations associated
403 * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
404 * queued at the time of the call to the synchronized completion state.
405 * NOTE - we do not support op O_DSYNC at this point since we do not support the
406 * fdatasync() call.
407 */
408
409 int
410 aio_fsync( struct proc *p, struct aio_fsync_args *uap, int *retval )
411 {
412 int error;
413 int fsync_kind;
414
415 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_START,
416 (int)p, (int)uap->aiocbp, uap->op, 0, 0 );
417
418 *retval = 0;
419 /* 0 := O_SYNC for binary backward compatibility with Panther */
420 if (uap->op == O_SYNC || uap->op == 0)
421 fsync_kind = AIO_FSYNC;
422 #if 0 // we don't support fdatasync() call yet
423 else if ( uap->op == O_DSYNC )
424 fsync_kind = AIO_DSYNC;
425 #endif
426 else {
427 *retval = -1;
428 error = EINVAL;
429 goto ExitRoutine;
430 }
431
432 error = aio_queue_async_request( p, uap->aiocbp, fsync_kind );
433 if ( error != 0 )
434 *retval = -1;
435
436 ExitRoutine:
437 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_END,
438 (int)p, (int)uap->aiocbp, error, 0, 0 );
439
440 return( error );
441
442 } /* aio_fsync */
443
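/*
 * Illustrative user-land sketch (not part of this kernel file; the #if 0
 * block is never compiled). It queues an asynchronous fsync as described
 * above; note that this implementation only accepts O_SYNC (or 0), and
 * O_DSYNC is rejected with EINVAL. The fd and helper name are assumptions
 * made only for the example.
 */
#if 0
#include <aio.h>
#include <fcntl.h>
#include <signal.h>
#include <string.h>

/* queue an async fsync; IO queued before this call gets synchronized */
static int
queue_async_fsync( int fd, struct aiocb *cbp )
{
	memset( cbp, 0, sizeof(*cbp) );
	cbp->aio_fildes = fd;
	cbp->aio_sigevent.sigev_notify = SIGEV_NONE;

	return( aio_fsync( O_SYNC, cbp ) );
}
#endif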
444
445 /* aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
446 * file descriptor (uap->aiocbp->aio_fildes) into the buffer
447 * (uap->aiocbp->aio_buf).
448 */
449
450 int
451 aio_read( struct proc *p, struct aio_read_args *uap, int *retval )
452 {
453 int error;
454
455 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_START,
456 (int)p, (int)uap->aiocbp, 0, 0, 0 );
457
458 *retval = 0;
459
460 error = aio_queue_async_request( p, uap->aiocbp, AIO_READ );
461 if ( error != 0 )
462 *retval = -1;
463
464 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_END,
465 (int)p, (int)uap->aiocbp, error, 0, 0 );
466
467 return( error );
468
469 } /* aio_read */
470
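/*
 * Illustrative user-land sketch (not part of this kernel file; the #if 0
 * block is never compiled). It fills in the aiocb fields this path copies
 * in and validates (aio_fildes, aio_buf, aio_nbytes, aio_offset and
 * aio_sigevent) and queues the read. The path name and buffer size are
 * assumptions made only for the example.
 */
#if 0
#include <aio.h>
#include <fcntl.h>
#include <signal.h>
#include <string.h>

static char read_buffer[ 4096 ];
static struct aiocb read_cb;

/* queue an async read of the first 4KB of a (hypothetical) file */
static int
queue_async_read( void )
{
	int fd = open( "/tmp/example.dat", O_RDONLY );	/* path is hypothetical */
	if ( fd < 0 )
		return( -1 );

	memset( &read_cb, 0, sizeof(read_cb) );
	read_cb.aio_fildes = fd;
	read_cb.aio_buf = read_buffer;
	read_cb.aio_nbytes = sizeof(read_buffer);
	read_cb.aio_offset = 0;
	read_cb.aio_sigevent.sigev_notify = SIGEV_NONE;

	/* returns 0 if queued, or -1 with errno set (EAGAIN, EBADF, ...) */
	return( aio_read( &read_cb ) );
}
#endif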
471
472 /*
473 * aio_return - return the return status associated with the async IO
474 * request referred to by uap->aiocbp. The return status is the value
475 * that would be returned by the corresponding IO request (read, write,
476 * fdatasync, or fsync). This is where we release kernel resources
477 * held for async IO call associated with the given aiocb pointer.
478 */
479
480 int
481 aio_return( struct proc *p, struct aio_return_args *uap, user_ssize_t *retval )
482 {
483 aio_workq_entry *entryp;
484 int error;
485 boolean_t lock_held;
486
487 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_START,
488 (int)p, (int)uap->aiocbp, 0, 0, 0 );
489
490 AIO_LOCK;
491 lock_held = TRUE;
492 *retval = 0;
493
494 /* quick check to see if there are any async IO requests queued up */
495 if ( aio_get_all_queues_count( ) < 1 ) {
496 error = EINVAL;
497 goto ExitRoutine;
498 }
499
500 /* look for a match on our queue of async IO requests that have completed */
501 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
502 if ( entryp->uaiocbp == uap->aiocbp ) {
503 TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
504 aio_anchor.aio_done_count--;
505 p->aio_done_count--;
506
507 *retval = entryp->returnval;
508
509 /* we cannot free requests that are still completing */
510 if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
511 vm_map_t my_map;
512
513 my_map = entryp->aio_map;
514 entryp->aio_map = VM_MAP_NULL;
515 AIO_UNLOCK;
516 lock_held = FALSE;
517 aio_free_request( entryp, my_map );
518 }
519 else
520 /* tell completion code to free this request */
521 entryp->flags |= AIO_DO_FREE;
522 error = 0;
523 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) | DBG_FUNC_NONE,
524 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
525 goto ExitRoutine;
526 }
527 }
528
529 /* look for a match on our queue of active async IO requests */
530 TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
531 if ( entryp->uaiocbp == uap->aiocbp ) {
532 error = EINPROGRESS;
533 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) | DBG_FUNC_NONE,
534 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
535 goto ExitRoutine;
536 }
537 }
538
539 /* look for a match on our queue of todo work */
540 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
541 if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
542 error = EINPROGRESS;
543 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_workq)) | DBG_FUNC_NONE,
544 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
545 goto ExitRoutine;
546 }
547 }
548 error = EINVAL;
549
550 ExitRoutine:
551 if ( lock_held )
552 AIO_UNLOCK;
553 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_END,
554 (int)p, (int)uap->aiocbp, error, 0, 0 );
555
556 return( error );
557
558 } /* aio_return */
559
560
561 /*
562 * _aio_exec - internal function used to clean up async IO requests for
563 * a process that is going away due to exec(). We cancel any async IOs
564 * we can and wait for those already active. We also disable signaling
565 * for cancelled or active aio requests that complete.
566 * This routine MAY block!
567 */
568
569 __private_extern__ void
570 _aio_exec( struct proc *p )
571 {
572
573 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_START,
574 (int)p, 0, 0, 0, 0 );
575
576 _aio_exit( p );
577
578 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_END,
579 (int)p, 0, 0, 0, 0 );
580
581 return;
582
583 } /* _aio_exec */
584
585
586 /*
587 * _aio_exit - internal function used to clean up async IO requests for
588 * a process that is terminating (via exit() or exec() ). We cancel any async IOs
589 * we can and wait for those already active. We also disable signaling
590 * for cancelled or active aio requests that complete. This routine MAY block!
591 */
592
593 __private_extern__ void
594 _aio_exit( struct proc *p )
595 {
596 int error, count;
597 aio_workq_entry *entryp;
598
599 /* quick check to see if there are any async IO requests queued up */
600 AIO_LOCK;
601 count = aio_get_all_queues_count( );
602 AIO_UNLOCK;
603 if ( count < 1 ) {
604 return;
605 }
606
607 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_START,
608 (int)p, 0, 0, 0, 0 );
609
610 /*
611 * cancel async IO requests on the todo work queue and wait for those
612 * already active to complete.
613 */
614 error = do_aio_cancel( p, 0, 0, TRUE, TRUE );
615 if ( error == AIO_NOTCANCELED ) {
616 /*
617 * AIO_NOTCANCELED is returned when we find an aio request for this process
618 * on the active async IO queue. Active requests cannot be cancelled so we
619 * must wait for them to complete. We will get a special wake up call on
620 * our channel used to sleep for ALL active requests to complete. This sleep
621 * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
622 * active aio requests.
623 */
624
625 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) | DBG_FUNC_NONE,
626 (int)p, 0, 0, 0, 0 );
627
628 tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_exit", 0 );
629 }
630
631 /* release all aio resources used by this process */
632 AIO_LOCK;
633 entryp = TAILQ_FIRST( &p->aio_doneq );
634 while ( entryp != NULL ) {
635 aio_workq_entry *next_entryp;
636
637 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
638 TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
639 aio_anchor.aio_done_count--;
640 p->aio_done_count--;
641
642 /* we cannot free requests that are still completing */
643 if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
644 vm_map_t my_map;
645
646 my_map = entryp->aio_map;
647 entryp->aio_map = VM_MAP_NULL;
648 AIO_UNLOCK;
649 aio_free_request( entryp, my_map );
650
651 /* need to start over since aio_doneq may have been */
652 /* changed while we were away. */
653 AIO_LOCK;
654 entryp = TAILQ_FIRST( &p->aio_doneq );
655 continue;
656 }
657 else
658 /* tell completion code to free this request */
659 entryp->flags |= AIO_DO_FREE;
660 entryp = next_entryp;
661 }
662 AIO_UNLOCK;
663
664 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END,
665 (int)p, 0, 0, 0, 0 );
666
667 return;
668
669 } /* _aio_exit */
670
671
672 /*
673 * do_aio_cancel - cancel async IO requests (if possible). We get called by
674 * aio_cancel, close, and at exit.
675 * There are three modes of operation: 1) cancel all async IOs for a process -
676 * fd is 0 and aiocbp is NULL; 2) cancel all async IOs for a file descriptor - fd
677 * is > 0 and aiocbp is NULL; 3) cancel one async IO associated with the given
678 * aiocbp.
679 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
680 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
681 * target async IO requests, and AIO_ALLDONE if all target async IO requests
682 * were already complete.
683 * WARNING - do not dereference aiocbp in this routine; it may point to user
684 * land data that has not been copied in (when called from aio_cancel() )
685 */
686
687 static int
688 do_aio_cancel( struct proc *p, int fd, user_addr_t aiocbp,
689 boolean_t wait_for_completion, boolean_t disable_notification )
690 {
691 aio_workq_entry *entryp;
692 int result;
693
694 result = -1;
695
696 /* look for a match on our queue of async todo work. */
697 AIO_LOCK;
698 entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
699 while ( entryp != NULL ) {
700 aio_workq_entry *next_entryp;
701
702 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
703 if ( p == entryp->procp ) {
704 if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
705 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
706 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
707 /* we found a match so we remove the entry from the */
708 /* todo work queue and place it on the done queue */
709 TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
710 aio_anchor.aio_async_workq_count--;
711 entryp->errorval = ECANCELED;
712 entryp->returnval = -1;
713 if ( disable_notification )
714 entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
715 result = AIO_CANCELED;
716
717 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) | DBG_FUNC_NONE,
718 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
719
720 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
721 aio_anchor.aio_done_count++;
722 p->aio_done_count++;
723 entryp->flags |= AIO_COMPLETION;
724 AIO_UNLOCK;
725
726 /* do completion processing for this request */
727 do_aio_completion( entryp );
728
729 AIO_LOCK;
730 entryp->flags &= ~AIO_COMPLETION;
731 if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
732 vm_map_t my_map;
733
734 my_map = entryp->aio_map;
735 entryp->aio_map = VM_MAP_NULL;
736 AIO_UNLOCK;
737 aio_free_request( entryp, my_map );
738 }
739 else
740 AIO_UNLOCK;
741
742 if ( aiocbp != USER_ADDR_NULL ) {
743 return( result );
744 }
745
746 /* need to start over since aio_async_workq may have been */
747 /* changed while we were away doing completion processing. */
748 AIO_LOCK;
749 entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
750 continue;
751 }
752 }
753 entryp = next_entryp;
754 } /* while... */
755
756 /*
757 * look for a match on our queue of synchronous todo work. This will
758 * be a rare occurrence but could happen if a process is terminated while
759 * processing a lio_listio call.
760 */
761 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
762 while ( entryp != NULL ) {
763 aio_workq_entry *next_entryp;
764
765 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
766 if ( p == entryp->procp ) {
767 if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
768 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
769 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
770 /* we found a match so we remove the entry from the */
771 /* todo work queue and place it on the done queue */
772 TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
773 aio_anchor.lio_sync_workq_count--;
774 entryp->errorval = ECANCELED;
775 entryp->returnval = -1;
776 if ( disable_notification )
777 entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
778 result = AIO_CANCELED;
779
780 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_sync_workq)) | DBG_FUNC_NONE,
781 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
782
783 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
784 aio_anchor.aio_done_count++;
785 p->aio_done_count++;
786 if ( aiocbp != USER_ADDR_NULL ) {
787 AIO_UNLOCK;
788 return( result );
789 }
790 }
791 }
792 entryp = next_entryp;
793 } /* while... */
794
795 /*
796 * look for a match on our queue of active async IO requests and
797 * return AIO_NOTCANCELED result.
798 */
799 TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
800 if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
801 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
802 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
803 result = AIO_NOTCANCELED;
804
805 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE,
806 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
807
808 if ( wait_for_completion )
809 entryp->flags |= AIO_WAITING; /* flag for special completion processing */
810 if ( disable_notification )
811 entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
812 if ( aiocbp != USER_ADDR_NULL ) {
813 AIO_UNLOCK;
814 return( result );
815 }
816 }
817 }
818
819 /*
820 * if we didn't find any matches on the todo or active queues then look for a
821 * match on our queue of async IO requests that have completed and if found
822 * return AIO_ALLDONE result.
823 */
824 if ( result == -1 ) {
825 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
826 if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
827 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
828 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
829 result = AIO_ALLDONE;
830
831 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE,
832 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
833
834 if ( aiocbp != USER_ADDR_NULL ) {
835 AIO_UNLOCK;
836 return( result );
837 }
838 }
839 }
840 }
841 AIO_UNLOCK;
842
843 return( result );
844
845 } /* do_aio_cancel */
846
847
848 /*
849 * aio_suspend - suspend the calling thread until at least one of the async
850 * IO operations referenced by uap->aiocblist has completed, until a signal
851 * interrupts the function, or uap->timeoutp time interval (optional) has
852 * passed.
853 * Returns 0 if one or more async IOs have completed else -1 and errno is
854 * set appropriately - EAGAIN if timeout elapses or EINTR if an interrupt
855 * woke us up.
856 */
857
858 int
859 aio_suspend( struct proc *p, struct aio_suspend_args *uap, int *retval )
860 {
861 int error;
862 int i, count;
863 uint64_t abstime;
864 struct user_timespec ts;
865 aio_workq_entry *entryp;
866 user_addr_t *aiocbpp;
867
868 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START,
869 (int)p, uap->nent, 0, 0, 0 );
870
871 *retval = -1;
872 abstime = 0;
873 aiocbpp = NULL;
874
875 /* quick check to see if there are any async IO requests queued up */
876 AIO_LOCK;
877 count = aio_get_all_queues_count( );
878 AIO_UNLOCK;
879 if ( count < 1 ) {
880 error = EINVAL;
881 goto ExitThisRoutine;
882 }
883
884 if ( uap->nent < 1 || uap->nent > aio_max_requests_per_process ) {
885 error = EINVAL;
886 goto ExitThisRoutine;
887 }
888
889 if ( uap->timeoutp != USER_ADDR_NULL ) {
890 if ( proc_is64bit(p) ) {
891 error = copyin( uap->timeoutp, &ts, sizeof(ts) );
892 }
893 else {
894 struct timespec temp;
895 error = copyin( uap->timeoutp, &temp, sizeof(temp) );
896 if ( error == 0 ) {
897 ts.tv_sec = temp.tv_sec;
898 ts.tv_nsec = temp.tv_nsec;
899 }
900 }
901 if ( error != 0 ) {
902 error = EAGAIN;
903 goto ExitThisRoutine;
904 }
905
906 if ( ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000 ) {
907 error = EINVAL;
908 goto ExitThisRoutine;
909 }
910
911 nanoseconds_to_absolutetime( (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
912 &abstime );
913 clock_absolutetime_interval_to_deadline( abstime, &abstime );
914 }
915
916 /* we reserve enough space for largest possible pointer size */
917 MALLOC( aiocbpp, user_addr_t *, (uap->nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK );
918 if ( aiocbpp == NULL ) {
919 error = EAGAIN;
920 goto ExitThisRoutine;
921 }
922
923 /* copyin our aiocb pointers from list */
924 error = copyin( uap->aiocblist, aiocbpp,
925 proc_is64bit(p) ? (uap->nent * sizeof(user_addr_t))
926 : (uap->nent * sizeof(uintptr_t)) );
927 if ( error != 0 ) {
928 error = EAGAIN;
929 goto ExitThisRoutine;
930 }
931
932 /* we depend on a list of user_addr_t's so we need to munge and expand */
933 /* when these pointers came from a 32-bit process */
934 if ( !proc_is64bit(p) && sizeof(uintptr_t) < sizeof(user_addr_t) ) {
935 /* position to the last entry and work back from there */
936 uintptr_t *my_ptrp = ((uintptr_t *)aiocbpp) + (uap->nent - 1);
937 user_addr_t *my_addrp = aiocbpp + (uap->nent - 1);
938 for (i = 0; i < uap->nent; i++, my_ptrp--, my_addrp--) {
939 *my_addrp = (user_addr_t) (*my_ptrp);
940 }
941 }
942
943 /* check list of aio requests to see if any have completed */
944 AIO_LOCK;
945 for ( i = 0; i < uap->nent; i++ ) {
946 user_addr_t aiocbp;
947
948 /* NULL elements are legal so check for 'em */
949 aiocbp = *(aiocbpp + i);
950 if ( aiocbp == USER_ADDR_NULL )
951 continue;
952
953 /* return immediately if any aio request in the list is done */
954 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
955 if ( entryp->uaiocbp == aiocbp ) {
956 *retval = 0;
957 error = 0;
958 AIO_UNLOCK;
959 goto ExitThisRoutine;
960 }
961 }
962 } /* for ( ; i < uap->nent; ) */
963
964 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE,
965 (int)p, uap->nent, 0, 0, 0 );
966
967 /*
968 * wait for an async IO to complete or a signal fires or timeout expires.
969 * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
970 * interrupts us. If an async IO completes before a signal fires or our
971 * timeout expires, we get a wakeup call from aio_work_thread().
972 */
973 assert_wait_deadline( (event_t) &p->AIO_SUSPEND_SLEEP_CHAN, THREAD_ABORTSAFE, abstime );
974 AIO_UNLOCK;
975
976 error = thread_block( THREAD_CONTINUE_NULL );
977
978 if ( error == THREAD_AWAKENED ) {
979 /* got our wakeup call from aio_work_thread() */
980 *retval = 0;
981 error = 0;
982 }
983 else if ( error == THREAD_TIMED_OUT ) {
984 /* our timeout expired */
985 error = EAGAIN;
986 }
987 else {
988 /* we were interrupted */
989 error = EINTR;
990 }
991
992 ExitThisRoutine:
993 if ( aiocbpp != NULL )
994 FREE( aiocbpp, M_TEMP );
995
996 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_END,
997 (int)p, uap->nent, error, 0, 0 );
998
999 return( error );
1000
1001 } /* aio_suspend */
1002
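/*
 * Illustrative user-land sketch (not part of this kernel file; the #if 0
 * block is never compiled). It waits for either of two queued requests
 * using the timeout semantics described above (EAGAIN on timeout, EINTR
 * when a signal wakes us). The two second timeout and helper name are
 * arbitrary values chosen for the example.
 */
#if 0
#include <aio.h>
#include <errno.h>
#include <stdio.h>
#include <time.h>

/* wait up to two seconds for one of two outstanding requests to finish */
static void
wait_for_either( struct aiocb *a, struct aiocb *b )
{
	const struct aiocb *list[ 2 ] = { a, b };
	struct timespec timeout = { 2, 0 };

	if ( aio_suspend( list, 2, &timeout ) == 0 ) {
		/* at least one request is on the done queue; find and reap it */
		if ( aio_error( a ) != EINPROGRESS )
			printf( "first request done: %zd\n", aio_return( a ) );
		else
			printf( "second request done: %zd\n", aio_return( b ) );
	}
	else if ( errno == EAGAIN )
		printf( "timeout expired\n" );
	else if ( errno == EINTR )
		printf( "interrupted by a signal\n" );
}
#endif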
1003
1004 /* aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
1005 * file descriptor (uap->aiocbp->aio_fildes) from the buffer
1006 * (uap->aiocbp->aio_buf).
1007 */
1008
1009 int
1010 aio_write( struct proc *p, struct aio_write_args *uap, int *retval )
1011 {
1012 int error;
1013
1014 *retval = 0;
1015
1016 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_START,
1017 (int)p, (int)uap->aiocbp, 0, 0, 0 );
1018
1019 error = aio_queue_async_request( p, uap->aiocbp, AIO_WRITE );
1020 if ( error != 0 )
1021 *retval = -1;
1022
1023 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_END,
1024 (int)p, (int)uap->aiocbp, error, 0, 0 );
1025
1026 return( error );
1027
1028 } /* aio_write */
1029
1030
1031 /*
1032 * lio_listio - initiate a list of IO requests. We process the list of aiocbs
1033 * either synchronously (mode == LIO_WAIT) or asynchronously (mode == LIO_NOWAIT).
1034 * The caller gets error and return status for each aiocb in the list via aio_error
1035 * and aio_return. We must keep completed requests until released by the
1036 * aio_return call.
1037 */
1038
1039 int
1040 lio_listio( struct proc *p, struct lio_listio_args *uap, int *retval )
1041 {
1042 int i;
1043 int call_result;
1044 int result;
1045 long group_tag;
1046 aio_workq_entry * *entryp_listp;
1047 user_addr_t *aiocbpp;
1048
1049 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START,
1050 (int)p, uap->nent, uap->mode, 0, 0 );
1051
1052 entryp_listp = NULL;
1053 aiocbpp = NULL;
1054 call_result = -1;
1055 *retval = -1;
1056 if ( !(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT) ) {
1057 call_result = EINVAL;
1058 goto ExitRoutine;
1059 }
1060
1061 if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
1062 call_result = EINVAL;
1063 goto ExitRoutine;
1064 }
1065
1066 /*
1067 * we use group_tag to mark IO requests for delayed completion processing
1068 * which means we wait until all IO requests in the group have completed
1069 * before we either return to the caller when mode is LIO_WAIT or signal
1070 * user when mode is LIO_NOWAIT.
1071 */
1072 group_tag = random();
1073
1074 /*
1075 * allocate a list of aio_workq_entry pointers that we will use to queue
1076 * up all our requests at once while holding our lock.
1077 */
1078 MALLOC( entryp_listp, void *, (uap->nent * sizeof(aio_workq_entry *)), M_TEMP, M_WAITOK );
1079 if ( entryp_listp == NULL ) {
1080 call_result = EAGAIN;
1081 goto ExitRoutine;
1082 }
1083
1084 /* we reserve enough space for largest possible pointer size */
1085 MALLOC( aiocbpp, user_addr_t *, (uap->nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK );
1086 if ( aiocbpp == NULL ) {
1087 call_result = EAGAIN;
1088 goto ExitRoutine;
1089 }
1090
1091 /* copyin our aiocb pointers from list */
1092 result = copyin( uap->aiocblist, aiocbpp,
1093 IS_64BIT_PROCESS(p) ? (uap->nent * sizeof(user_addr_t))
1094 : (uap->nent * sizeof(uintptr_t)) );
1095 if ( result != 0 ) {
1096 call_result = EAGAIN;
1097 goto ExitRoutine;
1098 }
1099
1100 /* we depend on a list of user_addr_t's so we need to munge and expand */
1101 /* when these pointers came from a 32-bit process */
1102 if ( !IS_64BIT_PROCESS(p) && sizeof(uintptr_t) < sizeof(user_addr_t) ) {
1103 /* position to the last entry and work back from there */
1104 uintptr_t *my_ptrp = ((uintptr_t *)aiocbpp) + (uap->nent - 1);
1105 user_addr_t *my_addrp = aiocbpp + (uap->nent - 1);
1106 for (i = 0; i < uap->nent; i++, my_ptrp--, my_addrp--) {
1107 *my_addrp = (user_addr_t) (*my_ptrp);
1108 }
1109 }
1110
1111 /* process list of aio requests */
1112 for ( i = 0; i < uap->nent; i++ ) {
1113 user_addr_t my_aiocbp;
1114
1115 *(entryp_listp + i) = NULL;
1116 my_aiocbp = *(aiocbpp + i);
1117
1118 /* NULL elements are legal so check for 'em */
1119 if ( my_aiocbp == USER_ADDR_NULL )
1120 continue;
1121
1122 if ( uap->mode == LIO_NOWAIT )
1123 result = lio_create_async_entry( p, my_aiocbp, uap->sigp,
1124 group_tag, (entryp_listp + i) );
1125 else
1126 result = lio_create_sync_entry( p, my_aiocbp, group_tag,
1127 (entryp_listp + i) );
1128
1129 if ( result != 0 && call_result == -1 )
1130 call_result = result;
1131 }
1132
1133 /*
1134 * we need to protect this section since we do not want any of these grouped
1135 * IO requests to begin until we have them all on the queue.
1136 */
1137 AIO_LOCK;
1138 for ( i = 0; i < uap->nent; i++ ) {
1139 aio_workq_entry *entryp;
1140
1141 /* NULL elements are legal so check for 'em */
1142 entryp = *(entryp_listp + i);
1143 if ( entryp == NULL )
1144 continue;
1145
1146 /* check our aio limits to throttle bad or rude user land behavior */
1147 if ( aio_get_all_queues_count( ) >= aio_max_requests ||
1148 aio_get_process_count( entryp->procp ) >= aio_max_requests_per_process ||
1149 is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
1150 vm_map_t my_map;
1151
1152 my_map = entryp->aio_map;
1153 entryp->aio_map = VM_MAP_NULL;
1154 if ( call_result == -1 )
1155 call_result = EAGAIN;
1156 AIO_UNLOCK;
1157 aio_free_request( entryp, my_map );
1158 AIO_LOCK;
1159 continue;
1160 }
1161
1162 /* place the request on the appropriate queue */
1163 if ( uap->mode == LIO_NOWAIT ) {
1164 TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1165 aio_anchor.aio_async_workq_count++;
1166
1167 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
1168 (int)p, (int)entryp->uaiocbp, 0, 0, 0 );
1169 }
1170 else {
1171 TAILQ_INSERT_TAIL( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
1172 aio_anchor.lio_sync_workq_count++;
1173 }
1174 }
1175
1176 if ( uap->mode == LIO_NOWAIT ) {
1177 /* caller does not want to wait so we'll fire off a worker thread and return */
1178 wakeup_one( (caddr_t) &aio_anchor.aio_async_workq );
1179 }
1180 else {
1181 aio_workq_entry *entryp;
1182 int error;
1183
1184 /*
1185 * mode is LIO_WAIT - handle the IO requests now.
1186 */
1187 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
1188 while ( entryp != NULL ) {
1189 if ( p == entryp->procp && group_tag == entryp->group_tag ) {
1190
1191 TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
1192 aio_anchor.lio_sync_workq_count--;
1193 AIO_UNLOCK;
1194
1195 if ( (entryp->flags & AIO_READ) != 0 ) {
1196 error = do_aio_read( entryp );
1197 }
1198 else if ( (entryp->flags & AIO_WRITE) != 0 ) {
1199 error = do_aio_write( entryp );
1200 }
1201 else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1202 error = do_aio_fsync( entryp );
1203 }
1204 else {
1205 printf( "%s - unknown aio request - flags 0x%02X \n",
1206 __FUNCTION__, entryp->flags );
1207 error = EINVAL;
1208 }
1209 entryp->errorval = error;
1210 if ( error != 0 && call_result == -1 )
1211 call_result = EIO;
1212
1213 AIO_LOCK;
1214 /* we're done with the IO request so move it on the done queue */
1215 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
1216 aio_anchor.aio_done_count++;
1217 p->aio_done_count++;
1218
1219 /* need to start over since lio_sync_workq may have been changed while we */
1220 /* were away doing the IO. */
1221 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
1222 continue;
1223 } /* p == entryp->procp */
1224
1225 entryp = TAILQ_NEXT( entryp, aio_workq_link );
1226 } /* while ( entryp != NULL ) */
1227 } /* uap->mode == LIO_WAIT */
1228 AIO_UNLOCK;
1229
1230 /* call_result == -1 means we had no trouble queueing up requests */
1231 if ( call_result == -1 ) {
1232 call_result = 0;
1233 *retval = 0;
1234 }
1235
1236 ExitRoutine:
1237 if ( entryp_listp != NULL )
1238 FREE( entryp_listp, M_TEMP );
1239 if ( aiocbpp != NULL )
1240 FREE( aiocbpp, M_TEMP );
1241
1242 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END,
1243 (int)p, call_result, 0, 0, 0 );
1244
1245 return( call_result );
1246
1247 } /* lio_listio */
1248
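/*
 * Illustrative user-land sketch (not part of this kernel file; the #if 0
 * block is never compiled). It issues one read and one write as a single
 * LIO_WAIT batch as described above; per-request status must still be
 * collected through aio_error / aio_return. The descriptors, buffers and
 * helper name are assumptions made only for the example.
 */
#if 0
#include <aio.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>

/* issue one read and one write as a batch and wait for both to finish */
static int
batch_read_write( int read_fd, int write_fd, char *inbuf, char *outbuf, size_t len )
{
	struct aiocb rd, wr;
	struct aiocb *list[ 2 ] = { &rd, &wr };
	int i;

	memset( &rd, 0, sizeof(rd) );
	rd.aio_fildes = read_fd;
	rd.aio_buf = inbuf;
	rd.aio_nbytes = len;
	rd.aio_lio_opcode = LIO_READ;

	memset( &wr, 0, sizeof(wr) );
	wr.aio_fildes = write_fd;
	wr.aio_buf = outbuf;
	wr.aio_nbytes = len;
	wr.aio_lio_opcode = LIO_WRITE;

	/* LIO_WAIT: lio_listio does not return until both requests are done */
	if ( lio_listio( LIO_WAIT, list, 2, NULL ) != 0 )
		fprintf( stderr, "lio_listio: %s\n", strerror( errno ) );

	/* each request still reports its own status */
	for ( i = 0; i < 2; i++ ) {
		if ( aio_error( list[ i ] ) == 0 )
			printf( "request %d transferred %zd bytes\n", i, aio_return( list[ i ] ) );
	}
	return( 0 );
}
#endif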
1249
1250 /*
1251 * aio worker thread. This is where all the real work gets done.
1252 * We get a wake up call on sleep channel &aio_anchor.aio_async_workq
1253 * after new work is queued up.
1254 */
1255
1256 static void
1257 aio_work_thread( void )
1258 {
1259 aio_workq_entry *entryp;
1260
1261 for( ;; ) {
1262 AIO_LOCK;
1263 entryp = aio_get_some_work();
1264 if ( entryp == NULL ) {
1265 /*
1266 * aio worker threads wait for some work to get queued up
1267 * by aio_queue_async_request. Once some work gets queued
1268 * it will wake up one of these worker threads just before
1269 * returning to our caller in user land.
1270 */
1271 assert_wait( (event_t) &aio_anchor.aio_async_workq, THREAD_UNINT );
1272 AIO_UNLOCK;
1273
1274 thread_block( (thread_continue_t)aio_work_thread );
1275 /* NOT REACHED */
1276 }
1277 else {
1278 int error;
1279 vm_map_t currentmap;
1280 vm_map_t oldmap = VM_MAP_NULL;
1281 task_t oldaiotask = TASK_NULL;
1282 struct uthread *uthreadp = NULL;
1283
1284 AIO_UNLOCK;
1285
1286 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START,
1287 (int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0 );
1288
1289 /*
1290 * Assume the target's address space identity for the duration
1291 * of the IO.
1292 */
1293 currentmap = get_task_map( (current_proc())->task );
1294 if ( currentmap != entryp->aio_map ) {
1295 uthreadp = (struct uthread *) get_bsdthread_info(current_thread());
1296 oldaiotask = uthreadp->uu_aio_task;
1297 uthreadp->uu_aio_task = entryp->procp->task;
1298 oldmap = vm_map_switch( entryp->aio_map );
1299 }
1300
1301 if ( (entryp->flags & AIO_READ) != 0 ) {
1302 error = do_aio_read( entryp );
1303 }
1304 else if ( (entryp->flags & AIO_WRITE) != 0 ) {
1305 error = do_aio_write( entryp );
1306 }
1307 else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1308 error = do_aio_fsync( entryp );
1309 }
1310 else {
1311 printf( "%s - unknown aio request - flags 0x%02X \n",
1312 __FUNCTION__, entryp->flags );
1313 error = EINVAL;
1314 }
1315 entryp->errorval = error;
1316 if ( currentmap != entryp->aio_map ) {
1317 (void) vm_map_switch( oldmap );
1318 uthreadp->uu_aio_task = oldaiotask;
1319 }
1320
1321 /* we're done with the IO request so pop it off the active queue and */
1322 /* push it on the done queue */
1323 AIO_LOCK;
1324 TAILQ_REMOVE( &entryp->procp->aio_activeq, entryp, aio_workq_link );
1325 aio_anchor.aio_active_count--;
1326 entryp->procp->aio_active_count--;
1327 TAILQ_INSERT_TAIL( &entryp->procp->aio_doneq, entryp, aio_workq_link );
1328 aio_anchor.aio_done_count++;
1329 entryp->procp->aio_done_count++;
1330 entryp->flags |= AIO_COMPLETION;
1331
1332 /* remove our reference to the user land map. */
1333 if ( VM_MAP_NULL != entryp->aio_map ) {
1334 vm_map_t my_map;
1335
1336 my_map = entryp->aio_map;
1337 entryp->aio_map = VM_MAP_NULL;
1338 AIO_UNLOCK; /* must unlock before calling vm_map_deallocate() */
1339 vm_map_deallocate( my_map );
1340 }
1341 else {
1342 AIO_UNLOCK;
1343 }
1344
1345 do_aio_completion( entryp );
1346
1347 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END,
1348 (int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval,
1349 entryp->returnval, 0 );
1350
1351 AIO_LOCK;
1352 entryp->flags &= ~AIO_COMPLETION;
1353 if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
1354 vm_map_t my_map;
1355
1356 my_map = entryp->aio_map;
1357 entryp->aio_map = VM_MAP_NULL;
1358 AIO_UNLOCK;
1359 aio_free_request( entryp, my_map );
1360 }
1361 else
1362 AIO_UNLOCK;
1363 }
1364 } /* for ( ;; ) */
1365
1366 /* NOT REACHED */
1367
1368 } /* aio_work_thread */
1369
1370
1371 /*
1372 * aio_get_some_work - get the next async IO request that is ready to be executed.
1373 * aio_fsync complicates matters a bit since we cannot do the fsync until all async
1374 * IO requests at the time the aio_fsync call came in have completed.
1375 * NOTE - AIO_LOCK must be held by caller
1376 */
1377
1378 static aio_workq_entry *
1379 aio_get_some_work( void )
1380 {
1381 aio_workq_entry *entryp;
1382
1383 /* pop some work off the work queue and add to our active queue */
1384 for ( entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
1385 entryp != NULL;
1386 entryp = TAILQ_NEXT( entryp, aio_workq_link ) ) {
1387
1388 if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1389 /* leave aio_fsync calls on the work queue if there are IO */
1390 /* requests on the active queue for the same file descriptor. */
1391 if ( aio_delay_fsync_request( entryp ) ) {
1392
1393 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) | DBG_FUNC_NONE,
1394 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1395 continue;
1396 }
1397 }
1398 break;
1399 }
1400
1401 if ( entryp != NULL ) {
1402 TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1403 aio_anchor.aio_async_workq_count--;
1404 TAILQ_INSERT_TAIL( &entryp->procp->aio_activeq, entryp, aio_workq_link );
1405 aio_anchor.aio_active_count++;
1406 entryp->procp->aio_active_count++;
1407 }
1408
1409 return( entryp );
1410
1411 } /* aio_get_some_work */
1412
1413
1414 /*
1415 * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed at
1416 * this time. Delay will happen when there are any active IOs for the same file
1417 * descriptor that were queued at the time the aio_fsync call was queued.
1418 * NOTE - AIO_LOCK must be held by caller
1419 */
1420 static boolean_t
1421 aio_delay_fsync_request( aio_workq_entry *entryp )
1422 {
1423 aio_workq_entry *my_entryp;
1424
1425 TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1426 if ( my_entryp->fsyncp != USER_ADDR_NULL &&
1427 entryp->uaiocbp == my_entryp->fsyncp &&
1428 entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1429 return( TRUE );
1430 }
1431 }
1432
1433 return( FALSE );
1434
1435 } /* aio_delay_fsync_request */
1436
1437
1438 /*
1439 * aio_queue_async_request - queue up an async IO request on our work queue then
1440 * wake up one of our worker threads to do the actual work. We get a reference
1441 * to our caller's user land map in order to keep it around while we are
1442 * processing the request.
1443 */
1444
1445 static int
1446 aio_queue_async_request( struct proc *procp, user_addr_t aiocbp, int kindOfIO )
1447 {
1448 aio_workq_entry *entryp;
1449 int result;
1450
1451 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1452 if ( entryp == NULL ) {
1453 result = EAGAIN;
1454 goto error_exit;
1455 }
1456 bzero( entryp, sizeof(*entryp) );
1457
1458 /* fill in the rest of the aio_workq_entry */
1459 entryp->procp = procp;
1460 entryp->uaiocbp = aiocbp;
1461 entryp->flags |= kindOfIO;
1462 entryp->aio_map = VM_MAP_NULL;
1463
1464 if ( !IS_64BIT_PROCESS(procp) ) {
1465 struct aiocb aiocb32;
1466
1467 result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
1468 if ( result == 0 )
1469 do_munge_aiocb( &aiocb32, &entryp->aiocb );
1470 } else
1471 result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1472
1473 if ( result != 0 ) {
1474 result = EAGAIN;
1475 goto error_exit;
1476 }
1477
1478 /* do some more validation on the aiocb and embedded file descriptor */
1479 result = aio_validate( entryp );
1480 if ( result != 0 )
1481 goto error_exit;
1482
1483 /* get a reference to the user land map in order to keep it around */
1484 entryp->aio_map = get_task_map( procp->task );
1485 vm_map_reference( entryp->aio_map );
1486
1487 AIO_LOCK;
1488
1489 if ( is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
1490 AIO_UNLOCK;
1491 result = EAGAIN;
1492 goto error_exit;
1493 }
1494
1495 /* check our aio limits to throttle bad or rude user land behavior */
1496 if ( aio_get_all_queues_count( ) >= aio_max_requests ||
1497 aio_get_process_count( procp ) >= aio_max_requests_per_process ) {
1498 AIO_UNLOCK;
1499 result = EAGAIN;
1500 goto error_exit;
1501 }
1502
1503 /*
1504 * aio_fsync calls sync up all async IO requests queued at the time
1505 * the aio_fsync call was made. So we mark each currently queued async
1506 * IO with a matching file descriptor as must complete before we do the
1507 * fsync. We set the fsyncp field of each matching async IO
1508 * request with the aiocb pointer passed in on the aio_fsync call to
1509 * know which IOs must complete before we process the aio_fsync call.
1510 */
1511 if ( (kindOfIO & AIO_FSYNC) != 0 )
1512 aio_mark_requests( entryp );
1513
1514 /* queue up on our aio asynchronous work queue */
1515 TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1516 aio_anchor.aio_async_workq_count++;
1517
1518 wakeup_one( (caddr_t) &aio_anchor.aio_async_workq );
1519 AIO_UNLOCK;
1520
1521 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
1522 (int)procp, (int)aiocbp, 0, 0, 0 );
1523
1524 return( 0 );
1525
1526 error_exit:
1527 if ( entryp != NULL ) {
1528 /* this entry has not been queued up so no worries about unlocked */
1529 /* state and aio_map */
1530 aio_free_request( entryp, entryp->aio_map );
1531 }
1532
1533 return( result );
1534
1535 } /* aio_queue_async_request */
1536
1537
1538 /*
1539 * lio_create_async_entry - allocate an aio_workq_entry and fill it in.
1540 * If all goes well return 0 and pass the aio_workq_entry pointer back to
1541 * our caller. We get a reference to our caller's user land map in order to keep
1542 * it around while we are processing the request.
1543 * lio_listio calls behave differently at completion: they do completion notification
1544 * only when all async IO requests have completed. We use group_tag to tag IO requests
1545 * that behave in this delayed notification manner.
1546 */
1547
1548 static int
1549 lio_create_async_entry( struct proc *procp, user_addr_t aiocbp,
1550 user_addr_t sigp, long group_tag,
1551 aio_workq_entry **entrypp )
1552 {
1553 aio_workq_entry *entryp;
1554 int result;
1555
1556 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1557 if ( entryp == NULL ) {
1558 result = EAGAIN;
1559 goto error_exit;
1560 }
1561 bzero( entryp, sizeof(*entryp) );
1562
1563 /* fill in the rest of the aio_workq_entry */
1564 entryp->procp = procp;
1565 entryp->uaiocbp = aiocbp;
1566 entryp->flags |= AIO_LIO;
1567 entryp->group_tag = group_tag;
1568 entryp->aio_map = VM_MAP_NULL;
1569
1570 if ( !IS_64BIT_PROCESS(procp) ) {
1571 struct aiocb aiocb32;
1572
1573 result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
1574 if ( result == 0 )
1575 do_munge_aiocb( &aiocb32, &entryp->aiocb );
1576 } else
1577 result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1578
1579 if ( result != 0 ) {
1580 result = EAGAIN;
1581 goto error_exit;
1582 }
1583
1584 /* look for lio_listio LIO_NOP requests and ignore them. */
1585 /* Not really an error, but we need to free our aio_workq_entry. */
1586 if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
1587 result = 0;
1588 goto error_exit;
1589 }
1590
1591 /* use sigevent passed in to lio_listio for each of our calls, but only */
1592 /* do completion notification after the last request completes. */
1593 if ( sigp != USER_ADDR_NULL ) {
1594 if ( !IS_64BIT_PROCESS(procp) ) {
1595 struct sigevent sigevent32;
1596
1597 result = copyin( sigp, &sigevent32, sizeof(sigevent32) );
1598 if ( result == 0 ) {
1599 /* also need to munge aio_sigevent since it contains pointers */
1600 /* special case here. since we do not know if sigev_value is an */
1601 /* int or a ptr we do NOT cast the ptr to a user_addr_t. This */
1602 /* means if we send this info back to user space we need to remember */
1603 /* sigev_value was not expanded for the 32-bit case. */
1604 /* NOTE - this does NOT affect us since we don't support sigev_value */
1605 /* yet in the aio context. */
1606 //LP64
1607 entryp->aiocb.aio_sigevent.sigev_notify = sigevent32.sigev_notify;
1608 entryp->aiocb.aio_sigevent.sigev_signo = sigevent32.sigev_signo;
1609 entryp->aiocb.aio_sigevent.sigev_value.size_equivalent.sival_int =
1610 sigevent32.sigev_value.sival_int;
1611 entryp->aiocb.aio_sigevent.sigev_notify_function =
1612 CAST_USER_ADDR_T(sigevent32.sigev_notify_function);
1613 entryp->aiocb.aio_sigevent.sigev_notify_attributes =
1614 CAST_USER_ADDR_T(sigevent32.sigev_notify_attributes);
1615 }
1616 } else
1617 result = copyin( sigp, &entryp->aiocb.aio_sigevent, sizeof(entryp->aiocb.aio_sigevent) );
1618
1619 if ( result != 0 ) {
1620 result = EAGAIN;
1621 goto error_exit;
1622 }
1623 }
1624
1625 /* do some more validation on the aiocb and embedded file descriptor */
1626 result = aio_validate( entryp );
1627 if ( result != 0 )
1628 goto error_exit;
1629
1630 /* get a reference to the user land map in order to keep it around */
1631 entryp->aio_map = get_task_map( procp->task );
1632 vm_map_reference( entryp->aio_map );
1633
1634 *entrypp = entryp;
1635 return( 0 );
1636
1637 error_exit:
1638 if ( entryp != NULL )
1639 zfree( aio_workq_zonep, entryp );
1640
1641 return( result );
1642
1643 } /* lio_create_async_entry */
1644
1645
1646 /*
1647 * aio_mark_requests - aio_fsync calls synchronize file data for all queued async IO
1648 * requests at the moment the aio_fsync call is queued. We use aio_workq_entry.fsyncp
1649 * to mark each async IO that must complete before the fsync is done. We use the uaiocbp
1650 * field from the aio_fsync call as the aio_workq_entry.fsyncp in marked requests.
1651 * NOTE - AIO_LOCK must be held by caller
1652 */
1653
1654 static void
1655 aio_mark_requests( aio_workq_entry *entryp )
1656 {
1657 aio_workq_entry *my_entryp;
1658
1659 TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1660 if ( entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1661 my_entryp->fsyncp = entryp->uaiocbp;
1662 }
1663 }
1664
1665 TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
1666 if ( entryp->procp == my_entryp->procp &&
1667 entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1668 my_entryp->fsyncp = entryp->uaiocbp;
1669 }
1670 }
1671
1672 } /* aio_mark_requests */
1673
1674
1675 /*
1676 * lio_create_sync_entry - allocate an aio_workq_entry and fill it in.
1677 * If all goes well return 0 and pass the aio_workq_entry pointer back to
1678 * our caller.
1679 * lio_listio calls behave differently at completion: they do completion notification
1680 * only when all async IO requests have completed. We use group_tag to tag IO requests
1681 * that behave in this delayed notification manner.
1682 */
1683
1684 static int
1685 lio_create_sync_entry( struct proc *procp, user_addr_t aiocbp,
1686 long group_tag, aio_workq_entry **entrypp )
1687 {
1688 aio_workq_entry *entryp;
1689 int result;
1690
1691 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1692 if ( entryp == NULL ) {
1693 result = EAGAIN;
1694 goto error_exit;
1695 }
1696 bzero( entryp, sizeof(*entryp) );
1697
1698 /* fill in the rest of the aio_workq_entry */
1699 entryp->procp = procp;
1700 entryp->uaiocbp = aiocbp;
1701 entryp->flags |= AIO_LIO;
1702 entryp->group_tag = group_tag;
1703 entryp->aio_map = VM_MAP_NULL;
1704
1705 if ( !IS_64BIT_PROCESS(procp) ) {
1706 struct aiocb aiocb32;
1707
1708 result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
1709 if ( result == 0 )
1710 do_munge_aiocb( &aiocb32, &entryp->aiocb );
1711 } else
1712 result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1713
1714 if ( result != 0 ) {
1715 result = EAGAIN;
1716 goto error_exit;
1717 }
1718
1719 /* look for lio_listio LIO_NOP requests and ignore them. */
1720 /* Not really an error, but we need to free our aio_workq_entry. */
1721 if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
1722 result = 0;
1723 goto error_exit;
1724 }
1725
1726 result = aio_validate( entryp );
1727 if ( result != 0 ) {
1728 goto error_exit;
1729 }
1730
1731 *entrypp = entryp;
1732 return( 0 );
1733
1734 error_exit:
1735 if ( entryp != NULL )
1736 zfree( aio_workq_zonep, entryp );
1737
1738 return( result );
1739
1740 } /* lio_create_sync_entry */
1741
1742
1743 /*
1744 * aio_free_request - remove our reference on the user land map and
1745 * free the work queue entry resources.
1746 * We are not holding the lock here; thus aio_map is passed in, having been
1747 * zeroed while we did hold the lock.
1748 */
1749
1750 static int
1751 aio_free_request( aio_workq_entry *entryp, vm_map_t the_map )
1752 {
1753 /* remove our reference to the user land map. */
1754 if ( VM_MAP_NULL != the_map ) {
1755 vm_map_deallocate( the_map );
1756 }
1757
1758 zfree( aio_workq_zonep, entryp );
1759
1760 return( 0 );
1761
1762 } /* aio_free_request */
1763
1764
1765 /* aio_validate - validate the aiocb passed in by one of the aio syscalls.
1766 */
1767
1768 static int
1769 aio_validate( aio_workq_entry *entryp )
1770 {
1771 struct fileproc *fp;
1772 int flag;
1773 int result;
1774
1775 result = 0;
1776
1777 if ( (entryp->flags & AIO_LIO) != 0 ) {
1778 if ( entryp->aiocb.aio_lio_opcode == LIO_READ )
1779 entryp->flags |= AIO_READ;
1780 else if ( entryp->aiocb.aio_lio_opcode == LIO_WRITE )
1781 entryp->flags |= AIO_WRITE;
1782 else if ( entryp->aiocb.aio_lio_opcode == LIO_NOP )
1783 return( 0 );
1784 else
1785 return( EINVAL );
1786 }
1787
1788 flag = FREAD;
1789 if ( (entryp->flags & (AIO_WRITE | AIO_FSYNC)) != 0 ) {
1790 flag = FWRITE;
1791 }
1792
1793 if ( (entryp->flags & (AIO_READ | AIO_WRITE)) != 0 ) {
1794 // LP64todo - does max value for aio_nbytes need to grow?
1795 if ( entryp->aiocb.aio_nbytes > INT_MAX ||
1796 entryp->aiocb.aio_buf == USER_ADDR_NULL ||
1797 entryp->aiocb.aio_offset < 0 )
1798 return( EINVAL );
1799 }
1800
1801 /* validate aiocb.aio_sigevent. at this point we only support sigev_notify
1802 * equal to SIGEV_SIGNAL or SIGEV_NONE. this means sigev_value,
1803 * sigev_notify_function, and sigev_notify_attributes are ignored.
1804 */
1805 if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ) {
1806 int signum;
1807 /* make sure we have a valid signal number */
1808 signum = entryp->aiocb.aio_sigevent.sigev_signo;
1809 if ( signum <= 0 || signum >= NSIG ||
1810 signum == SIGKILL || signum == SIGSTOP )
1811 return (EINVAL);
1812 }
1813 else if ( entryp->aiocb.aio_sigevent.sigev_notify != SIGEV_NONE )
1814 return (EINVAL);
1815
1816 /* validate the file descriptor and that the file was opened
1817 * for the appropriate read / write access.
1818 */
1819 proc_fdlock(entryp->procp);
1820
1821 result = fp_lookup( entryp->procp, entryp->aiocb.aio_fildes, &fp , 1);
1822 if ( result == 0 ) {
1823 if ( (fp->f_fglob->fg_flag & flag) == 0 ) {
1824 /* we don't have read or write access */
1825 result = EBADF;
1826 }
1827 else if ( fp->f_fglob->fg_type != DTYPE_VNODE ) {
1828 /* this is not a file */
1829 result = ESPIPE;
1830 } else
1831 fp->f_flags |= FP_AIOISSUED;
1832
1833 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp , 1);
1834 }
1835 else {
1836 result = EBADF;
1837 }
1838
1839 proc_fdunlock(entryp->procp);
1840
1841 return( result );
1842
1843 } /* aio_validate */
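
/*
 * Illustrative sketch only (user space): an aiocb that satisfies the checks
 * performed by aio_validate() above -- a vnode-backed descriptor opened with
 * the matching read/write access, aio_nbytes no larger than INT_MAX, a
 * non-NULL buffer, a non-negative offset, and sigev_notify of SIGEV_NONE or
 * SIGEV_SIGNAL with a catchable signal.  "fd" and "buf" are hypothetical.
 *
 *	struct aiocb cb;
 *	memset( &cb, 0, sizeof(cb) );
 *	cb.aio_fildes = fd;				// regular file, opened for reading
 *	cb.aio_buf = buf;				// must not be NULL
 *	cb.aio_nbytes = sizeof(buf);			// must not exceed INT_MAX
 *	cb.aio_offset = 0;				// must not be negative
 *	cb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;	// or SIGEV_NONE
 *	cb.aio_sigevent.sigev_signo = SIGUSR1;		// not SIGKILL or SIGSTOP
 */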
1844
1845
1846 /*
1847 * aio_get_process_count - runs through our queues that hold outstanding
1848 * async IO requests and totals up the number of requests for the given
1849 * process.
1850 * NOTE - caller must hold aio lock!
1851 */
1852
1853 static int
1854 aio_get_process_count( struct proc *procp )
1855 {
1856 aio_workq_entry *entryp;
1857 int count;
1858
1859 /* begin with count of completed async IO requests for this process */
1860 count = procp->aio_done_count;
1861
1862 /* add in count of active async IO requests for this process */
1863 count += procp->aio_active_count;
1864
1865 /* look for matches on our queue of asynchronous todo work */
1866 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
1867 if ( procp == entryp->procp ) {
1868 count++;
1869 }
1870 }
1871
1872 /* look for matches on our queue of synchronous todo work */
1873 TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
1874 if ( procp == entryp->procp ) {
1875 count++;
1876 }
1877 }
1878
1879 return( count );
1880
1881 } /* aio_get_process_count */
1882
1883
1884 /*
1885 * aio_get_all_queues_count - get total number of entries on all aio work queues.
1886 * NOTE - caller must hold aio lock!
1887 */
1888
1889 static int
1890 aio_get_all_queues_count( void )
1891 {
1892 int count;
1893
1894 count = aio_anchor.aio_async_workq_count;
1895 count += aio_anchor.lio_sync_workq_count;
1896 count += aio_anchor.aio_active_count;
1897 count += aio_anchor.aio_done_count;
1898
1899 return( count );
1900
1901 } /* aio_get_all_queues_count */
1902
1903
1904 /*
1905 * do_aio_completion. Handle async IO completion.
1906 */
1907
1908 static void
1909 do_aio_completion( aio_workq_entry *entryp )
1910 {
1911 /* signal user land process if appropriate */
1912 if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
1913 (entryp->flags & AIO_DISABLE) == 0 ) {
1914
1915 /*
1916 * if group_tag is non zero then make sure this is the last IO request
1917 * in the group before we signal.
1918 */
1919 if ( entryp->group_tag == 0 ||
1920 (entryp->group_tag != 0 && aio_last_group_io( entryp )) ) {
1921 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) | DBG_FUNC_NONE,
1922 (int)entryp->procp, (int)entryp->uaiocbp,
1923 entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 );
1924
1925 psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );
1926 return;
1927 }
1928 }
1929
1930 /*
1931 * need to handle case where a process is trying to exit, exec, or close
1932 * and is currently waiting for active aio requests to complete. If
1933 * AIO_WAITING is set then we need to look to see if there are any
1934 * other requests in the active queue for this process. If there are
1935 * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel. If
1936 * there are some still active then do nothing - we only want to wakeup
1937 * when all active aio requests for the process are complete.
1938 */
1939 if ( (entryp->flags & AIO_WAITING) != 0 ) {
1940 int active_requests;
1941
1942 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
1943 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1944
1945 AIO_LOCK;
1946 active_requests = aio_active_requests_for_process( entryp->procp );
1947 //AIO_UNLOCK;
1948 if ( active_requests < 1 ) {
1949 /* no active aio requests for this process, continue exiting */
1950 wakeup_one( (caddr_t) &entryp->procp->AIO_CLEANUP_SLEEP_CHAN );
1951
1952 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
1953 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1954 }
1955 AIO_UNLOCK;
1956 return;
1957 }
1958
1959 /*
1960 * aio_suspend case when a signal was not requested. In that scenario we
1961 * are sleeping on the AIO_SUSPEND_SLEEP_CHAN channel.
1962 * NOTE - the assumption here is that this wakeup call is inexpensive.
1963 * we really only need to do this when an aio_suspend call is pending.
1964 * If we find the wakeup call should be avoided we could mark the
1965 * async IO requests given in the list provided by aio_suspend and only
1966 * call wakeup for them. If we do mark them we should unmark them after
1967 * the aio_suspend wakes up.
1968 */
1969 AIO_LOCK;
1970 wakeup_one( (caddr_t) &entryp->procp->AIO_SUSPEND_SLEEP_CHAN );
1971 AIO_UNLOCK;
1972
1973 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE,
1974 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1975
1976 return;
1977
1978 } /* do_aio_completion */
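
/*
 * Illustrative sketch only (user space): one way a caller might arrange to
 * receive the SIGEV_SIGNAL completion that psignal() delivers above.  The
 * handler name and the choice of SIGUSR1 are hypothetical.
 *
 *	static void aio_done_handler( int signo )
 *	{
 *		// completion noted; results are collected later with
 *		// aio_error() and aio_return()
 *	}
 *	...
 *	struct sigaction sa;
 *	memset( &sa, 0, sizeof(sa) );
 *	sa.sa_handler = aio_done_handler;
 *	sigaction( SIGUSR1, &sa, NULL );
 *
 *	cb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
 *	cb.aio_sigevent.sigev_signo = SIGUSR1;
 *	aio_write( &cb );
 */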
1979
1980
1981 /*
1982 * aio_last_group_io - checks to see if this is the last unfinished IO request
1983 * for the given group_tag. Returns TRUE if there are no other active IO
1984 * requests for this group, or FALSE if there are other active IO requests.
1985 * NOTE - AIO_LOCK must be held by caller
1986 */
1987
1988 static boolean_t
1989 aio_last_group_io( aio_workq_entry *entryp )
1990 {
1991 aio_workq_entry *my_entryp;
1992
1993 /* look for matches on our queue of active async IO requests */
1994 TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1995 if ( my_entryp->group_tag == entryp->group_tag )
1996 return( FALSE );
1997 }
1998
1999 /* look for matches on our queue of asynchronous todo work */
2000 TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
2001 if ( my_entryp->group_tag == entryp->group_tag )
2002 return( FALSE );
2003 }
2004
2005 /* look for matches on our queue of synchronous todo work */
2006 TAILQ_FOREACH( my_entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
2007 if ( my_entryp->group_tag == entryp->group_tag )
2008 return( FALSE );
2009 }
2010
2011 return( TRUE );
2012
2013 } /* aio_last_group_io */
2014
2015
2016 /*
2017 * do_aio_read
2018 */
2019 static int
2020 do_aio_read( aio_workq_entry *entryp )
2021 {
2022 struct fileproc *fp;
2023 int error;
2024
2025 if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp , 0)) )
2026 return(error);
2027 if ( (fp->f_fglob->fg_flag & FREAD) == 0 ) {
2028 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2029 return(EBADF);
2030 }
2031 if ( fp != NULL ) {
2032 error = dofileread( entryp->procp, fp, entryp->aiocb.aio_fildes,
2033 entryp->aiocb.aio_buf,
2034 entryp->aiocb.aio_nbytes,
2035 entryp->aiocb.aio_offset, FOF_OFFSET,
2036 &entryp->returnval );
2037 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2038 }
2039 else {
2040 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2041 error = EBADF;
2042 }
2043
2044 return( error );
2045
2046 } /* do_aio_read */
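
/*
 * Illustrative sketch only (user space): the aio_read() request that ends up
 * being serviced by do_aio_read() above.  Note that the transfer always uses
 * the offset in the aiocb (FOF_OFFSET), not the descriptor's file position.
 * "fd" and "buf" are hypothetical.
 *
 *	struct aiocb cb;
 *	memset( &cb, 0, sizeof(cb) );
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = sizeof(buf);
 *	cb.aio_offset = 0;
 *
 *	if ( aio_read( &cb ) == 0 ) {
 *		const struct aiocb *waitlist[1] = { &cb };
 *		aio_suspend( waitlist, 1, NULL );	// block until completion
 *		if ( aio_error( &cb ) == 0 ) {
 *			ssize_t nread = aio_return( &cb );
 *			// ... use nread ...
 *		}
 *	}
 */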
2047
2048
2049 /*
2050 * do_aio_write
2051 */
2052 static int
2053 do_aio_write( aio_workq_entry *entryp )
2054 {
2055 struct fileproc *fp;
2056 int error;
2057
2058 if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp , 0)) )
2059 return(error);
2060 if ( (fp->f_fglob->fg_flag & FWRITE) == 0 ) {
2061 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2062 return(EBADF);
2063 }
2064 if ( fp != NULL ) {
2065 error = dofilewrite( entryp->procp, fp, entryp->aiocb.aio_fildes,
2066 entryp->aiocb.aio_buf,
2067 entryp->aiocb.aio_nbytes,
2068 entryp->aiocb.aio_offset, FOF_OFFSET,
2069 &entryp->returnval );
2070
2071 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2072 }
2073 else {
2074 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2075 error = EBADF;
2076 }
2077
2078 return( error );
2079
2080 } /* do_aio_write */
2081
2082
2083 /*
2084 * aio_active_requests_for_process - return number of active async IO
2085 * requests for the given process.
2086 * NOTE - caller must hold aio lock!
2087 */
2088
2089 static int
2090 aio_active_requests_for_process( struct proc *procp )
2091 {
2092
2093 return( procp->aio_active_count );
2094
2095 } /* aio_active_requests_for_process */
2096
2097
2098 /*
2099 * do_aio_fsync
2100 */
2101 static int
2102 do_aio_fsync( aio_workq_entry *entryp )
2103 {
2104 struct vfs_context context;
2105 struct vnode *vp;
2106 struct fileproc *fp;
2107 int error;
2108
2109 /*
2110 * NOTE - we will not support AIO_DSYNC until fdatasync() is supported.
2111 * AIO_DSYNC is caught before we queue up a request and flagged as an error.
2112 * The following was shamelessly extracted from the fsync() implementation.
2113 */
2114
2115 error = fp_getfvp( entryp->procp, entryp->aiocb.aio_fildes, &fp, &vp);
2116 if ( error == 0 ) {
2117 if ( (error = vnode_getwithref(vp)) ) {
2118 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2119 entryp->returnval = -1;
2120 return(error);
2121 }
2122 context.vc_proc = entryp->procp;
2123 context.vc_ucred = fp->f_fglob->fg_cred;
2124
2125 error = VNOP_FSYNC( vp, MNT_WAIT, &context);
2126
2127 (void)vnode_put(vp);
2128
2129 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2130 }
2131 if ( error != 0 )
2132 entryp->returnval = -1;
2133
2134 return( error );
2135
2136 } /* do_aio_fsync */
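
/*
 * Illustrative sketch only (user space): queueing the asynchronous fsync
 * serviced by do_aio_fsync() above.  As noted in the comment above, only the
 * O_SYNC style is supported; a fdatasync()-style request is flagged as an
 * error before it is ever queued.  "fd" is a hypothetical descriptor with
 * pending writes.
 *
 *	struct aiocb cb;
 *	memset( &cb, 0, sizeof(cb) );
 *	cb.aio_fildes = fd;
 *
 *	if ( aio_fsync( O_SYNC, &cb ) == 0 ) {
 *		while ( aio_error( &cb ) == EINPROGRESS )
 *			;			// or wait with aio_suspend()
 *		(void) aio_return( &cb );	// collect the final status
 *	}
 */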
2137
2138
2139 /*
2140 * is_already_queued - runs through our queues to see if the given
2141 * aiocbp / process is there. Returns TRUE if there is a match
2142 * on any of our aio queues.
2143 * NOTE - callers must hold aio lock!
2144 */
2145
2146 static boolean_t
2147 is_already_queued( struct proc *procp,
2148 user_addr_t aiocbp )
2149 {
2150 aio_workq_entry *entryp;
2151 boolean_t result;
2152
2153 result = FALSE;
2154
2155 /* look for matches on our queue of async IO requests that have completed */
2156 TAILQ_FOREACH( entryp, &procp->aio_doneq, aio_workq_link ) {
2157 if ( aiocbp == entryp->uaiocbp ) {
2158 result = TRUE;
2159 goto ExitThisRoutine;
2160 }
2161 }
2162
2163 /* look for matches on our queue of active async IO requests */
2164 TAILQ_FOREACH( entryp, &procp->aio_activeq, aio_workq_link ) {
2165 if ( aiocbp == entryp->uaiocbp ) {
2166 result = TRUE;
2167 goto ExitThisRoutine;
2168 }
2169 }
2170
2171 /* look for matches on our queue of asynchronous todo work */
2172 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
2173 if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
2174 result = TRUE;
2175 goto ExitThisRoutine;
2176 }
2177 }
2178
2179 /* look for matches on our queue of synchronous todo work */
2180 TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
2181 if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
2182 result = TRUE;
2183 goto ExitThisRoutine;
2184 }
2185 }
2186
2187 ExitThisRoutine:
2188 return( result );
2189
2190 } /* is_already_queued */
2191
2192
2193 /*
2194 * aio initialization
2195 */
2196 __private_extern__ void
2197 aio_init( void )
2198 {
2199 int i;
2200
2201 aio_lock_grp_attr = lck_grp_attr_alloc_init();
2202 lck_grp_attr_setstat(aio_lock_grp_attr);
2203 aio_lock_grp = lck_grp_alloc_init("aio", aio_lock_grp_attr);
2204 aio_lock_attr = lck_attr_alloc_init();
2205 //lck_attr_setdebug(aio_lock_attr);
2206
2207 aio_lock = lck_mtx_alloc_init(aio_lock_grp, aio_lock_attr);
2208
2209 AIO_LOCK;
2210 TAILQ_INIT( &aio_anchor.aio_async_workq );
2211 TAILQ_INIT( &aio_anchor.lio_sync_workq );
2212 aio_anchor.aio_async_workq_count = 0;
2213 aio_anchor.lio_sync_workq_count = 0;
2214 aio_anchor.aio_active_count = 0;
2215 aio_anchor.aio_done_count = 0;
2216 AIO_UNLOCK;
2217
2218 i = sizeof( aio_workq_entry );
2219 aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );
2220
2221 _aio_create_worker_threads( aio_worker_threads );
2222
2223 return;
2224
2225 } /* aio_init */
2226
2227
2228 /*
2229 * aio worker threads created here.
2230 */
2231 __private_extern__ void
2232 _aio_create_worker_threads( int num )
2233 {
2234 int i;
2235
2236 /* create some worker threads to handle the async IO requests */
2237 for ( i = 0; i < num; i++ ) {
2238 thread_t myThread;
2239
2240 myThread = kernel_thread( kernel_task, aio_work_thread );
2241 if ( THREAD_NULL == myThread ) {
2242 printf( "%s - failed to create a work thread \n", __FUNCTION__ );
2243 }
2244 }
2245
2246 return;
2247
2248 } /* _aio_create_worker_threads */
2249
2250 /*
2251 * Return the current activation utask
2252 */
2253 task_t
2254 get_aiotask(void)
2255 {
2256 return ((struct uthread *)get_bsdthread_info(current_thread()))->uu_aio_task;
2257 }
2258
2259
2260 /*
2261 * In the case of an aiocb from a 32-bit process we need to expand some
2262 * longs and pointers to the correct sizes so that downstream code can
2263 * always work on the same type of aiocb (in our case that is a
2264 * user_aiocb).
2265 */
2266 static void
2267 do_munge_aiocb( struct aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
2268 {
2269 the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2270 the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2271 the_user_aiocbp->aio_buf = CAST_USER_ADDR_T(my_aiocbp->aio_buf);
2272 the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2273 the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2274 the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
2275
2276 /* special case here. since we do not know if sigev_value is an */
2277 /* int or a ptr we do NOT cast the ptr to a user_addr_t. This */
2278 /* means if we send this info back to user space we need to remember */
2279 /* sigev_value was not expanded for the 32-bit case. */
2280 /* NOTE - this does NOT affect us since we don't support sigev_value */
2281 /* yet in the aio context. */
2282 //LP64
2283 the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2284 the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
2285 the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2286 my_aiocbp->aio_sigevent.sigev_value.sival_int;
2287 the_user_aiocbp->aio_sigevent.sigev_notify_function =
2288 CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function);
2289 the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2290 CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_attributes);
2291 }