1 /*
2 * Copyright (c) 2003-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29
30 /*
31 * todo:
32 * 1) ramesh is looking into how to replace taking a reference on
33 * the user's map (vm_map_reference()) since it is believed that
34 * would not hold the process for us.
35 * 2) david is looking into a way for us to set the priority of the
36 * worker threads to match that of the user's thread when the
37 * async IO was queued.
38 */
39
40
41 /*
42 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
43 */
44
45 #include <sys/systm.h>
46 #include <sys/fcntl.h>
47 #include <sys/file_internal.h>
48 #include <sys/filedesc.h>
49 #include <sys/kernel.h>
50 #include <sys/vnode_internal.h>
51 #include <sys/malloc.h>
52 #include <sys/mount_internal.h>
53 #include <sys/param.h>
54 #include <sys/proc_internal.h>
55 #include <sys/sysctl.h>
56 #include <sys/unistd.h>
57 #include <sys/user.h>
58
59 #include <sys/aio_kern.h>
60 #include <sys/sysproto.h>
61
62 #include <machine/limits.h>
63
64 #include <mach/mach_types.h>
65 #include <kern/kern_types.h>
66 #include <kern/zalloc.h>
67 #include <kern/task.h>
68 #include <kern/sched_prim.h>
69
70 #include <vm/vm_map.h>
71
72 #include <sys/kdebug.h>
73 #define AIO_work_queued 1
74 #define AIO_worker_wake 2
75 #define AIO_completion_sig 3
76 #define AIO_completion_cleanup_wait 4
77 #define AIO_completion_cleanup_wake 5
78 #define AIO_completion_suspend_wake 6
79 #define AIO_fsync_delay 7
80 #define AIO_cancel 10
81 #define AIO_cancel_async_workq 11
82 #define AIO_cancel_sync_workq 12
83 #define AIO_cancel_activeq 13
84 #define AIO_cancel_doneq 14
85 #define AIO_fsync 20
86 #define AIO_read 30
87 #define AIO_write 40
88 #define AIO_listio 50
89 #define AIO_error 60
90 #define AIO_error_val 61
91 #define AIO_error_activeq 62
92 #define AIO_error_workq 63
93 #define AIO_return 70
94 #define AIO_return_val 71
95 #define AIO_return_activeq 72
96 #define AIO_return_workq 73
97 #define AIO_exec 80
98 #define AIO_exit 90
99 #define AIO_exit_sleep 91
100 #define AIO_close 100
101 #define AIO_close_sleep 101
102 #define AIO_suspend 110
103 #define AIO_suspend_sleep 111
104 #define AIO_worker_thread 120
105
106 #if 0
107 #undef KERNEL_DEBUG
108 #define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
109 #endif
110
111 /*
112 * aio requests queue up on the aio_async_workq or lio_sync_workq (for
113 * lio_listio LIO_WAIT). Requests then move to the per process aio_activeq
114 * (proc.aio_activeq) when one of our worker threads starts the IO.
115 * And finally, requests move to the per process aio_doneq (proc.aio_doneq)
116 * when the IO request completes. The request remains on aio_doneq until
117 * the user process calls aio_return or the process exits; either way, that is our
118 * trigger to release aio resources.
119 */
120 struct aio_anchor_cb
121 {
122 int aio_async_workq_count; /* entries on aio_async_workq */
123 int lio_sync_workq_count; /* entries on lio_sync_workq */
124 int aio_active_count; /* entries on all active queues (proc.aio_activeq) */
125 int aio_done_count; /* entries on all done queues (proc.aio_doneq) */
126 TAILQ_HEAD( , aio_workq_entry ) aio_async_workq;
127 TAILQ_HEAD( , aio_workq_entry ) lio_sync_workq;
128 };
129 typedef struct aio_anchor_cb aio_anchor_cb;
130
131
132 /*
133 * Notes on aio sleep / wake channels.
134 * We currently pick a couple of fields within the proc structure to use as
135 * sleep channels that do not collide with any other kernel routines.
136 * At this time, for binary compatibility reasons, we cannot create new proc fields.
137 */
138 #define AIO_SUSPEND_SLEEP_CHAN p_estcpu
139 #define AIO_CLEANUP_SLEEP_CHAN p_pctcpu
140
141
142 /*
143 * async IO locking macros used to protect critical sections.
144 */
145 #define AIO_LOCK lck_mtx_lock(aio_lock)
146 #define AIO_UNLOCK lck_mtx_unlock(aio_lock)
147
148
149 /*
150 * LOCAL PROTOTYPES
151 */
152 static int aio_active_requests_for_process( struct proc *procp );
153 static boolean_t aio_delay_fsync_request( aio_workq_entry *entryp );
154 static int aio_free_request( aio_workq_entry *entryp, vm_map_t the_map );
155 static int aio_get_all_queues_count( void );
156 static int aio_get_process_count( struct proc *procp );
157 static aio_workq_entry * aio_get_some_work( void );
158 static boolean_t aio_last_group_io( aio_workq_entry *entryp );
159 static void aio_mark_requests( aio_workq_entry *entryp );
160 static int aio_queue_async_request( struct proc *procp,
161 user_addr_t aiocbp,
162 int kindOfIO );
163 static int aio_validate( aio_workq_entry *entryp );
164 static void aio_work_thread( void );
165 static int do_aio_cancel( struct proc *p,
166 int fd,
167 user_addr_t aiocbp,
168 boolean_t wait_for_completion,
169 boolean_t disable_notification );
170 static void do_aio_completion( aio_workq_entry *entryp );
171 static int do_aio_fsync( aio_workq_entry *entryp );
172 static int do_aio_read( aio_workq_entry *entryp );
173 static int do_aio_write( aio_workq_entry *entryp );
174 static void do_munge_aiocb( struct aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
175 static boolean_t is_already_queued( struct proc *procp,
176 user_addr_t aiocbp );
177 static int lio_create_async_entry( struct proc *procp,
178 user_addr_t aiocbp,
179 user_addr_t sigp,
180 long group_tag,
181 aio_workq_entry **entrypp );
182 static int lio_create_sync_entry( struct proc *procp,
183 user_addr_t aiocbp,
184 long group_tag,
185 aio_workq_entry **entrypp );
186
187
188 /*
189 * EXTERNAL PROTOTYPES
190 */
191
192 /* in ...bsd/kern/sys_generic.c */
193 extern int dofileread( struct proc *p, struct fileproc *fp, int fd,
194 user_addr_t bufp, user_size_t nbyte,
195 off_t offset, int flags, user_ssize_t *retval );
196 extern int dofilewrite( struct proc *p, struct fileproc *fp, int fd,
197 user_addr_t bufp, user_size_t nbyte, off_t offset,
198 int flags, user_ssize_t *retval );
199
200 /*
201 * aio external global variables.
202 */
203 extern int aio_max_requests; /* AIO_MAX - configurable */
204 extern int aio_max_requests_per_process; /* AIO_PROCESS_MAX - configurable */
205 extern int aio_worker_threads; /* AIO_THREAD_COUNT - configurable */
206
207
208 /*
209 * aio static variables.
210 */
211 static aio_anchor_cb aio_anchor;
212 static lck_mtx_t * aio_lock;
213 static lck_grp_t * aio_lock_grp;
214 static lck_attr_t * aio_lock_attr;
215 static lck_grp_attr_t * aio_lock_grp_attr;
216 static struct zone *aio_workq_zonep;
217
218
219
220
221 /*
222 * aio_cancel - attempt to cancel one or more async IO requests currently
223 * outstanding against file descriptor uap->fd. If uap->aiocbp is not
224 * NULL then only one specific IO is cancelled (if possible). If uap->aiocbp
225 * is NULL then all outstanding async IO requests for the given file
226 * descriptor are cancelled (if possible).
227 */
228
229 int
230 aio_cancel( struct proc *p, struct aio_cancel_args *uap, int *retval )
231 {
232 struct user_aiocb my_aiocb;
233 int result;
234
235 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START,
236 (int)p, (int)uap->aiocbp, 0, 0, 0 );
237
238 /* quick check to see if there are any async IO requests queued up */
239 AIO_LOCK;
240 result = aio_get_all_queues_count( );
241 AIO_UNLOCK;
242 if ( result < 1 ) {
243 result = EBADF;
244 goto ExitRoutine;
245 }
246
247 *retval = -1;
248 if ( uap->aiocbp != USER_ADDR_NULL ) {
249 if ( !IS_64BIT_PROCESS(p) ) {
250 struct aiocb aiocb32;
251
252 result = copyin( uap->aiocbp, &aiocb32, sizeof(aiocb32) );
253 if ( result == 0 )
254 do_munge_aiocb( &aiocb32, &my_aiocb );
255 } else
256 result = copyin( uap->aiocbp, &my_aiocb, sizeof(my_aiocb) );
257
258 if ( result != 0 ) {
259 result = EAGAIN;
260 goto ExitRoutine;
261 }
262
263 /* NOTE - POSIX standard says a mismatch between the file */
264 /* descriptor passed in and the file descriptor embedded in */
265 /* the aiocb causes unspecified results. We return EBADF in */
266 /* that situation. */
267 if ( uap->fd != my_aiocb.aio_fildes ) {
268 result = EBADF;
269 goto ExitRoutine;
270 }
271 }
272 result = do_aio_cancel( p, uap->fd, uap->aiocbp, FALSE, FALSE );
273
274 if ( result != -1 ) {
275 *retval = result;
276 result = 0;
277 goto ExitRoutine;
278 }
279
280 result = EBADF;
281
282 ExitRoutine:
283 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END,
284 (int)p, (int)uap->aiocbp, result, 0, 0 );
285
286 return( result );
287
288 } /* aio_cancel */
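
/*
 * Illustrative user-space sketch (assumption: an open descriptor "fd" with
 * async IO already queued against it; not part of this file). It shows the
 * three result values aio_cancel() can hand back to the caller.
 *
 *     #include <aio.h>
 *     #include <stdio.h>
 *
 *     static void
 *     cancel_all_for_fd( int fd )
 *     {
 *         switch ( aio_cancel( fd, NULL ) ) {      // NULL aiocb: cancel everything on fd
 *         case AIO_CANCELED:
 *             printf( "all queued requests were cancelled\n" );
 *             break;
 *         case AIO_NOTCANCELED:
 *             printf( "some requests are active and could not be cancelled\n" );
 *             break;
 *         case AIO_ALLDONE:
 *             printf( "all requests had already completed\n" );
 *             break;
 *         default:
 *             perror( "aio_cancel" );              // -1 with errno set (e.g. EBADF)
 *         }
 *     }
 */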
289
290
291 /*
292 * _aio_close - internal function used to clean up async IO requests for
293 * a file descriptor that is closing.
294 * THIS MAY BLOCK.
295 */
296
297 __private_extern__ void
298 _aio_close( struct proc *p, int fd )
299 {
300 int error, count;
301
302 /* quick check to see if there are any async IO requests queued up */
303 AIO_LOCK;
304 count = aio_get_all_queues_count( );
305 AIO_UNLOCK;
306 if ( count < 1 )
307 return;
308
309 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_START,
310 (int)p, fd, 0, 0, 0 );
311
312 /* cancel all async IO requests on our todo queues for this file descriptor */
313 error = do_aio_cancel( p, fd, 0, TRUE, FALSE );
314 if ( error == AIO_NOTCANCELED ) {
315 /*
316 * AIO_NOTCANCELED is returned when we find an aio request for this process
317 * and file descriptor on the active async IO queue. Active requests cannot
318 * be cancelled so we must wait for them to complete. We will get a special
319 * wake up call on our channel used to sleep for ALL active requests to
320 * complete. This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
321 * when we must wait for all active aio requests.
322 */
323
324 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) | DBG_FUNC_NONE,
325 (int)p, fd, 0, 0, 0 );
326
327 tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_close", 0 );
328 }
329
330 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_END,
331 (int)p, fd, 0, 0, 0 );
332
333 return;
334
335 } /* _aio_close */
336
337
338 /*
339 * aio_error - return the error status associated with the async IO
340 * request referred to by uap->aiocbp. The error status is the errno
341 * value that would be set by the corresponding IO request (read, write,
342 * fdatasync, or fsync).
343 */
344
345 int
346 aio_error( struct proc *p, struct aio_error_args *uap, int *retval )
347 {
348 aio_workq_entry *entryp;
349 int error;
350
351 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_START,
352 (int)p, (int)uap->aiocbp, 0, 0, 0 );
353
354 AIO_LOCK;
355
356 /* quick check to see if there are any async IO requests queued up */
357 if ( aio_get_all_queues_count( ) < 1 ) {
358 error = EINVAL;
359 goto ExitRoutine;
360 }
361
362 /* look for a match on our queue of async IO requests that have completed */
363 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
364 if ( entryp->uaiocbp == uap->aiocbp ) {
365 *retval = entryp->errorval;
366 error = 0;
367 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) | DBG_FUNC_NONE,
368 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
369 goto ExitRoutine;
370 }
371 }
372
373 /* look for a match on our queue of active async IO requests */
374 TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
375 if ( entryp->uaiocbp == uap->aiocbp ) {
376 *retval = EINPROGRESS;
377 error = 0;
378 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) | DBG_FUNC_NONE,
379 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
380 goto ExitRoutine;
381 }
382 }
383
384 /* look for a match on our queue of todo work */
385 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
386 if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
387 *retval = EINPROGRESS;
388 error = 0;
389 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_workq)) | DBG_FUNC_NONE,
390 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
391 goto ExitRoutine;
392 }
393 }
394 error = EINVAL;
395
396 ExitRoutine:
397 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_END,
398 (int)p, (int)uap->aiocbp, error, 0, 0 );
399 AIO_UNLOCK;
400
401 return( error );
402
403 } /* aio_error */
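
/*
 * Illustrative user-space sketch (assumption: "cbp" refers to a previously
 * queued request; not part of this file). It shows the three kinds of status
 * aio_error() reports: still in progress, completed OK, or a real errno value.
 *
 *     #include <aio.h>
 *     #include <errno.h>
 *     #include <stdio.h>
 *     #include <string.h>
 *
 *     static int
 *     poll_request( struct aiocb *cbp )
 *     {
 *         int status = aio_error( cbp );
 *         if ( status == EINPROGRESS )
 *             return 0;                            // not done yet, poll again later
 *         if ( status == 0 )
 *             printf( "request completed, %zd bytes\n", aio_return( cbp ) );
 *         else
 *             printf( "request failed: %s\n", strerror( status ) );
 *         return 1;                                // done (successfully or not)
 *     }
 */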
404
405
406 /*
407 * aio_fsync - asynchronously force all IO operations associated
408 * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
409 * queued at the time of the call to the synchronized completion state.
410 * NOTE - we do not support op O_DSYNC at this point since we do not support the
411 * fdatasync() call.
412 */
413
414 int
415 aio_fsync( struct proc *p, struct aio_fsync_args *uap, int *retval )
416 {
417 int error;
418 int fsync_kind;
419
420 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_START,
421 (int)p, (int)uap->aiocbp, uap->op, 0, 0 );
422
423 *retval = 0;
424 /* 0 := O_SYNC for binary backward compatibility with Panther */
425 if (uap->op == O_SYNC || uap->op == 0)
426 fsync_kind = AIO_FSYNC;
427 #if 0 // we don't support fdatasync() call yet
428 else if ( uap->op == O_DSYNC )
429 fsync_kind = AIO_DSYNC;
430 #endif
431 else {
432 *retval = -1;
433 error = EINVAL;
434 goto ExitRoutine;
435 }
436
437 error = aio_queue_async_request( p, uap->aiocbp, fsync_kind );
438 if ( error != 0 )
439 *retval = -1;
440
441 ExitRoutine:
442 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_END,
443 (int)p, (int)uap->aiocbp, error, 0, 0 );
444
445 return( error );
446
447 } /* aio_fsync */
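
/*
 * Illustrative user-space sketch (assumption: "fd" is an open file that has
 * had async writes queued; not part of this file). Only O_SYNC is accepted
 * by this kernel, matching the O_DSYNC note above.
 *
 *     #include <aio.h>
 *     #include <fcntl.h>
 *     #include <string.h>
 *
 *     static int
 *     queue_fsync( int fd, struct aiocb *cbp )
 *     {
 *         memset( cbp, 0, sizeof(*cbp) );
 *         cbp->aio_fildes = fd;
 *         return aio_fsync( O_SYNC, cbp );         // 0 if queued, -1 + errno on failure
 *     }
 */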
448
449
450 /* aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
451 * file descriptor (uap->aiocbp->aio_fildes) into the buffer
452 * (uap->aiocbp->aio_buf).
453 */
454
455 int
456 aio_read( struct proc *p, struct aio_read_args *uap, int *retval )
457 {
458 int error;
459
460 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_START,
461 (int)p, (int)uap->aiocbp, 0, 0, 0 );
462
463 *retval = 0;
464
465 error = aio_queue_async_request( p, uap->aiocbp, AIO_READ );
466 if ( error != 0 )
467 *retval = -1;
468
469 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_END,
470 (int)p, (int)uap->aiocbp, error, 0, 0 );
471
472 return( error );
473
474 } /* aio_read */
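
/*
 * Illustrative user-space sketch (assumptions: an open descriptor "fd" and a
 * caller-owned buffer; not part of this file). The aiocb and the buffer must
 * stay valid until aio_error()/aio_return() say the request is finished.
 *
 *     #include <aio.h>
 *     #include <string.h>
 *
 *     static int
 *     queue_read( int fd, struct aiocb *cbp, void *buf, size_t len, off_t offset )
 *     {
 *         memset( cbp, 0, sizeof(*cbp) );
 *         cbp->aio_fildes = fd;
 *         cbp->aio_buf = buf;
 *         cbp->aio_nbytes = len;
 *         cbp->aio_offset = offset;
 *         return aio_read( cbp );                  // 0 if queued, -1 + errno otherwise
 *     }
 */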
475
476
477 /*
478 * aio_return - return the return status associated with the async IO
479 * request referred to by uap->aiocbp. The return status is the value
480 * that would be returned by the corresponding IO request (read, write,
481 * fdatasync, or fsync). This is where we release kernel resources
482 * held for async IO call associated with the given aiocb pointer.
483 */
484
485 int
486 aio_return( struct proc *p, struct aio_return_args *uap, user_ssize_t *retval )
487 {
488 aio_workq_entry *entryp;
489 int error;
490 boolean_t lock_held;
491
492 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_START,
493 (int)p, (int)uap->aiocbp, 0, 0, 0 );
494
495 AIO_LOCK;
496 lock_held = TRUE;
497 *retval = 0;
498
499 /* quick check to see if there are any async IO requests queued up */
500 if ( aio_get_all_queues_count( ) < 1 ) {
501 error = EINVAL;
502 goto ExitRoutine;
503 }
504
505 /* look for a match on our queue of async IO requests that have completed */
506 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
507 if ( entryp->uaiocbp == uap->aiocbp ) {
508 TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
509 aio_anchor.aio_done_count--;
510 p->aio_done_count--;
511
512 *retval = entryp->returnval;
513
514 /* we cannot free requests that are still completing */
515 if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
516 vm_map_t my_map;
517
518 my_map = entryp->aio_map;
519 entryp->aio_map = VM_MAP_NULL;
520 AIO_UNLOCK;
521 lock_held = FALSE;
522 aio_free_request( entryp, my_map );
523 }
524 else
525 /* tell completion code to free this request */
526 entryp->flags |= AIO_DO_FREE;
527 error = 0;
528 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) | DBG_FUNC_NONE,
529 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
530 goto ExitRoutine;
531 }
532 }
533
534 /* look for a match on our queue of active async IO requests */
535 TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
536 if ( entryp->uaiocbp == uap->aiocbp ) {
537 error = EINPROGRESS;
538 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) | DBG_FUNC_NONE,
539 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
540 goto ExitRoutine;
541 }
542 }
543
544 /* look for a match on our queue of todo work */
545 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
546 if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
547 error = EINPROGRESS;
548 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_workq)) | DBG_FUNC_NONE,
549 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
550 goto ExitRoutine;
551 }
552 }
553 error = EINVAL;
554
555 ExitRoutine:
556 if ( lock_held )
557 AIO_UNLOCK;
558 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_END,
559 (int)p, (int)uap->aiocbp, error, 0, 0 );
560
561 return( error );
562
563 } /* aio_return */
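
/*
 * Illustrative user-space sketch (assumption: aio_error() has already
 * reported the request as no longer EINPROGRESS; not part of this file).
 * aio_return() must be called exactly once per request - it is what lets the
 * kernel release the entry held on the done queue above.
 *
 *     #include <aio.h>
 *     #include <stdio.h>
 *
 *     static ssize_t
 *     reap_request( struct aiocb *cbp )
 *     {
 *         ssize_t nbytes = aio_return( cbp );      // releases the done-queue entry
 *         printf( "IO returned %zd\n", nbytes );   // bytes transferred, or -1 on failure
 *         return nbytes;
 *     }
 */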
564
565
566 /*
567 * _aio_exec - internal function used to clean up async IO requests for
568 * a process that is going away due to exec(). We cancel any async IOs
569 * we can and wait for those already active. We also disable signaling
570 * for cancelled or active aio requests that complete.
571 * This routine MAY block!
572 */
573
574 __private_extern__ void
575 _aio_exec( struct proc *p )
576 {
577
578 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_START,
579 (int)p, 0, 0, 0, 0 );
580
581 _aio_exit( p );
582
583 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_END,
584 (int)p, 0, 0, 0, 0 );
585
586 return;
587
588 } /* _aio_exec */
589
590
591 /*
592 * _aio_exit - internal function used to clean up async IO requests for
593 * a process that is terminating (via exit() or exec() ). We cancel any async IOs
594 * we can and wait for those already active. We also disable signaling
595 * for cancelled or active aio requests that complete. This routine MAY block!
596 */
597
598 __private_extern__ void
599 _aio_exit( struct proc *p )
600 {
601 int error, count;
602 aio_workq_entry *entryp;
603
604 /* quick check to see if there are any async IO requests queued up */
605 AIO_LOCK;
606 count = aio_get_all_queues_count( );
607 AIO_UNLOCK;
608 if ( count < 1 ) {
609 return;
610 }
611
612 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_START,
613 (int)p, 0, 0, 0, 0 );
614
615 /*
616 * cancel async IO requests on the todo work queue and wait for those
617 * already active to complete.
618 */
619 error = do_aio_cancel( p, 0, 0, TRUE, TRUE );
620 if ( error == AIO_NOTCANCELED ) {
621 /*
622 * AIO_NOTCANCELED is returned when we find an aio request for this process
623 * on the active async IO queue. Active requests cannot be cancelled so we
624 * must wait for them to complete. We will get a special wake up call on
625 * our channel used to sleep for ALL active requests to complete. This sleep
626 * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
627 * active aio requests.
628 */
629
630 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) | DBG_FUNC_NONE,
631 (int)p, 0, 0, 0, 0 );
632
633 tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_exit", 0 );
634 }
635
636 /* release all aio resources used by this process */
637 AIO_LOCK;
638 entryp = TAILQ_FIRST( &p->aio_doneq );
639 while ( entryp != NULL ) {
640 aio_workq_entry *next_entryp;
641
642 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
643 TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
644 aio_anchor.aio_done_count--;
645 p->aio_done_count--;
646
647 /* we cannot free requests that are still completing */
648 if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
649 vm_map_t my_map;
650
651 my_map = entryp->aio_map;
652 entryp->aio_map = VM_MAP_NULL;
653 AIO_UNLOCK;
654 aio_free_request( entryp, my_map );
655
656 /* need to start over since aio_doneq may have been */
657 /* changed while we were away. */
658 AIO_LOCK;
659 entryp = TAILQ_FIRST( &p->aio_doneq );
660 continue;
661 }
662 else
663 /* tell completion code to free this request */
664 entryp->flags |= AIO_DO_FREE;
665 entryp = next_entryp;
666 }
667 AIO_UNLOCK;
668
669 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END,
670 (int)p, 0, 0, 0, 0 );
671
672 return;
673
674 } /* _aio_exit */
675
676
677 /*
678 * do_aio_cancel - cancel async IO requests (if possible). We get called by
679 * aio_cancel, close, and at exit.
680 * There are three modes of operation: 1) cancel all async IOs for a process
681 * (fd is 0 and aiocbp is NULL); 2) cancel all async IOs for a file descriptor
682 * (fd is > 0 and aiocbp is NULL); 3) cancel the one async IO associated with
683 * the given aiocbp.
684 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
685 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
686 * target async IO requests, and AIO_ALLDONE if all target async IO requests
687 * were already complete.
688 * WARNING - do not dereference aiocbp in this routine; it may point to user
689 * land data that has not been copied in (when called from aio_cancel() )
690 */
691
692 static int
693 do_aio_cancel( struct proc *p, int fd, user_addr_t aiocbp,
694 boolean_t wait_for_completion, boolean_t disable_notification )
695 {
696 aio_workq_entry *entryp;
697 int result;
698
699 result = -1;
700
701 /* look for a match on our queue of async todo work. */
702 AIO_LOCK;
703 entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
704 while ( entryp != NULL ) {
705 aio_workq_entry *next_entryp;
706
707 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
708 if ( p == entryp->procp ) {
709 if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
710 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
711 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
712 /* we found a match so we remove the entry from the */
713 /* todo work queue and place it on the done queue */
714 TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
715 aio_anchor.aio_async_workq_count--;
716 entryp->errorval = ECANCELED;
717 entryp->returnval = -1;
718 if ( disable_notification )
719 entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
720 result = AIO_CANCELED;
721
722 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) | DBG_FUNC_NONE,
723 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
724
725 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
726 aio_anchor.aio_done_count++;
727 p->aio_done_count++;
728 entryp->flags |= AIO_COMPLETION;
729 AIO_UNLOCK;
730
731 /* do completion processing for this request */
732 do_aio_completion( entryp );
733
734 AIO_LOCK;
735 entryp->flags &= ~AIO_COMPLETION;
736 if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
737 vm_map_t my_map;
738
739 my_map = entryp->aio_map;
740 entryp->aio_map = VM_MAP_NULL;
741 AIO_UNLOCK;
742 aio_free_request( entryp, my_map );
743 }
744 else
745 AIO_UNLOCK;
746
747 if ( aiocbp != USER_ADDR_NULL ) {
748 return( result );
749 }
750
751 /* need to start over since aio_async_workq may have been */
752 /* changed while we were away doing completion processing. */
753 AIO_LOCK;
754 entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
755 continue;
756 }
757 }
758 entryp = next_entryp;
759 } /* while... */
760
761 /*
762 * look for a match on our queue of synchronous todo work. This will
763 * be a rare occurrence but could happen if a process is terminated while
764 * processing a lio_listio call.
765 */
766 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
767 while ( entryp != NULL ) {
768 aio_workq_entry *next_entryp;
769
770 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
771 if ( p == entryp->procp ) {
772 if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
773 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
774 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
775 /* we found a match so we remove the entry from the */
776 /* todo work queue and place it on the done queue */
777 TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
778 aio_anchor.lio_sync_workq_count--;
779 entryp->errorval = ECANCELED;
780 entryp->returnval = -1;
781 if ( disable_notification )
782 entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
783 result = AIO_CANCELED;
784
785 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_sync_workq)) | DBG_FUNC_NONE,
786 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
787
788 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
789 aio_anchor.aio_done_count++;
790 p->aio_done_count++;
791 if ( aiocbp != USER_ADDR_NULL ) {
792 AIO_UNLOCK;
793 return( result );
794 }
795 }
796 }
797 entryp = next_entryp;
798 } /* while... */
799
800 /*
801 * look for a match on our queue of active async IO requests and
802 * return AIO_NOTCANCELED result.
803 */
804 TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
805 if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
806 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
807 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
808 result = AIO_NOTCANCELED;
809
810 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE,
811 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
812
813 if ( wait_for_completion )
814 entryp->flags |= AIO_WAITING; /* flag for special completion processing */
815 if ( disable_notification )
816 entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
817 if ( aiocbp != USER_ADDR_NULL ) {
818 AIO_UNLOCK;
819 return( result );
820 }
821 }
822 }
823
824 /*
825 * if we didn't find any matches on the todo or active queues then look for a
826 * match on our queue of async IO requests that have completed and if found
827 * return AIO_ALLDONE result.
828 */
829 if ( result == -1 ) {
830 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
831 if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
832 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
833 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
834 result = AIO_ALLDONE;
835
836 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE,
837 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
838
839 if ( aiocbp != USER_ADDR_NULL ) {
840 AIO_UNLOCK;
841 return( result );
842 }
843 }
844 }
845 }
846 AIO_UNLOCK;
847
848 return( result );
849
850 } /* do_aio_cancel */
851
852
853 /*
854 * aio_suspend - suspend the calling thread until at least one of the async
855 * IO operations referenced by uap->aiocblist has completed, until a signal
856 * interrupts the function, or the optional uap->timeoutp time interval has
857 * passed.
858 * Returns 0 if one or more async IOs have completed, else -1 with errno
859 * set appropriately: EAGAIN if the timeout elapses or EINTR if a signal
860 * wakes us up.
861 */
862
863 int
864 aio_suspend( struct proc *p, struct aio_suspend_args *uap, int *retval )
865 {
866 int error;
867 int i, count;
868 uint64_t abstime;
869 struct user_timespec ts;
870 aio_workq_entry *entryp;
871 user_addr_t *aiocbpp;
872
873 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START,
874 (int)p, uap->nent, 0, 0, 0 );
875
876 *retval = -1;
877 abstime = 0;
878 aiocbpp = NULL;
879
880 /* quick check to see if there are any async IO requests queued up */
881 AIO_LOCK;
882 count = aio_get_all_queues_count( );
883 AIO_UNLOCK;
884 if ( count < 1 ) {
885 error = EINVAL;
886 goto ExitThisRoutine;
887 }
888
889 if ( uap->nent < 1 || uap->nent > aio_max_requests_per_process ) {
890 error = EINVAL;
891 goto ExitThisRoutine;
892 }
893
894 if ( uap->timeoutp != USER_ADDR_NULL ) {
895 if ( proc_is64bit(p) ) {
896 error = copyin( uap->timeoutp, &ts, sizeof(ts) );
897 }
898 else {
899 struct timespec temp;
900 error = copyin( uap->timeoutp, &temp, sizeof(temp) );
901 if ( error == 0 ) {
902 ts.tv_sec = temp.tv_sec;
903 ts.tv_nsec = temp.tv_nsec;
904 }
905 }
906 if ( error != 0 ) {
907 error = EAGAIN;
908 goto ExitThisRoutine;
909 }
910
911 if ( ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000 ) {
912 error = EINVAL;
913 goto ExitThisRoutine;
914 }
915
916 nanoseconds_to_absolutetime( (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
917 &abstime );
918 clock_absolutetime_interval_to_deadline( abstime, &abstime );
919 }
920
921 /* we reserve enough space for largest possible pointer size */
922 MALLOC( aiocbpp, user_addr_t *, (uap->nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK );
923 if ( aiocbpp == NULL ) {
924 error = EAGAIN;
925 goto ExitThisRoutine;
926 }
927
928 /* copyin our aiocb pointers from list */
929 error = copyin( uap->aiocblist, aiocbpp,
930 proc_is64bit(p) ? (uap->nent * sizeof(user_addr_t))
931 : (uap->nent * sizeof(uintptr_t)) );
932 if ( error != 0 ) {
933 error = EAGAIN;
934 goto ExitThisRoutine;
935 }
936
937 /* we depend on a list of user_addr_t's so we need to munge and expand */
938 /* when these pointers came from a 32-bit process */
939 if ( !proc_is64bit(p) && sizeof(uintptr_t) < sizeof(user_addr_t) ) {
940 /* position to the last entry and work back from there */
941 uintptr_t *my_ptrp = ((uintptr_t *)aiocbpp) + (uap->nent - 1);
942 user_addr_t *my_addrp = aiocbpp + (uap->nent - 1);
943 for (i = 0; i < uap->nent; i++, my_ptrp--, my_addrp--) {
944 *my_addrp = (user_addr_t) (*my_ptrp);
945 }
946 }
947
948 /* check list of aio requests to see if any have completed */
949 AIO_LOCK;
950 for ( i = 0; i < uap->nent; i++ ) {
951 user_addr_t aiocbp;
952
953 /* NULL elements are legal so check for 'em */
954 aiocbp = *(aiocbpp + i);
955 if ( aiocbp == USER_ADDR_NULL )
956 continue;
957
958 /* return immediately if any aio request in the list is done */
959 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
960 if ( entryp->uaiocbp == aiocbp ) {
961 *retval = 0;
962 error = 0;
963 AIO_UNLOCK;
964 goto ExitThisRoutine;
965 }
966 }
967 } /* for ( ; i < uap->nent; ) */
968
969 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE,
970 (int)p, uap->nent, 0, 0, 0 );
971
972 /*
973 * wait for an async IO to complete or a signal fires or timeout expires.
974 * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
975 * interrupts us. If an async IO completes before a signal fires or our
976 * timeout expires, we get a wakeup call from aio_work_thread().
977 */
978 assert_wait_deadline( (event_t) &p->AIO_SUSPEND_SLEEP_CHAN, THREAD_ABORTSAFE, abstime );
979 AIO_UNLOCK;
980
981 error = thread_block( THREAD_CONTINUE_NULL );
982
983 if ( error == THREAD_AWAKENED ) {
984 /* got our wakeup call from aio_work_thread() */
985 *retval = 0;
986 error = 0;
987 }
988 else if ( error == THREAD_TIMED_OUT ) {
989 /* our timeout expired */
990 error = EAGAIN;
991 }
992 else {
993 /* we were interrupted */
994 error = EINTR;
995 }
996
997 ExitThisRoutine:
998 if ( aiocbpp != NULL )
999 FREE( aiocbpp, M_TEMP );
1000
1001 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_END,
1002 (int)p, uap->nent, error, 0, 0 );
1003
1004 return( error );
1005
1006 } /* aio_suspend */
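
/*
 * Illustrative user-space sketch (assumption: "cbp" refers to a request that
 * has already been queued; not part of this file). A NULL timeout blocks
 * until completion or a signal; a short timespec turns the call into a
 * bounded wait that fails with EAGAIN, matching the kernel code above.
 *
 *     #include <aio.h>
 *     #include <errno.h>
 *     #include <time.h>
 *
 *     static int
 *     wait_up_to_one_second( struct aiocb *cbp )
 *     {
 *         const struct aiocb *list[1] = { cbp };
 *         struct timespec ts = { 1, 0 };           // 1 second, 0 nanoseconds
 *
 *         if ( aio_suspend( list, 1, &ts ) == 0 )
 *             return 0;                            // at least one request is done
 *         return errno;                            // EAGAIN (timeout) or EINTR (signal)
 *     }
 */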
1007
1008
1009 /* aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
1010 * file descriptor (uap->aiocbp->aio_fildes) from the buffer
1011 * (uap->aiocbp->aio_buf).
1012 */
1013
1014 int
1015 aio_write( struct proc *p, struct aio_write_args *uap, int *retval )
1016 {
1017 int error;
1018
1019 *retval = 0;
1020
1021 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_START,
1022 (int)p, (int)uap->aiocbp, 0, 0, 0 );
1023
1024 error = aio_queue_async_request( p, uap->aiocbp, AIO_WRITE );
1025 if ( error != 0 )
1026 *retval = -1;
1027
1028 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_END,
1029 (int)p, (int)uap->aiocbp, error, 0, 0 );
1030
1031 return( error );
1032
1033 } /* aio_write */
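
/*
 * Illustrative user-space sketch (assumptions: an open descriptor "fd" and a
 * caller-owned buffer that stays valid until the IO is reaped; not part of
 * this file). Mirror image of the aio_read() sketch above.
 *
 *     #include <aio.h>
 *     #include <string.h>
 *
 *     static int
 *     queue_write( int fd, struct aiocb *cbp, const void *buf, size_t len, off_t offset )
 *     {
 *         memset( cbp, 0, sizeof(*cbp) );
 *         cbp->aio_fildes = fd;
 *         cbp->aio_buf = (volatile void *) buf;    // aio_buf is volatile void * in the aiocb
 *         cbp->aio_nbytes = len;
 *         cbp->aio_offset = offset;
 *         return aio_write( cbp );                 // 0 if queued, -1 + errno otherwise
 *     }
 */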
1034
1035
1036 /*
1037 * lio_listio - initiate a list of IO requests. We process the list of aiocbs
1038 * either synchronously (mode == LIO_WAIT) or asynchronously (mode == LIO_NOWAIT).
1039 * The caller gets error and return status for each aiocb in the list via aio_error
1040 * and aio_return. We must keep completed requests until released by the
1041 * aio_return call.
1042 */
1043
1044 int
1045 lio_listio( struct proc *p, struct lio_listio_args *uap, int *retval )
1046 {
1047 int i;
1048 int call_result;
1049 int result;
1050 long group_tag;
1051 aio_workq_entry * *entryp_listp;
1052 user_addr_t *aiocbpp;
1053
1054 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START,
1055 (int)p, uap->nent, uap->mode, 0, 0 );
1056
1057 entryp_listp = NULL;
1058 aiocbpp = NULL;
1059 call_result = -1;
1060 *retval = -1;
1061 if ( !(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT) ) {
1062 call_result = EINVAL;
1063 goto ExitRoutine;
1064 }
1065
1066 if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
1067 call_result = EINVAL;
1068 goto ExitRoutine;
1069 }
1070
1071 /*
1072 * we use group_tag to mark IO requests for delayed completion processing
1073 * which means we wait until all IO requests in the group have completed
1074 * before we either return to the caller when mode is LIO_WAIT or signal
1075 * user when mode is LIO_NOWAIT.
1076 */
1077 group_tag = random();
1078
1079 /*
1080 * allocate a list of aio_workq_entry pointers that we will use to queue
1081 * up all our requests at once while holding our lock.
1082 */
1083 MALLOC( entryp_listp, void *, (uap->nent * sizeof(aio_workq_entry *)), M_TEMP, M_WAITOK );
1084 if ( entryp_listp == NULL ) {
1085 call_result = EAGAIN;
1086 goto ExitRoutine;
1087 }
1088
1089 /* we reserve enough space for largest possible pointer size */
1090 MALLOC( aiocbpp, user_addr_t *, (uap->nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK );
1091 if ( aiocbpp == NULL ) {
1092 call_result = EAGAIN;
1093 goto ExitRoutine;
1094 }
1095
1096 /* copyin our aiocb pointers from list */
1097 result = copyin( uap->aiocblist, aiocbpp,
1098 IS_64BIT_PROCESS(p) ? (uap->nent * sizeof(user_addr_t))
1099 : (uap->nent * sizeof(uintptr_t)) );
1100 if ( result != 0 ) {
1101 call_result = EAGAIN;
1102 goto ExitRoutine;
1103 }
1104
1105 /* we depend on a list of user_addr_t's so we need to munge and expand */
1106 /* when these pointers came from a 32-bit process */
1107 if ( !IS_64BIT_PROCESS(p) && sizeof(uintptr_t) < sizeof(user_addr_t) ) {
1108 /* position to the last entry and work back from there */
1109 uintptr_t *my_ptrp = ((uintptr_t *)aiocbpp) + (uap->nent - 1);
1110 user_addr_t *my_addrp = aiocbpp + (uap->nent - 1);
1111 for (i = 0; i < uap->nent; i++, my_ptrp--, my_addrp--) {
1112 *my_addrp = (user_addr_t) (*my_ptrp);
1113 }
1114 }
1115
1116 /* process list of aio requests */
1117 for ( i = 0; i < uap->nent; i++ ) {
1118 user_addr_t my_aiocbp;
1119
1120 *(entryp_listp + i) = NULL;
1121 my_aiocbp = *(aiocbpp + i);
1122
1123 /* NULL elements are legal so check for 'em */
1124 if ( my_aiocbp == USER_ADDR_NULL )
1125 continue;
1126
1127 if ( uap->mode == LIO_NOWAIT )
1128 result = lio_create_async_entry( p, my_aiocbp, uap->sigp,
1129 group_tag, (entryp_listp + i) );
1130 else
1131 result = lio_create_sync_entry( p, my_aiocbp, group_tag,
1132 (entryp_listp + i) );
1133
1134 if ( result != 0 && call_result == -1 )
1135 call_result = result;
1136 }
1137
1138 /*
1139 * we need to protect this section since we do not want any of these grouped
1140 * IO requests to begin until we have them all on the queue.
1141 */
1142 AIO_LOCK;
1143 for ( i = 0; i < uap->nent; i++ ) {
1144 aio_workq_entry *entryp;
1145
1146 /* NULL elements are legal so check for 'em */
1147 entryp = *(entryp_listp + i);
1148 if ( entryp == NULL )
1149 continue;
1150
1151 /* check our aio limits to throttle bad or rude user land behavior */
1152 if ( aio_get_all_queues_count( ) >= aio_max_requests ||
1153 aio_get_process_count( entryp->procp ) >= aio_max_requests_per_process ||
1154 is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
1155 vm_map_t my_map;
1156
1157 my_map = entryp->aio_map;
1158 entryp->aio_map = VM_MAP_NULL;
1159 if ( call_result == -1 )
1160 call_result = EAGAIN;
1161 AIO_UNLOCK;
1162 aio_free_request( entryp, my_map );
1163 AIO_LOCK;
1164 continue;
1165 }
1166
1167 /* place the request on the appropriate queue */
1168 if ( uap->mode == LIO_NOWAIT ) {
1169 TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1170 aio_anchor.aio_async_workq_count++;
1171
1172 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
1173 (int)p, (int)entryp->uaiocbp, 0, 0, 0 );
1174 }
1175 else {
1176 TAILQ_INSERT_TAIL( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
1177 aio_anchor.lio_sync_workq_count++;
1178 }
1179 }
1180
1181 if ( uap->mode == LIO_NOWAIT ) {
1182 /* caller does not want to wait so we'll fire off a worker thread and return */
1183 wakeup_one( (caddr_t) &aio_anchor.aio_async_workq );
1184 }
1185 else {
1186 aio_workq_entry *entryp;
1187 int error;
1188
1189 /*
1190 * mode is LIO_WAIT - handle the IO requests now.
1191 */
1192 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
1193 while ( entryp != NULL ) {
1194 if ( p == entryp->procp && group_tag == entryp->group_tag ) {
1195
1196 TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
1197 aio_anchor.lio_sync_workq_count--;
1198 AIO_UNLOCK;
1199
1200 if ( (entryp->flags & AIO_READ) != 0 ) {
1201 error = do_aio_read( entryp );
1202 }
1203 else if ( (entryp->flags & AIO_WRITE) != 0 ) {
1204 error = do_aio_write( entryp );
1205 }
1206 else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1207 error = do_aio_fsync( entryp );
1208 }
1209 else {
1210 printf( "%s - unknown aio request - flags 0x%02X \n",
1211 __FUNCTION__, entryp->flags );
1212 error = EINVAL;
1213 }
1214 entryp->errorval = error;
1215 if ( error != 0 && call_result == -1 )
1216 call_result = EIO;
1217
1218 AIO_LOCK;
1219 /* we're done with the IO request so move it on the done queue */
1220 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
1221 aio_anchor.aio_done_count++;
1222 p->aio_done_count++;
1223
1224 /* need to start over since lio_sync_workq may have been changed while we */
1225 /* were away doing the IO. */
1226 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
1227 continue;
1228 } /* p == entryp->procp */
1229
1230 entryp = TAILQ_NEXT( entryp, aio_workq_link );
1231 } /* while ( entryp != NULL ) */
1232 } /* uap->mode == LIO_WAIT */
1233 AIO_UNLOCK;
1234
1235 /* call_result == -1 means we had no trouble queueing up requests */
1236 if ( call_result == -1 ) {
1237 call_result = 0;
1238 *retval = 0;
1239 }
1240
1241 ExitRoutine:
1242 if ( entryp_listp != NULL )
1243 FREE( entryp_listp, M_TEMP );
1244 if ( aiocbpp != NULL )
1245 FREE( aiocbpp, M_TEMP );
1246
1247 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END,
1248 (int)p, call_result, 0, 0, 0 );
1249
1250 return( call_result );
1251
1252 } /* lio_listio */
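
/*
 * Illustrative user-space sketch (assumptions: two prepared aiocbs "rd" and
 * "wr" with aio_lio_opcode set to LIO_READ / LIO_WRITE; not part of this
 * file). LIO_WAIT makes the call synchronous, which is the lio_sync_workq
 * path above; LIO_NOWAIT would queue both and return immediately.
 *
 *     #include <aio.h>
 *     #include <stdio.h>
 *
 *     static int
 *     submit_pair( struct aiocb *rd, struct aiocb *wr )
 *     {
 *         struct aiocb *list[2] = { rd, wr };
 *
 *         if ( lio_listio( LIO_WAIT, list, 2, NULL ) != 0 ) {
 *             perror( "lio_listio" );              // EIO means at least one request failed
 *             return -1;
 *         }
 *         // with LIO_WAIT both requests are done; pick up each result
 *         printf( "read %zd, wrote %zd\n", aio_return( rd ), aio_return( wr ) );
 *         return 0;
 *     }
 */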
1253
1254
1255 /*
1256 * aio worker thread. this is where all the real work gets done.
1257 * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
1258 * after new work is queued up.
1259 */
1260
1261 static void
1262 aio_work_thread( void )
1263 {
1264 aio_workq_entry *entryp;
1265
1266 for( ;; ) {
1267 AIO_LOCK;
1268 entryp = aio_get_some_work();
1269 if ( entryp == NULL ) {
1270 /*
1271 * aio worker threads wait for some work to get queued up
1272 * by aio_queue_async_request. Once some work gets queued
1273 * it will wake up one of these worker threads just before
1274 * returning to our caller in user land.
1275 */
1276 assert_wait( (event_t) &aio_anchor.aio_async_workq, THREAD_UNINT );
1277 AIO_UNLOCK;
1278
1279 thread_block( (thread_continue_t)aio_work_thread );
1280 /* NOT REACHED */
1281 }
1282 else {
1283 int error;
1284 vm_map_t currentmap;
1285 vm_map_t oldmap = VM_MAP_NULL;
1286 task_t oldaiotask = TASK_NULL;
1287 struct uthread *uthreadp = NULL;
1288
1289 AIO_UNLOCK;
1290
1291 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START,
1292 (int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0 );
1293
1294 /*
1295 * Assume the target's address space identity for the duration
1296 * of the IO.
1297 */
1298 currentmap = get_task_map( (current_proc())->task );
1299 if ( currentmap != entryp->aio_map ) {
1300 uthreadp = (struct uthread *) get_bsdthread_info(current_thread());
1301 oldaiotask = uthreadp->uu_aio_task;
1302 uthreadp->uu_aio_task = entryp->procp->task;
1303 oldmap = vm_map_switch( entryp->aio_map );
1304 }
1305
1306 if ( (entryp->flags & AIO_READ) != 0 ) {
1307 error = do_aio_read( entryp );
1308 }
1309 else if ( (entryp->flags & AIO_WRITE) != 0 ) {
1310 error = do_aio_write( entryp );
1311 }
1312 else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1313 error = do_aio_fsync( entryp );
1314 }
1315 else {
1316 printf( "%s - unknown aio request - flags 0x%02X \n",
1317 __FUNCTION__, entryp->flags );
1318 error = EINVAL;
1319 }
1320 entryp->errorval = error;
1321 if ( currentmap != entryp->aio_map ) {
1322 (void) vm_map_switch( oldmap );
1323 uthreadp->uu_aio_task = oldaiotask;
1324 }
1325
1326 /* we're done with the IO request so pop it off the active queue and */
1327 /* push it on the done queue */
1328 AIO_LOCK;
1329 TAILQ_REMOVE( &entryp->procp->aio_activeq, entryp, aio_workq_link );
1330 aio_anchor.aio_active_count--;
1331 entryp->procp->aio_active_count--;
1332 TAILQ_INSERT_TAIL( &entryp->procp->aio_doneq, entryp, aio_workq_link );
1333 aio_anchor.aio_done_count++;
1334 entryp->procp->aio_done_count++;
1335 entryp->flags |= AIO_COMPLETION;
1336
1337 /* remove our reference to the user land map. */
1338 if ( VM_MAP_NULL != entryp->aio_map ) {
1339 vm_map_t my_map;
1340
1341 my_map = entryp->aio_map;
1342 entryp->aio_map = VM_MAP_NULL;
1343 AIO_UNLOCK; /* must unlock before calling vm_map_deallocate() */
1344 vm_map_deallocate( my_map );
1345 }
1346 else {
1347 AIO_UNLOCK;
1348 }
1349
1350 do_aio_completion( entryp );
1351
1352 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END,
1353 (int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval,
1354 entryp->returnval, 0 );
1355
1356 AIO_LOCK;
1357 entryp->flags &= ~AIO_COMPLETION;
1358 if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
1359 vm_map_t my_map;
1360
1361 my_map = entryp->aio_map;
1362 entryp->aio_map = VM_MAP_NULL;
1363 AIO_UNLOCK;
1364 aio_free_request( entryp, my_map );
1365 }
1366 else
1367 AIO_UNLOCK;
1368 }
1369 } /* for ( ;; ) */
1370
1371 /* NOT REACHED */
1372
1373 } /* aio_work_thread */
1374
1375
1376 /*
1377 * aio_get_some_work - get the next async IO request that is ready to be executed.
1378 * aio_fsync complicates matters a bit since we cannot do the fsync until all async
1379 * IO requests that were queued at the time the aio_fsync call came in have completed.
1380 * NOTE - AIO_LOCK must be held by caller
1381 */
1382
1383 static aio_workq_entry *
1384 aio_get_some_work( void )
1385 {
1386 aio_workq_entry *entryp;
1387
1388 /* pop some work off the work queue and add to our active queue */
1389 for ( entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
1390 entryp != NULL;
1391 entryp = TAILQ_NEXT( entryp, aio_workq_link ) ) {
1392
1393 if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1394 /* leave aio_fsync calls on the work queue if there are IO */
1395 /* requests on the active queue for the same file descriptor. */
1396 if ( aio_delay_fsync_request( entryp ) ) {
1397
1398 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) | DBG_FUNC_NONE,
1399 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1400 continue;
1401 }
1402 }
1403 break;
1404 }
1405
1406 if ( entryp != NULL ) {
1407 TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1408 aio_anchor.aio_async_workq_count--;
1409 TAILQ_INSERT_TAIL( &entryp->procp->aio_activeq, entryp, aio_workq_link );
1410 aio_anchor.aio_active_count++;
1411 entryp->procp->aio_active_count++;
1412 }
1413
1414 return( entryp );
1415
1416 } /* aio_get_some_work */
1417
1418
1419 /*
1420 * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed at
1421 * this time. Delay will happen when there are any active IOs for the same file
1422 * descriptor that were queued at the time the aio_fsync call was queued.
1423 * NOTE - AIO_LOCK must be held by caller
1424 */
1425 static boolean_t
1426 aio_delay_fsync_request( aio_workq_entry *entryp )
1427 {
1428 aio_workq_entry *my_entryp;
1429
1430 TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1431 if ( my_entryp->fsyncp != USER_ADDR_NULL &&
1432 entryp->uaiocbp == my_entryp->fsyncp &&
1433 entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1434 return( TRUE );
1435 }
1436 }
1437
1438 return( FALSE );
1439
1440 } /* aio_delay_fsync_request */
1441
1442
1443 /*
1444 * aio_queue_async_request - queue up an async IO request on our work queue then
1445 * wake up one of our worker threads to do the actual work. We get a reference
1446 * to our caller's user land map in order to keep it around while we are
1447 * processing the request.
1448 */
1449
1450 static int
1451 aio_queue_async_request( struct proc *procp, user_addr_t aiocbp, int kindOfIO )
1452 {
1453 aio_workq_entry *entryp;
1454 int result;
1455
1456 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1457 if ( entryp == NULL ) {
1458 result = EAGAIN;
1459 goto error_exit;
1460 }
1461 bzero( entryp, sizeof(*entryp) );
1462
1463 /* fill in the rest of the aio_workq_entry */
1464 entryp->procp = procp;
1465 entryp->uaiocbp = aiocbp;
1466 entryp->flags |= kindOfIO;
1467 entryp->aio_map = VM_MAP_NULL;
1468
1469 if ( !IS_64BIT_PROCESS(procp) ) {
1470 struct aiocb aiocb32;
1471
1472 result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
1473 if ( result == 0 )
1474 do_munge_aiocb( &aiocb32, &entryp->aiocb );
1475 } else
1476 result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1477
1478 if ( result != 0 ) {
1479 result = EAGAIN;
1480 goto error_exit;
1481 }
1482
1483 /* do some more validation on the aiocb and embedded file descriptor */
1484 result = aio_validate( entryp );
1485 if ( result != 0 )
1486 goto error_exit;
1487
1488 /* get a reference to the user land map in order to keep it around */
1489 entryp->aio_map = get_task_map( procp->task );
1490 vm_map_reference( entryp->aio_map );
1491
1492 AIO_LOCK;
1493
1494 if ( is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
1495 AIO_UNLOCK;
1496 result = EAGAIN;
1497 goto error_exit;
1498 }
1499
1500 /* check our aio limits to throttle bad or rude user land behavior */
1501 if ( aio_get_all_queues_count( ) >= aio_max_requests ||
1502 aio_get_process_count( procp ) >= aio_max_requests_per_process ) {
1503 AIO_UNLOCK;
1504 result = EAGAIN;
1505 goto error_exit;
1506 }
1507
1508 /*
1509 * aio_fsync calls sync up all async IO requests queued at the time
1510 * the aio_fsync call was made. So we mark each currently queued async
1511 * IO with a matching file descriptor as one that must complete before we do the
1512 * fsync. We set the fsyncp field of each matching async IO
1513 * request with the aiocb pointer passed in on the aio_fsync call to
1514 * know which IOs must complete before we process the aio_fsync call.
1515 */
1516 if ( (kindOfIO & AIO_FSYNC) != 0 )
1517 aio_mark_requests( entryp );
1518
1519 /* queue up on our aio asynchronous work queue */
1520 TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1521 aio_anchor.aio_async_workq_count++;
1522
1523 wakeup_one( (caddr_t) &aio_anchor.aio_async_workq );
1524 AIO_UNLOCK;
1525
1526 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
1527 (int)procp, (int)aiocbp, 0, 0, 0 );
1528
1529 return( 0 );
1530
1531 error_exit:
1532 if ( entryp != NULL ) {
1533 /* this entry has not been queued up so no worries about unlocked */
1534 /* state and aio_map */
1535 aio_free_request( entryp, entryp->aio_map );
1536 }
1537
1538 return( result );
1539
1540 } /* aio_queue_async_request */
1541
1542
1543 /*
1544 * lio_create_async_entry - allocate an aio_workq_entry and fill it in.
1545 * If all goes well return 0 and pass the aio_workq_entry pointer back to
1546 * our caller. We get a reference to our caller's user land map in order to keep
1547 * it around while we are processing the request.
1548 * lio_listio calls behave differently at completion: they do completion notification
1549 * when all async IO requests have completed. We use group_tag to tag IO requests
1550 * that behave in this delayed notification manner.
1551 */
1552
1553 static int
1554 lio_create_async_entry( struct proc *procp, user_addr_t aiocbp,
1555 user_addr_t sigp, long group_tag,
1556 aio_workq_entry **entrypp )
1557 {
1558 aio_workq_entry *entryp;
1559 int result;
1560
1561 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1562 if ( entryp == NULL ) {
1563 result = EAGAIN;
1564 goto error_exit;
1565 }
1566 bzero( entryp, sizeof(*entryp) );
1567
1568 /* fill in the rest of the aio_workq_entry */
1569 entryp->procp = procp;
1570 entryp->uaiocbp = aiocbp;
1571 entryp->flags |= AIO_LIO;
1572 entryp->group_tag = group_tag;
1573 entryp->aio_map = VM_MAP_NULL;
1574
1575 if ( !IS_64BIT_PROCESS(procp) ) {
1576 struct aiocb aiocb32;
1577
1578 result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
1579 if ( result == 0 )
1580 do_munge_aiocb( &aiocb32, &entryp->aiocb );
1581 } else
1582 result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1583
1584 if ( result != 0 ) {
1585 result = EAGAIN;
1586 goto error_exit;
1587 }
1588
1589 /* look for lio_listio LIO_NOP requests and ignore them. */
1590 /* Not really an error, but we need to free our aio_workq_entry. */
1591 if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
1592 result = 0;
1593 goto error_exit;
1594 }
1595
1596 /* use sigevent passed in to lio_listio for each of our calls, but only */
1597 /* do completion notification after the last request completes. */
1598 if ( sigp != USER_ADDR_NULL ) {
1599 if ( !IS_64BIT_PROCESS(procp) ) {
1600 struct sigevent sigevent32;
1601
1602 result = copyin( sigp, &sigevent32, sizeof(sigevent32) );
1603 if ( result == 0 ) {
1604 /* also need to munge aio_sigevent since it contains pointers */
1605 /* special case here. since we do not know if sigev_value is an */
1606 /* int or a ptr we do NOT cast the ptr to a user_addr_t. This */
1607 /* means if we send this info back to user space we need to remember */
1608 /* sigev_value was not expanded for the 32-bit case. */
1609 /* NOTE - this does NOT affect us since we don't support sigev_value */
1610 /* yet in the aio context. */
1611 //LP64
1612 entryp->aiocb.aio_sigevent.sigev_notify = sigevent32.sigev_notify;
1613 entryp->aiocb.aio_sigevent.sigev_signo = sigevent32.sigev_signo;
1614 entryp->aiocb.aio_sigevent.sigev_value.size_equivalent.sival_int =
1615 sigevent32.sigev_value.sival_int;
1616 entryp->aiocb.aio_sigevent.sigev_notify_function =
1617 CAST_USER_ADDR_T(sigevent32.sigev_notify_function);
1618 entryp->aiocb.aio_sigevent.sigev_notify_attributes =
1619 CAST_USER_ADDR_T(sigevent32.sigev_notify_attributes);
1620 }
1621 } else
1622 result = copyin( sigp, &entryp->aiocb.aio_sigevent, sizeof(entryp->aiocb.aio_sigevent) );
1623
1624 if ( result != 0 ) {
1625 result = EAGAIN;
1626 goto error_exit;
1627 }
1628 }
1629
1630 /* do some more validation on the aiocb and embedded file descriptor */
1631 result = aio_validate( entryp );
1632 if ( result != 0 )
1633 goto error_exit;
1634
1635 /* get a reference to the user land map in order to keep it around */
1636 entryp->aio_map = get_task_map( procp->task );
1637 vm_map_reference( entryp->aio_map );
1638
1639 *entrypp = entryp;
1640 return( 0 );
1641
1642 error_exit:
1643 if ( entryp != NULL )
1644 zfree( aio_workq_zonep, entryp );
1645
1646 return( result );
1647
1648 } /* lio_create_async_entry */
1649
1650
1651 /*
1652 * aio_mark_requests - aio_fsync calls synchronize file data for all queued async IO
1653 * requests at the moment the aio_fsync call is queued. We use aio_workq_entry.fsyncp
1654 * to mark each async IO that must complete before the fsync is done. We use the uaiocbp
1655 * field from the aio_fsync call as the aio_workq_entry.fsyncp in marked requests.
1656 * NOTE - AIO_LOCK must be held by caller
1657 */
1658
1659 static void
1660 aio_mark_requests( aio_workq_entry *entryp )
1661 {
1662 aio_workq_entry *my_entryp;
1663
1664 TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1665 if ( entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1666 my_entryp->fsyncp = entryp->uaiocbp;
1667 }
1668 }
1669
1670 TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
1671 if ( entryp->procp == my_entryp->procp &&
1672 entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1673 my_entryp->fsyncp = entryp->uaiocbp;
1674 }
1675 }
1676
1677 } /* aio_mark_requests */
1678
1679
1680 /*
1681 * lio_create_sync_entry - allocate an aio_workq_entry and fill it in.
1682 * If all goes well return 0 and pass the aio_workq_entry pointer back to
1683 * our caller.
1684 * lio_listio calls behave differently at completion: they do completion notification
1685 * when all async IO requests have completed. We use group_tag to tag IO requests
1686 * that behave in this delayed notification manner.
1687 */
1688
1689 static int
1690 lio_create_sync_entry( struct proc *procp, user_addr_t aiocbp,
1691 long group_tag, aio_workq_entry **entrypp )
1692 {
1693 aio_workq_entry *entryp;
1694 int result;
1695
1696 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1697 if ( entryp == NULL ) {
1698 result = EAGAIN;
1699 goto error_exit;
1700 }
1701 bzero( entryp, sizeof(*entryp) );
1702
1703 /* fill in the rest of the aio_workq_entry */
1704 entryp->procp = procp;
1705 entryp->uaiocbp = aiocbp;
1706 entryp->flags |= AIO_LIO;
1707 entryp->group_tag = group_tag;
1708 entryp->aio_map = VM_MAP_NULL;
1709
1710 if ( !IS_64BIT_PROCESS(procp) ) {
1711 struct aiocb aiocb32;
1712
1713 result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
1714 if ( result == 0 )
1715 do_munge_aiocb( &aiocb32, &entryp->aiocb );
1716 } else
1717 result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1718
1719 if ( result != 0 ) {
1720 result = EAGAIN;
1721 goto error_exit;
1722 }
1723
1724 /* look for lio_listio LIO_NOP requests and ignore them. */
1725 /* Not really an error, but we need to free our aio_workq_entry. */
1726 if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
1727 result = 0;
1728 goto error_exit;
1729 }
1730
1731 result = aio_validate( entryp );
1732 if ( result != 0 ) {
1733 goto error_exit;
1734 }
1735
1736 *entrypp = entryp;
1737 return( 0 );
1738
1739 error_exit:
1740 if ( entryp != NULL )
1741 zfree( aio_workq_zonep, entryp );
1742
1743 return( result );
1744
1745 } /* lio_create_sync_entry */
1746
1747
1748 /*
1749 * aio_free_request - remove our reference on the user land map and
1750 * free the work queue entry resources.
1751 * We are not holding the lock here, thus aio_map is passed in, having been
1752 * zeroed while we did hold the lock.
1753 */
1754
1755 static int
1756 aio_free_request( aio_workq_entry *entryp, vm_map_t the_map )
1757 {
1758 /* remove our reference to the user land map. */
1759 if ( VM_MAP_NULL != the_map ) {
1760 vm_map_deallocate( the_map );
1761 }
1762
1763 zfree( aio_workq_zonep, entryp );
1764
1765 return( 0 );
1766
1767 } /* aio_free_request */
1768
1769
1770 /* aio_validate - validate the aiocb passed in by one of the aio syscalls.
1771 */
1772
1773 static int
1774 aio_validate( aio_workq_entry *entryp )
1775 {
1776 struct fileproc *fp;
1777 int flag;
1778 int result;
1779
1780 result = 0;
1781
1782 if ( (entryp->flags & AIO_LIO) != 0 ) {
1783 if ( entryp->aiocb.aio_lio_opcode == LIO_READ )
1784 entryp->flags |= AIO_READ;
1785 else if ( entryp->aiocb.aio_lio_opcode == LIO_WRITE )
1786 entryp->flags |= AIO_WRITE;
1787 else if ( entryp->aiocb.aio_lio_opcode == LIO_NOP )
1788 return( 0 );
1789 else
1790 return( EINVAL );
1791 }
1792
1793 flag = FREAD;
1794 if ( (entryp->flags & (AIO_WRITE | AIO_FSYNC)) != 0 ) {
1795 flag = FWRITE;
1796 }
1797
1798 if ( (entryp->flags & (AIO_READ | AIO_WRITE)) != 0 ) {
1799 // LP64todo - does max value for aio_nbytes need to grow?
1800 if ( entryp->aiocb.aio_nbytes > INT_MAX ||
1801 entryp->aiocb.aio_buf == USER_ADDR_NULL ||
1802 entryp->aiocb.aio_offset < 0 )
1803 return( EINVAL );
1804 }
1805
1806 /* validate aiocb.aio_sigevent. at this point we only support sigev_notify
1807 * equal to SIGEV_SIGNAL or SIGEV_NONE. this means sigev_value,
1808 * sigev_notify_function, and sigev_notify_attributes are ignored.
1809 */
1810 if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ) {
1811 int signum;
1812 /* make sure we have a valid signal number */
1813 signum = entryp->aiocb.aio_sigevent.sigev_signo;
1814 if ( signum <= 0 || signum >= NSIG ||
1815 signum == SIGKILL || signum == SIGSTOP )
1816 return (EINVAL);
1817 }
1818 else if ( entryp->aiocb.aio_sigevent.sigev_notify != SIGEV_NONE )
1819 return (EINVAL);
1820
1821 /* validate the file descriptor and that the file was opened
1822 * for the appropriate read / write access.
1823 */
1824 proc_fdlock(entryp->procp);
1825
1826 result = fp_lookup( entryp->procp, entryp->aiocb.aio_fildes, &fp , 1);
1827 if ( result == 0 ) {
1828 if ( (fp->f_fglob->fg_flag & flag) == 0 ) {
1829 /* we don't have read or write access */
1830 result = EBADF;
1831 }
1832 else if ( fp->f_fglob->fg_type != DTYPE_VNODE ) {
1833 /* this is not a file */
1834 result = ESPIPE;
1835 } else
1836 fp->f_flags |= FP_AIOISSUED;
1837
1838 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp , 1);
1839 }
1840 else {
1841 result = EBADF;
1842 }
1843
1844 proc_fdunlock(entryp->procp);
1845
1846 return( result );
1847
1848 } /* aio_validate */
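/*
 * A user-space sketch of the sigev_notify combinations that aio_validate()
 * accepts: SIGEV_SIGNAL with an ordinary, catchable signal, or SIGEV_NONE.
 * The signal choice and helper name are illustrative assumptions.
 *
 *     #include <aio.h>
 *     #include <signal.h>
 *     #include <string.h>
 *
 *     int queue_read_with_signal( int fd, char *buf, size_t len )
 *     {
 *         static struct aiocb cb;        // must stay valid until the IO completes
 *
 *         memset( &cb, 0, sizeof(cb) );
 *         cb.aio_fildes = fd;            // must be open with read access (FREAD)
 *         cb.aio_buf = buf;
 *         cb.aio_nbytes = len;
 *         cb.aio_offset = 0;
 *
 *         // SIGUSR1 passes the checks above; SIGKILL, SIGSTOP, or an
 *         // out-of-range signal number would be rejected with EINVAL.
 *         cb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
 *         cb.aio_sigevent.sigev_signo = SIGUSR1;
 *
 *         return aio_read( &cb );
 *     }
 */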
1849
1850
1851 /*
1852 * aio_get_process_count - runs through our queues that hold outstanding
1853 * async IO requests and totals up the number of requests for the given
1854 * process.
1855 * NOTE - caller must hold aio lock!
1856 */
1857
1858 static int
1859 aio_get_process_count( struct proc *procp )
1860 {
1861 aio_workq_entry *entryp;
1862 int count;
1863
1864 /* begin with count of completed async IO requests for this process */
1865 count = procp->aio_done_count;
1866
1867 /* add in count of active async IO requests for this process */
1868 count += procp->aio_active_count;
1869
1870 /* look for matches on our queue of asynchronous todo work */
1871 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
1872 if ( procp == entryp->procp ) {
1873 count++;
1874 }
1875 }
1876
1877 /* look for matches on our queue of synchronous todo work */
1878 TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
1879 if ( procp == entryp->procp ) {
1880 count++;
1881 }
1882 }
1883
1884 return( count );
1885
1886 } /* aio_get_process_count */
1887
1888
1889 /*
1890 * aio_get_all_queues_count - get total number of entries on all aio work queues.
1891 * NOTE - caller must hold aio lock!
1892 */
1893
1894 static int
1895 aio_get_all_queues_count( void )
1896 {
1897 int count;
1898
1899 count = aio_anchor.aio_async_workq_count;
1900 count += aio_anchor.lio_sync_workq_count;
1901 count += aio_anchor.aio_active_count;
1902 count += aio_anchor.aio_done_count;
1903
1904 return( count );
1905
1906 } /* aio_get_all_queues_count */
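/*
 * These totals are what the queueing code compares against the system-wide and
 * per-process AIO limits (aio_max_requests shows up again in aio_init() below).
 * The sketch assumes those limits are exported through sysctl as kern.aiomax
 * and kern.aioprocmax; the names and helper are assumptions, not taken from
 * this file.
 *
 *     #include <stdio.h>
 *     #include <sys/sysctl.h>
 *
 *     void show_aio_limits( void )
 *     {
 *         int aiomax = 0, aioprocmax = 0;
 *         size_t len = sizeof(int);
 *
 *         if ( sysctlbyname( "kern.aiomax", &aiomax, &len, NULL, 0 ) == 0 )
 *             printf( "system-wide aio limit: %d\n", aiomax );
 *
 *         len = sizeof(int);
 *         if ( sysctlbyname( "kern.aioprocmax", &aioprocmax, &len, NULL, 0 ) == 0 )
 *             printf( "per-process aio limit: %d\n", aioprocmax );
 *     }
 */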
1907
1908
1909 /*
1910 * do_aio_completion - handle async IO completion.
1911 */
1912
1913 static void
1914 do_aio_completion( aio_workq_entry *entryp )
1915 {
1916 /* signal user land process if appropriate */
1917 if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
1918 (entryp->flags & AIO_DISABLE) == 0 ) {
1919
1920 /*
1921 * if group_tag is non zero then make sure this is the last IO request
1922 * in the group before we signal.
1923 */
1924 if ( entryp->group_tag == 0 ||
1925 (entryp->group_tag != 0 && aio_last_group_io( entryp )) ) {
1926 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) | DBG_FUNC_NONE,
1927 (int)entryp->procp, (int)entryp->uaiocbp,
1928 entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 );
1929
1930 psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );
1931 return;
1932 }
1933 }
1934
1935 /*
1936 * need to handle case where a process is trying to exit, exec, or close
1937 * and is currently waiting for active aio requests to complete. If
1938 * AIO_WAITING is set then we need to look to see if there are any
1939 * other requests in the active queue for this process. If there are
1940 * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel. If
1941 * there are some still active then do nothing - we only want to wakeup
1942 * when all active aio requests for the process are complete.
1943 */
1944 if ( (entryp->flags & AIO_WAITING) != 0 ) {
1945 int active_requests;
1946
1947 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
1948 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1949
1950 AIO_LOCK;
1951 active_requests = aio_active_requests_for_process( entryp->procp );
1952 //AIO_UNLOCK;
1953 if ( active_requests < 1 ) {
1954 /* no active aio requests for this process, continue exiting */
1955 wakeup_one( (caddr_t) &entryp->procp->AIO_CLEANUP_SLEEP_CHAN );
1956
1957 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
1958 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1959 }
1960 AIO_UNLOCK;
1961 return;
1962 }
1963
1964 /*
1965 * aio_suspend case when a signal was not requested. In that scenario we
1966 * are sleeping on the AIO_SUSPEND_SLEEP_CHAN channel.
1967 * NOTE - the assumption here is that this wakeup call is inexpensive.
1968 * we really only need to do this when an aio_suspend call is pending.
1969 * If we find the wakeup call should be avoided we could mark the
1970 * async IO requests given in the list provided by aio_suspend and only
1971 * call wakeup for them. If we do mark them we should unmark them after
1972 * the aio_suspend wakes up.
1973 */
1974 AIO_LOCK;
1975 wakeup_one( (caddr_t) &entryp->procp->AIO_SUSPEND_SLEEP_CHAN );
1976 AIO_UNLOCK;
1977
1978 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE,
1979 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1980
1981 return;
1982
1983 } /* do_aio_completion */
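/*
 * A user-space sketch of the group-completion signaling implemented above: with
 * lio_listio( LIO_NOWAIT, ... ) and a SIGEV_SIGNAL sigevent, the signal fires
 * once, after the last request in the group finishes (the group_tag /
 * aio_last_group_io() check).  The signal number and helper name are
 * illustrative assumptions; error handling is trimmed.
 *
 *     #include <aio.h>
 *     #include <signal.h>
 *     #include <string.h>
 *
 *     // Each list[i] already has aio_fildes, aio_buf, aio_nbytes, aio_offset
 *     // filled in and aio_lio_opcode set to LIO_READ or LIO_WRITE.
 *     int run_batch_async( struct aiocb *list[], int nent )
 *     {
 *         struct sigevent sev;
 *         sigset_t set;
 *         int signo;
 *
 *         sigemptyset( &set );
 *         sigaddset( &set, SIGUSR1 );
 *         sigprocmask( SIG_BLOCK, &set, NULL );   // collect it with sigwait()
 *
 *         memset( &sev, 0, sizeof(sev) );
 *         sev.sigev_notify = SIGEV_SIGNAL;
 *         sev.sigev_signo = SIGUSR1;
 *
 *         if ( lio_listio( LIO_NOWAIT, list, nent, &sev ) != 0 )
 *             return -1;
 *
 *         sigwait( &set, &signo );    // delivered once, by psignal() above
 *         return 0;                   // caller checks aio_error()/aio_return()
 *     }
 */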
1984
1985
1986 /*
1987 * aio_last_group_io - checks to see if this is the last unfinished IO request
1988 * for the given group_tag. Returns TRUE if there are no other active IO
1989 * requests for this group, or FALSE if there are still active IO requests.
1990 * NOTE - AIO_LOCK must be held by caller
1991 */
1992
1993 static boolean_t
1994 aio_last_group_io( aio_workq_entry *entryp )
1995 {
1996 aio_workq_entry *my_entryp;
1997
1998 /* look for matches on our queue of active async IO requests */
1999 TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
2000 if ( my_entryp->group_tag == entryp->group_tag )
2001 return( FALSE );
2002 }
2003
2004 /* look for matches on our queue of asynchronous todo work */
2005 TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
2006 if ( my_entryp->group_tag == entryp->group_tag )
2007 return( FALSE );
2008 }
2009
2010 /* look for matches on our queue of synchronous todo work */
2011 TAILQ_FOREACH( my_entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
2012 if ( my_entryp->group_tag == entryp->group_tag )
2013 return( FALSE );
2014 }
2015
2016 return( TRUE );
2017
2018 } /* aio_last_group_io */
2019
2020
2021 /*
2022 * do_aio_read
2023 */
2024 static int
2025 do_aio_read( aio_workq_entry *entryp )
2026 {
2027 struct fileproc *fp;
2028 int error;
2029
2030 if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp , 0)) )
2031 return(error);
2032 if ( (fp->f_fglob->fg_flag & FREAD) == 0 ) {
2033 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2034 return(EBADF);
2035 }
2036 if ( fp != NULL ) {
2037 error = dofileread( entryp->procp, fp, entryp->aiocb.aio_fildes,
2038 entryp->aiocb.aio_buf,
2039 entryp->aiocb.aio_nbytes,
2040 entryp->aiocb.aio_offset, FOF_OFFSET,
2041 &entryp->returnval );
2042 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2043 }
2044 else {
2045 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2046 error = EBADF;
2047 }
2048
2049 return( error );
2050
2051 } /* do_aio_read */
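/*
 * do_aio_read() services the call sketched below; FOF_OFFSET means the read
 * happens at aio_offset rather than at the descriptor's current file offset.
 * The polling loop and helper name are illustrative assumptions.
 *
 *     #include <aio.h>
 *     #include <errno.h>
 *     #include <signal.h>
 *     #include <string.h>
 *     #include <unistd.h>
 *
 *     ssize_t read_at_offset_async( int fd, void *buf, size_t len, off_t where )
 *     {
 *         struct aiocb cb;
 *         int err;
 *
 *         memset( &cb, 0, sizeof(cb) );
 *         cb.aio_fildes = fd;
 *         cb.aio_buf = buf;
 *         cb.aio_nbytes = len;
 *         cb.aio_offset = where;      // read here, not at the fd's offset
 *         cb.aio_sigevent.sigev_notify = SIGEV_NONE;
 *
 *         if ( aio_read( &cb ) != 0 )
 *             return -1;
 *
 *         // aio_error() keeps returning EINPROGRESS until a worker thread
 *         // has finished the request.
 *         while ( (err = aio_error( &cb )) == EINPROGRESS )
 *             usleep( 1000 );
 *
 *         if ( err != 0 ) {
 *             errno = err;
 *             return -1;
 *         }
 *         return aio_return( &cb );   // byte count, from returnval above
 *     }
 */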
2052
2053
2054 /*
2055 * do_aio_write
2056 */
2057 static int
2058 do_aio_write( aio_workq_entry *entryp )
2059 {
2060 struct fileproc *fp;
2061 int error;
2062
2063 if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp , 0)) )
2064 return(error);
2065 if ( (fp->f_fglob->fg_flag & FWRITE) == 0 ) {
2066 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2067 return(EBADF);
2068 }
2069 if ( fp != NULL ) {
2070 /* NB: tell dofilewrite the offset, and to use the proc cred */
2071 error = dofilewrite( entryp->procp,
2072 fp,
2073 entryp->aiocb.aio_fildes,
2074 entryp->aiocb.aio_buf,
2075 entryp->aiocb.aio_nbytes,
2076 entryp->aiocb.aio_offset,
2077 FOF_OFFSET | FOF_PCRED,
2078 &entryp->returnval);
2079
2080 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2081 }
2082 else {
2083 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2084 error = EBADF;
2085 }
2086
2087 return( error );
2088
2089 } /* do_aio_write */
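/*
 * A matching sketch for the write path, this time blocking in aio_suspend()
 * (the sleep that the AIO_SUSPEND_SLEEP_CHAN wakeup in do_aio_completion()
 * targets).  The helper name and trimmed error handling are assumptions.
 *
 *     #include <aio.h>
 *     #include <errno.h>
 *     #include <signal.h>
 *     #include <string.h>
 *     #include <sys/types.h>
 *
 *     ssize_t write_at_offset_async( int fd, const void *buf, size_t len, off_t where )
 *     {
 *         struct aiocb cb;
 *         const struct aiocb *list[1];
 *
 *         memset( &cb, 0, sizeof(cb) );
 *         cb.aio_fildes = fd;         // must be open for writing (FWRITE)
 *         cb.aio_buf = (void *)buf;   // aio_buf is not const-qualified
 *         cb.aio_nbytes = len;
 *         cb.aio_offset = where;
 *         cb.aio_sigevent.sigev_notify = SIGEV_NONE;
 *
 *         if ( aio_write( &cb ) != 0 )
 *             return -1;
 *
 *         list[0] = &cb;
 *         while ( aio_error( &cb ) == EINPROGRESS )
 *             (void)aio_suspend( list, 1, NULL );   // wait for completion
 *
 *         if ( aio_error( &cb ) != 0 )
 *             return -1;
 *         return aio_return( &cb );
 *     }
 */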
2090
2091
2092 /*
2093 * aio_active_requests_for_process - return number of active async IO
2094 * requests for the given process.
2095 * NOTE - caller must hold aio lock!
2096 */
2097
2098 static int
2099 aio_active_requests_for_process( struct proc *procp )
2100 {
2101
2102 return( procp->aio_active_count );
2103
2104 } /* aio_active_requests_for_process */
2105
2106
2107 /*
2108 * do_aio_fsync
2109 */
2110 static int
2111 do_aio_fsync( aio_workq_entry *entryp )
2112 {
2113 struct vfs_context context;
2114 struct vnode *vp;
2115 struct fileproc *fp;
2116 int error;
2117
2118 /*
2119 * NOTE - we will not support AIO_DSYNC until fdatasync() is supported.
2120 * AIO_DSYNC is caught before we queue up a request and flagged as an error.
2121 * The following was shamelessly extracted from fsync() implementation.
2122 */
2123
2124 error = fp_getfvp( entryp->procp, entryp->aiocb.aio_fildes, &fp, &vp);
2125 if ( error == 0 ) {
2126 if ( (error = vnode_getwithref(vp)) ) {
2127 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2128 entryp->returnval = -1;
2129 return(error);
2130 }
2131 context.vc_proc = entryp->procp;
2132 context.vc_ucred = fp->f_fglob->fg_cred;
2133
2134 error = VNOP_FSYNC( vp, MNT_WAIT, &context);
2135
2136 (void)vnode_put(vp);
2137
2138 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2139 }
2140 if ( error != 0 )
2141 entryp->returnval = -1;
2142
2143 return( error );
2144
2145 } /* do_aio_fsync */
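/*
 * A user-space sketch of the call serviced by do_aio_fsync().  Per the note
 * above, only the O_SYNC form is supported; an O_DSYNC request is flagged as
 * an error before it is ever queued.  The helper name and polling interval are
 * illustrative assumptions.
 *
 *     #include <aio.h>
 *     #include <errno.h>
 *     #include <fcntl.h>
 *     #include <signal.h>
 *     #include <string.h>
 *     #include <unistd.h>
 *
 *     int sync_file_async( int fd )
 *     {
 *         struct aiocb cb;
 *
 *         memset( &cb, 0, sizeof(cb) );
 *         cb.aio_fildes = fd;
 *         cb.aio_sigevent.sigev_notify = SIGEV_NONE;
 *
 *         // O_SYNC ends up in the VNOP_FSYNC( vp, MNT_WAIT, ... ) call above.
 *         if ( aio_fsync( O_SYNC, &cb ) != 0 )
 *             return -1;
 *
 *         while ( aio_error( &cb ) == EINPROGRESS )
 *             usleep( 1000 );
 *
 *         if ( aio_error( &cb ) != 0 )
 *             return -1;
 *         return (int)aio_return( &cb );   // 0 on success for aio_fsync
 *     }
 */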
2146
2147
2148 /*
2149 * is_already_queued - runs through our queues to see if the given
2150 * aiocbp / process is there. Returns TRUE if there is a match
2151 * on any of our aio queues.
2152 * NOTE - callers must hold aio lock!
2153 */
2154
2155 static boolean_t
2156 is_already_queued( struct proc *procp,
2157 user_addr_t aiocbp )
2158 {
2159 aio_workq_entry *entryp;
2160 boolean_t result;
2161
2162 result = FALSE;
2163
2164 /* look for matches on our queue of async IO requests that have completed */
2165 TAILQ_FOREACH( entryp, &procp->aio_doneq, aio_workq_link ) {
2166 if ( aiocbp == entryp->uaiocbp ) {
2167 result = TRUE;
2168 goto ExitThisRoutine;
2169 }
2170 }
2171
2172 /* look for matches on our queue of active async IO requests */
2173 TAILQ_FOREACH( entryp, &procp->aio_activeq, aio_workq_link ) {
2174 if ( aiocbp == entryp->uaiocbp ) {
2175 result = TRUE;
2176 goto ExitThisRoutine;
2177 }
2178 }
2179
2180 /* look for matches on our queue of asynchronous todo work */
2181 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
2182 if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
2183 result = TRUE;
2184 goto ExitThisRoutine;
2185 }
2186 }
2187
2188 /* look for matches on our queue of synchronous todo work */
2189 TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
2190 if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
2191 result = TRUE;
2192 goto ExitThisRoutine;
2193 }
2194 }
2195
2196 ExitThisRoutine:
2197 return( result );
2198
2199 } /* is_already_queued */
2200
2201
2202 /*
2203 * aio initialization
2204 */
2205 __private_extern__ void
2206 aio_init( void )
2207 {
2208 int i;
2209
2210 aio_lock_grp_attr = lck_grp_attr_alloc_init();
2211 lck_grp_attr_setstat(aio_lock_grp_attr);
2212 aio_lock_grp = lck_grp_alloc_init("aio", aio_lock_grp_attr);
2213 aio_lock_attr = lck_attr_alloc_init();
2214 //lck_attr_setdebug(aio_lock_attr);
2215
2216 aio_lock = lck_mtx_alloc_init(aio_lock_grp, aio_lock_attr);
2217
2218 AIO_LOCK;
2219 TAILQ_INIT( &aio_anchor.aio_async_workq );
2220 TAILQ_INIT( &aio_anchor.lio_sync_workq );
2221 aio_anchor.aio_async_workq_count = 0;
2222 aio_anchor.lio_sync_workq_count = 0;
2223 aio_anchor.aio_active_count = 0;
2224 aio_anchor.aio_done_count = 0;
2225 AIO_UNLOCK;
2226
2227 i = sizeof( aio_workq_entry );
2228 aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );
2229
2230 _aio_create_worker_threads( aio_worker_threads );
2231
2232 return;
2233
2234 } /* aio_init */
2235
2236
2237 /*
2238 * aio worker threads created here.
2239 */
2240 __private_extern__ void
2241 _aio_create_worker_threads( int num )
2242 {
2243 int i;
2244
2245 /* create some worker threads to handle the async IO requests */
2246 for ( i = 0; i < num; i++ ) {
2247 thread_t myThread;
2248
2249 myThread = kernel_thread( kernel_task, aio_work_thread );
2250 if ( THREAD_NULL == myThread ) {
2251 printf( "%s - failed to create a work thread \n", __FUNCTION__ );
2252 }
2253 }
2254
2255 return;
2256
2257 } /* _aio_create_worker_threads */
2258
2259 /*
2260 * Return the current activation utask
2261 */
2262 task_t
2263 get_aiotask(void)
2264 {
2265 return ((struct uthread *)get_bsdthread_info(current_thread()))->uu_aio_task;
2266 }
2267
2268
2269 /*
2270 * do_munge_aiocb - in the case of an aiocb from a
2271 * 32-bit process, we need to expand some longs and pointers to the correct
2272 * sizes so that downstream code can always work on the same type of
2273 * aiocb (in our case that is a user_aiocb).
2274 */
2275 static void
2276 do_munge_aiocb( struct aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
2277 {
2278 the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2279 the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2280 the_user_aiocbp->aio_buf = CAST_USER_ADDR_T(my_aiocbp->aio_buf);
2281 the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2282 the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2283 the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
2284
2285 /* special case here. since we do not know if sigev_value is an */
2286 /* int or a ptr we do NOT cast the ptr to a user_addr_t. This */
2287 /* means if we send this info back to user space we need to remember */
2288 /* sigev_value was not expanded for the 32-bit case. */
2289 /* NOTE - this does NOT affect us since we don't support sigev_value */
2290 /* yet in the aio context. */
2291 //LP64
2292 the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2293 the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
2294 the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2295 my_aiocbp->aio_sigevent.sigev_value.sival_int;
2296 the_user_aiocbp->aio_sigevent.sigev_notify_function =
2297 CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function);
2298 the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2299 CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_attributes);
2300 }
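/*
 * The same widening pattern in miniature, with a hypothetical pair of
 * structures that are not taken from this file: each narrow field is copied
 * into its fixed-width counterpart, and pointer-sized values are expanded
 * explicitly instead of being copied with a single memcpy.
 *
 *     #include <stdint.h>
 *
 *     struct small_req {               // hypothetical 32-bit layout
 *         int32_t   fd;
 *         uint32_t  buf;               // 32-bit user pointer
 *         uint32_t  nbytes;
 *     };
 *
 *     struct wide_req {                // widened kernel-side counterpart
 *         int32_t   fd;
 *         uint64_t  buf;               // 64-bit user address
 *         uint64_t  nbytes;
 *     };
 *
 *     static void munge_req( const struct small_req *in, struct wide_req *out )
 *     {
 *         // Sizes and offsets differ, so a whole-struct copy would be wrong.
 *         out->fd = in->fd;
 *         out->buf = (uint64_t)in->buf;         // zero-extend the pointer value
 *         out->nbytes = (uint64_t)in->nbytes;
 *     }
 */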