bsd/kern/kern_aio.c

   1 /*
   2  * Copyright (c) 2003-2004 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License.  The rights granted to you under the
  10  * License may not be used to create, or enable the creation or
  11  * redistribution of, unlawful or unlicensed copies of an Apple operating
  12  * system, or to circumvent, violate, or enable the circumvention or
  13  * violation of, any terms of an Apple operating system software license
  14  * agreement.
  15  *
  16  * Please obtain a copy of the License at
  17  * http://www.opensource.apple.com/apsl/ and read it before using this
  18  * file.
  19  *
  20  * The Original Code and all software distributed under the License are
  21  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  22  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  23  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  24  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  25  * Please see the License for the specific language governing rights and
  26  * limitations under the License.
  27  *
  28  * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
  29  */
  30
  31
  32 /*
  33  * todo:
  34  *              1) ramesh is looking into how to replace taking a reference on
  35  *                      the user's map (vm_map_reference()) since it is believed that
  36  *                      would not hold the process for us.
  37  *              2) david is looking into a way for us to set the priority of the
  38  *                      worker threads to match that of the user's thread when the
  39  *                      async IO was queued.
  40  */
  41
  42
  43 /*
  44  * This file contains support for the POSIX 1003.1B AIO/LIO facility.
  45  */
  46
  47 #include <sys/systm.h>
  48 #include <sys/fcntl.h>
  49 #include <sys/file_internal.h>
  50 #include <sys/filedesc.h>
  51 #include <sys/kernel.h>
  52 #include <sys/vnode_internal.h>
  53 #include <sys/malloc.h>
  54 #include <sys/mount_internal.h>
  55 #include <sys/param.h>
  56 #include <sys/proc_internal.h>
  57 #include <sys/sysctl.h>
  58 #include <sys/unistd.h>
  59 #include <sys/user.h>
  60
  61 #include <sys/aio_kern.h>
  62 #include <sys/sysproto.h>
  63
  64 #include <machine/limits.h>
  65
  66 #include <mach/mach_types.h>
  67 #include <kern/kern_types.h>
  68 #include <kern/zalloc.h>
  69 #include <kern/task.h>
  70 #include <kern/sched_prim.h>
  71
  72 #include <vm/vm_map.h>
  73
  74 #include <sys/kdebug.h>
  75 #define AIO_work_queued                                 1
  76 #define AIO_worker_wake                                 2
  77 #define AIO_completion_sig                              3
  78 #define AIO_completion_cleanup_wait             4
  79 #define AIO_completion_cleanup_wake             5
  80 #define AIO_completion_suspend_wake     6
  81 #define AIO_fsync_delay                                 7
  82 #define AIO_cancel                                              10
  83 #define AIO_cancel_async_workq                  11
  84 #define AIO_cancel_sync_workq                   12
  85 #define AIO_cancel_activeq                              13
  86 #define AIO_cancel_doneq                                14
  87 #define AIO_fsync                                               20
  88 #define AIO_read                                                30
  89 #define AIO_write                                               40
  90 #define AIO_listio                                              50
  91 #define AIO_error                                               60
  92 #define AIO_error_val                                   61
  93 #define AIO_error_activeq                               62
  94 #define AIO_error_workq                                 63
  95 #define AIO_return                                              70
  96 #define AIO_return_val                                  71
  97 #define AIO_return_activeq                              72
  98 #define AIO_return_workq                                73
  99 #define AIO_exec                                                80
 100 #define AIO_exit                                                90
 101 #define AIO_exit_sleep                                  91
 102 #define AIO_close                                               100
 103 #define AIO_close_sleep                                 101
 104 #define AIO_suspend                                             110
 105 #define AIO_suspend_sleep                               111
 106 #define AIO_worker_thread                               120
 107
 108 #if 0
 109 #undef KERNEL_DEBUG
 110 #define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
 111 #endif
 112
 113 /*
 114  * aio requests queue up on the aio_async_workq or lio_sync_workq (for
 115  * lio_listio LIO_WAIT).  Requests then move to the per process aio_activeq
 116  * (proc.aio_activeq) when one of our worker threads start the IO.
 117  * And finally, requests move to the per process aio_doneq (proc.aio_doneq)
 118  * when the IO request completes.  The request remains on aio_doneq until
 119  * user process calls aio_return or the process exits, either way that is our
 120  * trigger to release aio resources.
 121  */
 122 struct aio_anchor_cb
 123 {
 124         int                                                                     aio_async_workq_count;  /* entries on aio_async_workq */
 125         int                                                                     lio_sync_workq_count;   /* entries on lio_sync_workq */
 126         int                                                                     aio_active_count;       /* entries on all active queues (proc.aio_activeq) */
 127         int                                                                     aio_done_count;         /* entries on all done queues (proc.aio_doneq) */
 128         TAILQ_HEAD( , aio_workq_entry )         aio_async_workq;
 129         TAILQ_HEAD( , aio_workq_entry )         lio_sync_workq;
 130 };
 131 typedef struct aio_anchor_cb aio_anchor_cb;
 132
 133
 134 /*
 135  * Notes on aio sleep / wake channels.
 136  * We currently pick a couple fields within the proc structure that will allow
 137  * us sleep channels that currently do not collide with any other kernel routines.
 138  * At this time, for binary compatibility reasons, we cannot create new proc fields.
 139  */
 140 #define AIO_SUSPEND_SLEEP_CHAN  p_estcpu
 141 #define AIO_CLEANUP_SLEEP_CHAN  p_pctcpu
 142
 143
 144 /*
 145  * aysnc IO locking macros used to protect critical sections.
 146  */
 147 #define AIO_LOCK        lck_mtx_lock(aio_lock)
 148 #define AIO_UNLOCK      lck_mtx_unlock(aio_lock)
 149
 150
 151 /*
 152  *  LOCAL PROTOTYPES
 153  */
 154 static int                      aio_active_requests_for_process( struct proc *procp );
 155 static boolean_t        aio_delay_fsync_request( aio_workq_entry *entryp );
 156 static int                      aio_free_request( aio_workq_entry *entryp, vm_map_t the_map );
 157 static int                      aio_get_all_queues_count( void );
 158 static int                      aio_get_process_count( struct proc *procp );
 159 static aio_workq_entry *  aio_get_some_work( void );
 160 static boolean_t        aio_last_group_io( aio_workq_entry *entryp );
 161 static void                     aio_mark_requests( aio_workq_entry *entryp );
 162 static int                      aio_queue_async_request( struct proc *procp,
 163                                                                                          user_addr_t aiocbp,
 164                                                                                          int kindOfIO );
 165 static int                      aio_validate( aio_workq_entry *entryp );
 166 static void                     aio_work_thread( void );
 167 static int                      do_aio_cancel(  struct proc *p,
 168                                                                         int fd,
 169                                                                         user_addr_t aiocbp,
 170                                                                         boolean_t wait_for_completion,
 171                                                                         boolean_t disable_notification );
 172 static void                     do_aio_completion( aio_workq_entry *entryp );
 173 static int                      do_aio_fsync( aio_workq_entry *entryp );
 174 static int                      do_aio_read( aio_workq_entry *entryp );
 175 static int                      do_aio_write( aio_workq_entry *entryp );
 176 static void             do_munge_aiocb( struct aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
 177 static boolean_t        is_already_queued(      struct proc *procp,
 178                                                                                 user_addr_t aiocbp );
 179 static int                      lio_create_async_entry( struct proc *procp,
 180                                                                                          user_addr_t aiocbp,
 181                                                                                          user_addr_t sigp,
 182                                                                                          long group_tag,
 183                                                                                          aio_workq_entry **entrypp );
 184 static int                      lio_create_sync_entry( struct proc *procp,
 185                                                                                         user_addr_t aiocbp,
 186                                                                                         long group_tag,
 187                                                                                         aio_workq_entry **entrypp );
 188
 189
 190 /*
 191  *  EXTERNAL PROTOTYPES
 192  */
 193
 194 /* in ...bsd/kern/sys_generic.c */
 195 extern int                      dofileread( struct proc *p, struct fileproc *fp, int fd,
 196                                                                 user_addr_t bufp, user_size_t nbyte,
 197                                                                 off_t offset, int flags, user_ssize_t *retval );
 198 extern int                      dofilewrite( struct proc *p, struct fileproc *fp, int fd,
 199                                                                  user_addr_t bufp, user_size_t nbyte, off_t offset,
 200                                                                  int flags, user_ssize_t *retval );
 201
 202 /*
 203  * aio external global variables.
 204  */
 205 extern int aio_max_requests;                            /* AIO_MAX - configurable */
 206 extern int aio_max_requests_per_process;        /* AIO_PROCESS_MAX - configurable */
 207 extern int aio_worker_threads;                          /* AIO_THREAD_COUNT - configurable */
 208
 209
 210 /*
 211  * aio static variables.
 212  */
 213 static aio_anchor_cb            aio_anchor;
 214 static lck_mtx_t *              aio_lock;
 215 static lck_grp_t *              aio_lock_grp;
 216 static lck_attr_t *             aio_lock_attr;
 217 static lck_grp_attr_t *         aio_lock_grp_attr;
 218 static struct zone              *aio_workq_zonep;
 219
 220
 221
 222
 223 /*
 224  * aio_cancel - attempt to cancel one or more async IO requests currently
 225  * outstanding against file descriptor uap->fd.  If uap->aiocbp is not
 226  * NULL then only one specific IO is cancelled (if possible).  If uap->aiocbp
 227  * is NULL then all outstanding async IO request for the given file
 228  * descriptor are cancelled (if possible).
 229  */
 230
 231 int
 232 aio_cancel( struct proc *p, struct aio_cancel_args *uap, int *retval )
 233 {
 234         struct user_aiocb               my_aiocb;
 235         int                                                     result;
 236
 237         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START,
 238                           (int)p, (int)uap->aiocbp, 0, 0, 0 );
 239
 240         /* quick check to see if there are any async IO requests queued up */
 241         AIO_LOCK;
 242         result = aio_get_all_queues_count( );
 243         AIO_UNLOCK;
 244         if ( result < 1 ) {
 245                 result = EBADF;
 246                 goto ExitRoutine;
 247         }
 248
 249         *retval = -1;
 250         if ( uap->aiocbp != USER_ADDR_NULL ) {
 251                 if ( !IS_64BIT_PROCESS(p) ) {
 252                         struct aiocb aiocb32;
 253
 254                         result = copyin( uap->aiocbp, &aiocb32, sizeof(aiocb32) );
 255                         if ( result == 0 )
 256                                 do_munge_aiocb( &aiocb32, &my_aiocb );
 257                 } else
 258                         result = copyin( uap->aiocbp, &my_aiocb, sizeof(my_aiocb) );
 259
 260                 if ( result != 0 ) {
 261                         result = EAGAIN;
 262                         goto ExitRoutine;
 263                 }
 264
 265                 /* NOTE - POSIX standard says a mismatch between the file */
 266                 /* descriptor passed in and the file descriptor embedded in */
 267                 /* the aiocb causes unspecified results.  We return EBADF in */
 268                 /* that situation.  */
 269                 if ( uap->fd != my_aiocb.aio_fildes ) {
 270                         result = EBADF;
 271                         goto ExitRoutine;
 272                 }
 273         }
 274         result = do_aio_cancel( p, uap->fd, uap->aiocbp, FALSE, FALSE );
 275
 276         if ( result != -1 ) {
 277                 *retval = result;
 278                 result = 0;
 279                 goto ExitRoutine;
 280         }
 281
 282         result = EBADF;
 283
 284 ExitRoutine:
 285         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END,
 286                           (int)p, (int)uap->aiocbp, result, 0, 0 );
 287
 288         return( result );
 289
 290 } /* aio_cancel */
 291
 292
 293 /*
 294  * _aio_close - internal function used to clean up async IO requests for
 295  * a file descriptor that is closing.
 296  * THIS MAY BLOCK.
 297  */
 298
 299 __private_extern__ void
 300 _aio_close( struct proc *p, int fd )
 301 {
 302         int                     error, count;
 303
 304         /* quick check to see if there are any async IO requests queued up */
 305         AIO_LOCK;
 306         count = aio_get_all_queues_count( );
 307         AIO_UNLOCK;
 308         if ( count < 1 )
 309                 return;
 310
 311         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_START,
 312                           (int)p, fd, 0, 0, 0 );
 313
 314         /* cancel all async IO requests on our todo queues for this file descriptor */
 315         error = do_aio_cancel( p, fd, 0, TRUE, FALSE );
 316         if ( error == AIO_NOTCANCELED ) {
 317                 /*
 318                  * AIO_NOTCANCELED is returned when we find an aio request for this process
 319                  * and file descriptor on the active async IO queue.  Active requests cannot
 320                  * be cancelled so we must wait for them to complete.  We will get a special
 321                  * wake up call on our channel used to sleep for ALL active requests to
 322                  * complete.  This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
 323                  * when we must wait for all active aio requests.
 324                  */
 325
 326                 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) | DBG_FUNC_NONE,
 327                                   (int)p, fd, 0, 0, 0 );
 328
 329                 tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_close", 0 );
 330         }
 331
 332         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_END,
 333                           (int)p, fd, 0, 0, 0 );
 334
 335         return;
 336
 337 } /* _aio_close */
 338
 339
 340 /*
 341  * aio_error - return the error status associated with the async IO
 342  * request referred to by uap->aiocbp.  The error status is the errno
 343  * value that would be set by the corresponding IO request (read, wrtie,
 344  * fdatasync, or sync).
 345  */
 346
 347 int
 348 aio_error( struct proc *p, struct aio_error_args *uap, int *retval )
 349 {
 350         aio_workq_entry                         *entryp;
 351         int                                                     error;
 352
 353         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_START,
 354                           (int)p, (int)uap->aiocbp, 0, 0, 0 );
 355
 356         AIO_LOCK;
 357
 358         /* quick check to see if there are any async IO requests queued up */
 359         if ( aio_get_all_queues_count( ) < 1 ) {
 360                 error = EINVAL;
 361                 goto ExitRoutine;
 362         }
 363
 364         /* look for a match on our queue of async IO requests that have completed */
 365         TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
 366                 if ( entryp->uaiocbp == uap->aiocbp ) {
 367                         *retval = entryp->errorval;
 368                         error = 0;
 369                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) | DBG_FUNC_NONE,
 370                                            (int)p, (int)uap->aiocbp, *retval, 0, 0 );
 371                         goto ExitRoutine;
 372                 }
 373         }
 374
 375         /* look for a match on our queue of active async IO requests */
 376         TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
 377                 if ( entryp->uaiocbp == uap->aiocbp ) {
 378                         *retval = EINPROGRESS;
 379                         error = 0;
 380                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) | DBG_FUNC_NONE,
 381                                            (int)p, (int)uap->aiocbp, *retval, 0, 0 );
 382                         goto ExitRoutine;
 383                 }
 384         }
 385
 386         /* look for a match on our queue of todo work */
 387         TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
 388                 if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
 389                         *retval = EINPROGRESS;
 390                         error = 0;
 391                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_workq)) | DBG_FUNC_NONE,
 392                                            (int)p, (int)uap->aiocbp, *retval, 0, 0 );
 393                         goto ExitRoutine;
 394                 }
 395         }
 396         error = EINVAL;
 397
 398 ExitRoutine:
 399         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_END,
 400                           (int)p, (int)uap->aiocbp, error, 0, 0 );
 401         AIO_UNLOCK;
 402
 403         return( error );
 404
 405 } /* aio_error */
 406
 407
 408 /*
 409  * aio_fsync - asynchronously force all IO operations associated
 410  * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
 411  * queued at the time of the call to the synchronized completion state.
 412  * NOTE - we do not support op O_DSYNC at this point since we do not support the
 413  * fdatasync() call.
 414  */
 415
 416 int
 417 aio_fsync( struct proc *p, struct aio_fsync_args *uap, int *retval )
 418 {
 419         int                     error;
 420         int                     fsync_kind;
 421
 422         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_START,
 423                           (int)p, (int)uap->aiocbp, uap->op, 0, 0 );
 424
 425         *retval = 0;
 426         /* 0 := O_SYNC for binary backward compatibility with Panther */
 427         if (uap->op == O_SYNC || uap->op == 0)
 428                 fsync_kind = AIO_FSYNC;
 429 #if 0 // we don't support fdatasync() call yet
 430         else if ( uap->op == O_DSYNC )
 431                 fsync_kind = AIO_DSYNC;
 432 #endif
 433         else {
 434                 *retval = -1;
 435                 error = EINVAL;
 436                 goto ExitRoutine;
 437         }
 438
 439         error = aio_queue_async_request( p, uap->aiocbp, fsync_kind );
 440         if ( error != 0 )
 441                 *retval = -1;
 442
 443 ExitRoutine:
 444         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_END,
 445                           (int)p, (int)uap->aiocbp, error, 0, 0 );
 446
 447         return( error );
 448
 449 } /* aio_fsync */
 450
 451
 452 /* aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
 453  * file descriptor (uap->aiocbp->aio_fildes) into the buffer
 454  * (uap->aiocbp->aio_buf).
 455  */
 456
 457 int
 458 aio_read( struct proc *p, struct aio_read_args *uap, int *retval )
 459 {
 460         int                     error;
 461
 462         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_START,
 463                           (int)p, (int)uap->aiocbp, 0, 0, 0 );
 464
 465         *retval = 0;
 466
 467         error = aio_queue_async_request( p, uap->aiocbp, AIO_READ );
 468         if ( error != 0 )
 469                 *retval = -1;
 470
 471         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_END,
 472                           (int)p, (int)uap->aiocbp, error, 0, 0 );
 473
 474         return( error );
 475
 476 } /* aio_read */
 477
 478
 479 /*
 480  * aio_return - return the return status associated with the async IO
 481  * request referred to by uap->aiocbp.  The return status is the value
 482  * that would be returned by corresponding IO request (read, wrtie,
 483  * fdatasync, or sync).  This is where we release kernel resources
 484  * held for async IO call associated with the given aiocb pointer.
 485  */
 486
 487 int
 488 aio_return( struct proc *p, struct aio_return_args *uap, user_ssize_t *retval )
 489 {
 490         aio_workq_entry                         *entryp;
 491         int                                                     error;
 492         boolean_t                                       lock_held;
 493
 494         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_START,
 495                           (int)p, (int)uap->aiocbp, 0, 0, 0 );
 496
 497         AIO_LOCK;
 498         lock_held = TRUE;
 499         *retval = 0;
 500
 501         /* quick check to see if there are any async IO requests queued up */
 502         if ( aio_get_all_queues_count( ) < 1 ) {
 503                 error = EINVAL;
 504                 goto ExitRoutine;
 505         }
 506
 507         /* look for a match on our queue of async IO requests that have completed */
 508         TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
 509                 if ( entryp->uaiocbp == uap->aiocbp ) {
 510                         TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
 511                         aio_anchor.aio_done_count--;
 512                         p->aio_done_count--;
 513
 514                         *retval = entryp->returnval;
 515
 516                         /* we cannot free requests that are still completing */
 517                         if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
 518                                 vm_map_t                my_map;
 519
 520                                 my_map = entryp->aio_map;
 521                                 entryp->aio_map = VM_MAP_NULL;
 522                                 AIO_UNLOCK;
 523                                 lock_held = FALSE;
 524                                 aio_free_request( entryp, my_map );
 525                         }
 526                         else
 527                                 /* tell completion code to free this request */
 528                                 entryp->flags |= AIO_DO_FREE;
 529                         error = 0;
 530                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) | DBG_FUNC_NONE,
 531                                            (int)p, (int)uap->aiocbp, *retval, 0, 0 );
 532                         goto ExitRoutine;
 533                 }
 534         }
 535
 536         /* look for a match on our queue of active async IO requests */
 537         TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
 538                 if ( entryp->uaiocbp == uap->aiocbp ) {
 539                         error = EINPROGRESS;
 540                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) | DBG_FUNC_NONE,
 541                                            (int)p, (int)uap->aiocbp, *retval, 0, 0 );
 542                         goto ExitRoutine;
 543                 }
 544         }
 545
 546         /* look for a match on our queue of todo work */
 547         TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
 548                 if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
 549                         error = EINPROGRESS;
 550                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_workq)) | DBG_FUNC_NONE,
 551                                            (int)p, (int)uap->aiocbp, *retval, 0, 0 );
 552                         goto ExitRoutine;
 553                 }
 554         }
 555         error = EINVAL;
 556
 557 ExitRoutine:
 558         if ( lock_held )
 559                 AIO_UNLOCK;
 560         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_END,
 561                           (int)p, (int)uap->aiocbp, error, 0, 0 );
 562
 563         return( error );
 564
 565 } /* aio_return */
 566
 567
 568 /*
 569  * _aio_exec - internal function used to clean up async IO requests for
 570  * a process that is going away due to exec().  We cancel any async IOs
 571  * we can and wait for those already active.  We also disable signaling
 572  * for cancelled or active aio requests that complete.
 573  * This routine MAY block!
 574  */
 575
 576 __private_extern__ void
 577 _aio_exec( struct proc *p )
 578 {
 579
 580         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_START,
 581                           (int)p, 0, 0, 0, 0 );
 582
 583         _aio_exit( p );
 584
 585         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_END,
 586                           (int)p, 0, 0, 0, 0 );
 587
 588         return;
 589
 590 } /* _aio_exec */
 591
 592
 593 /*
 594  * _aio_exit - internal function used to clean up async IO requests for
 595  * a process that is terminating (via exit() or exec() ).  We cancel any async IOs
 596  * we can and wait for those already active.  We also disable signaling
 597  * for cancelled or active aio requests that complete.  This routine MAY block!
 598  */
 599
 600 __private_extern__ void
 601 _aio_exit( struct proc *p )
 602 {
 603         int                                             error, count;
 604         aio_workq_entry                 *entryp;
 605
 606         /* quick check to see if there are any async IO requests queued up */
 607         AIO_LOCK;
 608         count = aio_get_all_queues_count( );
 609         AIO_UNLOCK;
 610         if ( count < 1 ) {
 611                 return;
 612         }
 613
 614         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_START,
 615                           (int)p, 0, 0, 0, 0 );
 616
 617         /*
 618          * cancel async IO requests on the todo work queue and wait for those
 619          * already active to complete.
 620          */
 621         error = do_aio_cancel( p, 0, 0, TRUE, TRUE );
 622         if ( error == AIO_NOTCANCELED ) {
 623                 /*
 624                  * AIO_NOTCANCELED is returned when we find an aio request for this process
 625                  * on the active async IO queue.  Active requests cannot be cancelled so we
 626                  * must wait for them to complete.  We will get a special wake up call on
 627                  * our channel used to sleep for ALL active requests to complete.  This sleep
 628                  * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
 629                  * active aio requests.
 630                  */
 631
 632                 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) | DBG_FUNC_NONE,
 633                                   (int)p, 0, 0, 0, 0 );
 634
 635                 tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_exit", 0 );
 636         }
 637
 638         /* release all aio resources used by this process */
 639         AIO_LOCK;
 640         entryp = TAILQ_FIRST( &p->aio_doneq );
 641         while ( entryp != NULL ) {
 642                 aio_workq_entry                 *next_entryp;
 643
 644                 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
 645                 TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
 646                 aio_anchor.aio_done_count--;
 647                 p->aio_done_count--;
 648
 649                 /* we cannot free requests that are still completing */
 650                 if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
 651                         vm_map_t                my_map;
 652
 653                         my_map = entryp->aio_map;
 654                         entryp->aio_map = VM_MAP_NULL;
 655                         AIO_UNLOCK;
 656                         aio_free_request( entryp, my_map );
 657
 658                         /* need to start over since aio_doneq may have been */
 659                         /* changed while we were away.  */
 660                         AIO_LOCK;
 661                         entryp = TAILQ_FIRST( &p->aio_doneq );
 662                         continue;
 663                 }
 664                 else
 665                         /* tell completion code to free this request */
 666                         entryp->flags |= AIO_DO_FREE;
 667                 entryp = next_entryp;
 668         }
 669         AIO_UNLOCK;
 670
 671         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END,
 672                           (int)p, 0, 0, 0, 0 );
 673
 674         return;
 675
 676 } /* _aio_exit */
 677
 678
 679 /*
 680  * do_aio_cancel - cancel async IO requests (if possible).  We get called by
 681  * aio_cancel, close, and at exit.
 682  * There are three modes of operation: 1) cancel all async IOs for a process -
 683  * fd is 0 and aiocbp is NULL 2) cancel all async IOs for file descriptor - fd
 684  * is > 0 and aiocbp is NULL 3) cancel one async IO associated with the given
 685  * aiocbp.
 686  * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
 687  * target async IO requests, AIO_NOTCANCELED if we could not cancel all
 688  * target async IO requests, and AIO_ALLDONE if all target async IO requests
 689  * were already complete.
 690  * WARNING - do not deference aiocbp in this routine, it may point to user
 691  * land data that has not been copied in (when called from aio_cancel() )
 692  */
 693
 694 static int
 695 do_aio_cancel(  struct proc *p, int fd, user_addr_t aiocbp,
 696                                 boolean_t wait_for_completion, boolean_t disable_notification )
 697 {
 698         aio_workq_entry                 *entryp;
 699         int                                             result;
 700
 701         result = -1;
 702
 703         /* look for a match on our queue of async todo work. */
 704         AIO_LOCK;
 705         entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
 706         while ( entryp != NULL ) {
 707                 aio_workq_entry                 *next_entryp;
 708
 709                 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
 710                 if ( p == entryp->procp ) {
 711                         if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
 712                                  (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
 713                                  (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
 714                                 /* we found a match so we remove the entry from the */
 715                                 /* todo work queue and place it on the done queue */
 716                                 TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
 717                                 aio_anchor.aio_async_workq_count--;
 718                                 entryp->errorval = ECANCELED;
 719                                 entryp->returnval = -1;
 720                                 if ( disable_notification )
 721                                         entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
 722                                 result = AIO_CANCELED;
 723
 724                                 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) | DBG_FUNC_NONE,
 725                                                           (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
 726
 727                                 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
 728                                 aio_anchor.aio_done_count++;
 729                                 p->aio_done_count++;
 730                                 entryp->flags |= AIO_COMPLETION;
 731                                 AIO_UNLOCK;
 732
 733                                 /* do completion processing for this request */
 734                                 do_aio_completion( entryp );
 735
 736                                 AIO_LOCK;
 737                                 entryp->flags &= ~AIO_COMPLETION;
 738                                 if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
 739                                         vm_map_t                my_map;
 740
 741                                         my_map = entryp->aio_map;
 742                                         entryp->aio_map = VM_MAP_NULL;
 743                                         AIO_UNLOCK;
 744                                         aio_free_request( entryp, my_map );
 745                                 }
 746                                 else
 747                                         AIO_UNLOCK;
 748
 749                                 if ( aiocbp != USER_ADDR_NULL ) {
 750                                         return( result );
 751                                 }
 752
 753                                 /* need to start over since aio_async_workq may have been */
 754                                 /* changed while we were away doing completion processing.  */
 755                                 AIO_LOCK;
 756                                 entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
 757                                 continue;
 758                         }
 759                 }
 760                 entryp = next_entryp;
 761         } /* while... */
 762
 763         /*
 764          * look for a match on our queue of synchronous todo work.  This will
 765          * be a rare occurrence but could happen if a process is terminated while
 766          * processing a lio_listio call.
 767          */
 768         entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
 769         while ( entryp != NULL ) {
 770                 aio_workq_entry                 *next_entryp;
 771
 772                 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
 773                 if ( p == entryp->procp ) {
 774                         if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
 775                                  (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
 776                                  (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
 777                                 /* we found a match so we remove the entry from the */
 778                                 /* todo work queue and place it on the done queue */
 779                                 TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
 780                                 aio_anchor.lio_sync_workq_count--;
 781                                 entryp->errorval = ECANCELED;
 782                                 entryp->returnval = -1;
 783                                 if ( disable_notification )
 784                                         entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
 785                                 result = AIO_CANCELED;
 786
 787                                 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_sync_workq)) | DBG_FUNC_NONE,
 788                                                           (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
 789
 790                                 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
 791                                 aio_anchor.aio_done_count++;
 792                                 p->aio_done_count++;
 793                                 if ( aiocbp != USER_ADDR_NULL ) {
 794                                         AIO_UNLOCK;
 795                                         return( result );
 796                                 }
 797                         }
 798                 }
 799                 entryp = next_entryp;
 800         } /* while... */
 801
 802         /*
 803          * look for a match on our queue of active async IO requests and
 804          * return AIO_NOTCANCELED result.
 805          */
 806         TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
 807                 if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
 808                          (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
 809                          (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
 810                         result = AIO_NOTCANCELED;
 811
 812                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE,
 813                                                   (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
 814
 815                         if ( wait_for_completion )
 816                                 entryp->flags |= AIO_WAITING; /* flag for special completion processing */
 817                         if ( disable_notification )
 818                                 entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
 819                         if ( aiocbp != USER_ADDR_NULL ) {
 820                                 AIO_UNLOCK;
 821                                 return( result );
 822                         }
 823                 }
 824         }
 825
 826         /*
 827          * if we didn't find any matches on the todo or active queues then look for a
 828          * match on our queue of async IO requests that have completed and if found
 829          * return AIO_ALLDONE result.
 830          */
 831         if ( result == -1 ) {
 832                 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
 833                 if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
 834                          (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
 835                          (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
 836                                 result = AIO_ALLDONE;
 837
 838                                 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE,
 839                                                           (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
 840
 841                                 if ( aiocbp != USER_ADDR_NULL ) {
 842                                         AIO_UNLOCK;
 843                                         return( result );
 844                                 }
 845                         }
 846                 }
 847         }
 848         AIO_UNLOCK;
 849
 850         return( result );
 851
 852 } /* do_aio_cancel */
 853
 854
 855 /*
 856  * aio_suspend - suspend the calling thread until at least one of the async
 857  * IO operations referenced by uap->aiocblist has completed, until a signal
 858  * interrupts the function, or uap->timeoutp time interval (optional) has
 859  * passed.
 860  * Returns 0 if one or more async IOs have completed else -1 and errno is
 861  * set appropriately - EAGAIN if timeout elapses or EINTR if an interrupt
 862  * woke us up.
 863  */
 864
 865 int
 866 aio_suspend( struct proc *p, struct aio_suspend_args *uap, int *retval )
 867 {
 868         int                                     error;
 869         int                                     i, count;
 870         uint64_t                        abstime;
 871         struct user_timespec ts;
 872         aio_workq_entry         *entryp;
 873         user_addr_t                     *aiocbpp;
 874
 875         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START,
 876                           (int)p, uap->nent, 0, 0, 0 );
 877
 878         *retval = -1;
 879         abstime = 0;
 880         aiocbpp = NULL;
 881
 882         /* quick check to see if there are any async IO requests queued up */
 883         AIO_LOCK;
 884         count = aio_get_all_queues_count( );
 885         AIO_UNLOCK;
 886         if ( count < 1 ) {
 887                 error = EINVAL;
 888                 goto ExitThisRoutine;
 889         }
 890
 891         if ( uap->nent < 1 || uap->nent > aio_max_requests_per_process ) {
 892                 error = EINVAL;
 893                 goto ExitThisRoutine;
 894         }
 895
 896         if ( uap->timeoutp != USER_ADDR_NULL ) {
 897                 if ( proc_is64bit(p) ) {
 898                         error = copyin( uap->timeoutp, &ts, sizeof(ts) );
 899                 }
 900                 else {
 901                         struct timespec temp;
 902                         error = copyin( uap->timeoutp, &temp, sizeof(temp) );
 903                         if ( error == 0 ) {
 904                                 ts.tv_sec = temp.tv_sec;
 905                                 ts.tv_nsec = temp.tv_nsec;
 906                         }
 907                 }
 908                 if ( error != 0 ) {
 909                         error = EAGAIN;
 910                         goto ExitThisRoutine;
 911                 }
 912
 913                 if ( ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000 ) {
 914                         error = EINVAL;
 915                         goto ExitThisRoutine;
 916                 }
 917
 918                 nanoseconds_to_absolutetime( (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
 919                                                                          &abstime );
 920                 clock_absolutetime_interval_to_deadline( abstime, &abstime );
 921         }
 922
 923         /* we reserve enough space for largest possible pointer size */
 924         MALLOC( aiocbpp, user_addr_t *, (uap->nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK );
 925         if ( aiocbpp == NULL ) {
 926                 error = EAGAIN;
 927                 goto ExitThisRoutine;
 928         }
 929
 930         /* copyin our aiocb pointers from list */
 931         error = copyin( uap->aiocblist, aiocbpp,
 932                                         proc_is64bit(p) ? (uap->nent * sizeof(user_addr_t))
 933                                                                         : (uap->nent * sizeof(uintptr_t)) );
 934         if ( error != 0 ) {
 935                 error = EAGAIN;
 936                 goto ExitThisRoutine;
 937         }
 938
 939         /* we depend on a list of user_addr_t's so we need to munge and expand */
 940         /* when these pointers came from a 32-bit process */
 941         if ( !proc_is64bit(p) && sizeof(uintptr_t) < sizeof(user_addr_t) ) {
 942                 /* position to the last entry and work back from there */
 943                 uintptr_t       *my_ptrp = ((uintptr_t *)aiocbpp) + (uap->nent - 1);
 944                 user_addr_t *my_addrp = aiocbpp + (uap->nent - 1);
 945                 for (i = 0; i < uap->nent; i++, my_ptrp--, my_addrp--) {
 946                         *my_addrp = (user_addr_t) (*my_ptrp);
 947                 }
 948         }
 949
 950         /* check list of aio requests to see if any have completed */
 951         AIO_LOCK;
 952         for ( i = 0; i < uap->nent; i++ ) {
 953                 user_addr_t     aiocbp;
 954
 955                 /* NULL elements are legal so check for 'em */
 956                 aiocbp = *(aiocbpp + i);
 957                 if ( aiocbp == USER_ADDR_NULL )
 958                         continue;
 959
 960                 /* return immediately if any aio request in the list is done */
 961                 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
 962                         if ( entryp->uaiocbp == aiocbp ) {
 963                                 *retval = 0;
 964                                 error = 0;
 965                                 AIO_UNLOCK;
 966                                 goto ExitThisRoutine;
 967                         }
 968                 }
 969         } /* for ( ; i < uap->nent; ) */
 970
 971         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE,
 972                           (int)p, uap->nent, 0, 0, 0 );
 973
 974         /*
 975          * wait for an async IO to complete or a signal fires or timeout expires.
 976          * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
 977          * interrupts us.  If an async IO completes before a signal fires or our
 978          * timeout expires, we get a wakeup call from aio_work_thread().
 979          */
 980         assert_wait_deadline( (event_t) &p->AIO_SUSPEND_SLEEP_CHAN, THREAD_ABORTSAFE, abstime );
 981         AIO_UNLOCK;
 982
 983         error = thread_block( THREAD_CONTINUE_NULL );
 984
 985         if ( error == THREAD_AWAKENED ) {
 986                 /* got our wakeup call from aio_work_thread() */
 987                 *retval = 0;
 988                 error = 0;
 989         }
 990         else if ( error == THREAD_TIMED_OUT ) {
 991                 /* our timeout expired */
 992                 error = EAGAIN;
 993         }
 994         else {
 995                 /* we were interrupted */
 996                 error = EINTR;
 997         }
 998
 999 ExitThisRoutine:
1000         if ( aiocbpp != NULL )
1001                 FREE( aiocbpp, M_TEMP );
1002
1003         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_END,
1004                           (int)p, uap->nent, error, 0, 0 );
1005
1006         return( error );
1007
1008 } /* aio_suspend */
1009
1010
1011 /* aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
1012  * file descriptor (uap->aiocbp->aio_fildes) from the buffer
1013  * (uap->aiocbp->aio_buf).
1014  */
1015
1016 int
1017 aio_write( struct proc *p, struct aio_write_args *uap, int *retval )
1018 {
1019         int                     error;
1020
1021         *retval = 0;
1022
1023         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_START,
1024                           (int)p, (int)uap->aiocbp, 0, 0, 0 );
1025
1026         error = aio_queue_async_request( p, uap->aiocbp, AIO_WRITE );
1027         if ( error != 0 )
1028                 *retval = -1;
1029
1030         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_END,
1031                           (int)p, (int)uap->aiocbp, error, 0, 0 );
1032
1033         return( error );
1034
1035 } /* aio_write */
1036
1037
1038 /*
1039  * lio_listio - initiate a list of IO requests.  We process the list of aiocbs
1040  * either synchronously (mode == LIO_WAIT) or asynchronously (mode == LIO_NOWAIT).
1041  * The caller gets error and return status for each aiocb in the list via aio_error
1042  * and aio_return.  We must keep completed requests until released by the
1043  * aio_return call.
1044  */
1045
1046 int
1047 lio_listio( struct proc *p, struct lio_listio_args *uap, int *retval )
1048 {
1049         int                                                     i;
1050         int                                                     call_result;
1051         int                                                     result;
1052         long                                            group_tag;
1053         aio_workq_entry *                       *entryp_listp;
1054         user_addr_t                                     *aiocbpp;
1055
1056         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START,
1057                           (int)p, uap->nent, uap->mode, 0, 0 );
1058
1059         entryp_listp = NULL;
1060         aiocbpp = NULL;
1061         call_result = -1;
1062         *retval = -1;
1063         if ( !(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT) ) {
1064                 call_result = EINVAL;
1065                 goto ExitRoutine;
1066         }
1067
1068         if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
1069                 call_result = EINVAL;
1070                 goto ExitRoutine;
1071         }
1072
1073         /*
1074          * we use group_tag to mark IO requests for delayed completion processing
1075          * which means we wait until all IO requests in the group have completed
1076          * before we either return to the caller when mode is LIO_WAIT or signal
1077          * user when mode is LIO_NOWAIT.
1078          */
1079         group_tag = random();
1080
1081         /*
1082          * allocate a list of aio_workq_entry pointers that we will use to queue
1083          * up all our requests at once while holding our lock.
1084          */
1085         MALLOC( entryp_listp, void *, (uap->nent * sizeof(aio_workq_entry *)), M_TEMP, M_WAITOK );
1086         if ( entryp_listp == NULL ) {
1087                 call_result = EAGAIN;
1088                 goto ExitRoutine;
1089         }
1090
1091         /* we reserve enough space for largest possible pointer size */
1092         MALLOC( aiocbpp, user_addr_t *, (uap->nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK );
1093         if ( aiocbpp == NULL ) {
1094                 call_result = EAGAIN;
1095                 goto ExitRoutine;
1096         }
1097
1098         /* copyin our aiocb pointers from list */
1099         result = copyin( uap->aiocblist, aiocbpp,
1100                                         IS_64BIT_PROCESS(p) ? (uap->nent * sizeof(user_addr_t))
1101                                                                                 : (uap->nent * sizeof(uintptr_t)) );
1102         if ( result != 0 ) {
1103                 call_result = EAGAIN;
1104                 goto ExitRoutine;
1105         }
1106
1107         /* we depend on a list of user_addr_t's so we need to munge and expand */
1108         /* when these pointers came from a 32-bit process */
1109         if ( !IS_64BIT_PROCESS(p) && sizeof(uintptr_t) < sizeof(user_addr_t) ) {
1110                 /* position to the last entry and work back from there */
1111                 uintptr_t       *my_ptrp = ((uintptr_t *)aiocbpp) + (uap->nent - 1);
1112                 user_addr_t *my_addrp = aiocbpp + (uap->nent - 1);
1113                 for (i = 0; i < uap->nent; i++, my_ptrp--, my_addrp--) {
1114                         *my_addrp = (user_addr_t) (*my_ptrp);
1115                 }
1116         }
1117
1118         /* process list of aio requests */
1119         for ( i = 0; i < uap->nent; i++ ) {
1120                 user_addr_t my_aiocbp;
1121
1122                 *(entryp_listp + i) = NULL;
1123                 my_aiocbp = *(aiocbpp + i);
1124
1125                 /* NULL elements are legal so check for 'em */
1126                 if ( my_aiocbp == USER_ADDR_NULL )
1127                         continue;
1128
1129                 if ( uap->mode == LIO_NOWAIT )
1130                         result = lio_create_async_entry( p, my_aiocbp, uap->sigp,
1131                                                                                          group_tag, (entryp_listp + i) );
1132                 else
1133                         result = lio_create_sync_entry( p, my_aiocbp, group_tag,
1134                                                                                         (entryp_listp + i) );
1135
1136                 if ( result != 0 && call_result == -1 )
1137                         call_result = result;
1138         }
1139
1140         /*
1141          * we need to protect this section since we do not want any of these grouped
1142          * IO requests to begin until we have them all on the queue.
1143          */
1144         AIO_LOCK;
1145         for ( i = 0; i < uap->nent; i++ ) {
1146                 aio_workq_entry                         *entryp;
1147
1148                 /* NULL elements are legal so check for 'em */
1149                 entryp = *(entryp_listp + i);
1150                 if ( entryp == NULL )
1151                         continue;
1152
1153                 /* check our aio limits to throttle bad or rude user land behavior */
1154                 if ( aio_get_all_queues_count( ) >= aio_max_requests ||
1155                          aio_get_process_count( entryp->procp ) >= aio_max_requests_per_process ||
1156                          is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
1157                         vm_map_t                my_map;
1158
1159                         my_map = entryp->aio_map;
1160                         entryp->aio_map = VM_MAP_NULL;
1161                         if ( call_result == -1 )
1162                                 call_result = EAGAIN;
1163                         AIO_UNLOCK;
1164                         aio_free_request( entryp, my_map );
1165                         AIO_LOCK;
1166                         continue;
1167                 }
1168
1169                 /* place the request on the appropriate queue */
1170                 if ( uap->mode == LIO_NOWAIT ) {
1171                         TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1172                         aio_anchor.aio_async_workq_count++;
1173
1174                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
1175                                           (int)p, (int)entryp->uaiocbp, 0, 0, 0 );
1176                 }
1177                 else {
1178                         TAILQ_INSERT_TAIL( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
1179                         aio_anchor.lio_sync_workq_count++;
1180                 }
1181         }
1182
1183         if ( uap->mode == LIO_NOWAIT ) {
1184                 /* caller does not want to wait so we'll fire off a worker thread and return */
1185                 wakeup_one( (caddr_t) &aio_anchor.aio_async_workq );
1186         }
1187         else {
1188                 aio_workq_entry                 *entryp;
1189                 int                                     error;
1190
1191                 /*
1192                  * mode is LIO_WAIT - handle the IO requests now.
1193                  */
1194                 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
1195                 while ( entryp != NULL ) {
1196                         if ( p == entryp->procp && group_tag == entryp->group_tag ) {
1197
1198                                 TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
1199                                 aio_anchor.lio_sync_workq_count--;
1200                                 AIO_UNLOCK;
1201
1202                                 if ( (entryp->flags & AIO_READ) != 0 ) {
1203                                         error = do_aio_read( entryp );
1204                                 }
1205                                 else if ( (entryp->flags & AIO_WRITE) != 0 ) {
1206                                         error = do_aio_write( entryp );
1207                                 }
1208                                 else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1209                                         error = do_aio_fsync( entryp );
1210                                 }
1211                                 else {
1212                                         printf( "%s - unknown aio request - flags 0x%02X \n",
1213                                                         __FUNCTION__, entryp->flags );
1214                                         error = EINVAL;
1215                                 }
1216                                 entryp->errorval = error;
1217                                 if ( error != 0 && call_result == -1 )
1218                                         call_result = EIO;
1219
1220                                 AIO_LOCK;
1221                                 /* we're done with the IO request so move it on the done queue */
1222                                 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
1223                                 aio_anchor.aio_done_count++;
1224                                 p->aio_done_count++;
1225
1226                                 /* need to start over since lio_sync_workq may have been changed while we */
1227                                 /* were away doing the IO.  */
1228                                 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
1229                                 continue;
1230                         } /* p == entryp->procp */
1231
1232                         entryp = TAILQ_NEXT( entryp, aio_workq_link );
1233         } /* while ( entryp != NULL ) */
1234         } /* uap->mode == LIO_WAIT */
1235         AIO_UNLOCK;
1236
1237         /* call_result == -1 means we had no trouble queueing up requests */
1238         if ( call_result == -1 ) {
1239                 call_result = 0;
1240                 *retval = 0;
1241         }
1242
1243 ExitRoutine:
1244         if ( entryp_listp != NULL )
1245                 FREE( entryp_listp, M_TEMP );
1246         if ( aiocbpp != NULL )
1247                 FREE( aiocbpp, M_TEMP );
1248
1249         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END,
1250                           (int)p, call_result, 0, 0, 0 );
1251
1252         return( call_result );
1253
1254 } /* lio_listio */
1255
1256
1257 /*
1258  * aio worker thread.  this is where all the real work gets done.
1259  * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
1260  * after new work is queued up.
1261  */
1262
1263 static void
1264 aio_work_thread( void )
1265 {
1266         aio_workq_entry                 *entryp;
1267
1268         for( ;; ) {
1269                 AIO_LOCK;
1270                 entryp = aio_get_some_work();
1271         if ( entryp == NULL ) {
1272                 /*
1273                  * aio worker threads wait for some work to get queued up
1274                  * by aio_queue_async_request.  Once some work gets queued
1275                  * it will wake up one of these worker threads just before
1276                  * returning to our caller in user land.
1277                  */
1278                         assert_wait( (event_t) &aio_anchor.aio_async_workq, THREAD_UNINT );
1279                         AIO_UNLOCK;
1280
1281                         thread_block( (thread_continue_t)aio_work_thread );
1282                         /* NOT REACHED */
1283         }
1284                 else {
1285                         int                     error;
1286                         vm_map_t                currentmap;
1287                         vm_map_t                oldmap = VM_MAP_NULL;
1288                         task_t                  oldaiotask = TASK_NULL;
1289                         struct uthread  *uthreadp = NULL;
1290
1291                         AIO_UNLOCK;
1292
1293                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START,
1294                                                   (int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0 );
1295
1296                         /*
1297                          * Assume the target's address space identity for the duration
1298                          * of the IO.
1299                          */
1300                         currentmap = get_task_map( (current_proc())->task );
1301                         if ( currentmap != entryp->aio_map ) {
1302                                 uthreadp = (struct uthread *) get_bsdthread_info(current_thread());
1303                                 oldaiotask = uthreadp->uu_aio_task;
1304                                 uthreadp->uu_aio_task = entryp->procp->task;
1305                                 oldmap = vm_map_switch( entryp->aio_map );
1306                         }
1307
1308                         if ( (entryp->flags & AIO_READ) != 0 ) {
1309                                 error = do_aio_read( entryp );
1310                         }
1311                         else if ( (entryp->flags & AIO_WRITE) != 0 ) {
1312                                 error = do_aio_write( entryp );
1313                         }
1314                         else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1315                                 error = do_aio_fsync( entryp );
1316                         }
1317                         else {
1318                                 printf( "%s - unknown aio request - flags 0x%02X \n",
1319                                                 __FUNCTION__, entryp->flags );
1320                                 error = EINVAL;
1321                         }
1322                         entryp->errorval = error;
1323                         if ( currentmap != entryp->aio_map ) {
1324                                 (void) vm_map_switch( oldmap );
1325                                 uthreadp->uu_aio_task = oldaiotask;
1326                         }
1327
1328                         /* we're done with the IO request so pop it off the active queue and */
1329                         /* push it on the done queue */
1330                         AIO_LOCK;
1331                         TAILQ_REMOVE( &entryp->procp->aio_activeq, entryp, aio_workq_link );
1332                         aio_anchor.aio_active_count--;
1333                         entryp->procp->aio_active_count--;
1334                         TAILQ_INSERT_TAIL( &entryp->procp->aio_doneq, entryp, aio_workq_link );
1335                         aio_anchor.aio_done_count++;
1336                         entryp->procp->aio_done_count++;
1337                         entryp->flags |= AIO_COMPLETION;
1338
1339                         /* remove our reference to the user land map. */
1340                         if ( VM_MAP_NULL != entryp->aio_map ) {
1341                                 vm_map_t                my_map;
1342
1343                                 my_map = entryp->aio_map;
1344                                 entryp->aio_map = VM_MAP_NULL;
1345                                 AIO_UNLOCK;  /* must unlock before calling vm_map_deallocate() */
1346                                 vm_map_deallocate( my_map );
1347                         }
1348                         else {
1349                                 AIO_UNLOCK;
1350                         }
1351
1352                         do_aio_completion( entryp );
1353
1354                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END,
1355                                                   (int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval,
1356                                                   entryp->returnval, 0 );
1357
1358                         AIO_LOCK;
1359                         entryp->flags &= ~AIO_COMPLETION;
1360                         if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
1361                                 vm_map_t                my_map;
1362
1363                                 my_map = entryp->aio_map;
1364                                 entryp->aio_map = VM_MAP_NULL;
1365                                 AIO_UNLOCK;
1366                                 aio_free_request( entryp, my_map );
1367                         }
1368                         else
1369                                 AIO_UNLOCK;
1370                 }
1371         } /* for ( ;; ) */
1372
1373         /* NOT REACHED */
1374
1375 } /* aio_work_thread */
1376
1377
1378 /*
1379  * aio_get_some_work - get the next async IO request that is ready to be executed.
1380  * aio_fsync complicates matters a bit since we cannot do the fsync until all async
1381  * IO requests at the time the aio_fsync call came in have completed.
1382  * NOTE - AIO_LOCK must be held by caller
1383  */
1384
1385 static aio_workq_entry *
1386 aio_get_some_work( void )
1387 {
1388         aio_workq_entry                         *entryp;
1389
1390         /* pop some work off the work queue and add to our active queue */
1391         for ( entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
1392                   entryp != NULL;
1393                   entryp = TAILQ_NEXT( entryp, aio_workq_link ) ) {
1394
1395                 if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1396                         /* leave aio_fsync calls on the work queue if there are IO */
1397                         /* requests on the active queue for the same file descriptor. */
1398                         if ( aio_delay_fsync_request( entryp ) ) {
1399
1400                                 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) | DBG_FUNC_NONE,
1401                                                           (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1402                                 continue;
1403                         }
1404                 }
1405                 break;
1406         }
1407
1408         if ( entryp != NULL ) {
1409                 TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1410                 aio_anchor.aio_async_workq_count--;
1411                 TAILQ_INSERT_TAIL( &entryp->procp->aio_activeq, entryp, aio_workq_link );
1412                 aio_anchor.aio_active_count++;
1413                 entryp->procp->aio_active_count++;
1414         }
1415
1416         return( entryp );
1417
1418 } /* aio_get_some_work */
1419
1420
1421 /*
1422  * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed at
1423  * this time.  Delay will happen when there are any active IOs for the same file
1424  * descriptor that were queued at time the aio_sync call was queued.
1425  * NOTE - AIO_LOCK must be held by caller
1426  */
1427 static boolean_t
1428 aio_delay_fsync_request( aio_workq_entry *entryp )
1429 {
1430         aio_workq_entry                 *my_entryp;
1431
1432         TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1433                 if ( my_entryp->fsyncp != USER_ADDR_NULL &&
1434                          entryp->uaiocbp == my_entryp->fsyncp &&
1435                          entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1436                         return( TRUE );
1437                 }
1438         }
1439
1440         return( FALSE );
1441
1442 } /* aio_delay_fsync_request */
1443
1444
1445 /*
1446  * aio_queue_async_request - queue up an async IO request on our work queue then
1447  * wake up one of our worker threads to do the actual work.  We get a reference
1448  * to our caller's user land map in order to keep it around while we are
1449  * processing the request.
1450  */
1451
1452 static int
1453 aio_queue_async_request( struct proc *procp, user_addr_t aiocbp, int kindOfIO )
1454 {
1455         aio_workq_entry                 *entryp;
1456         int                                             result;
1457
1458         entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1459         if ( entryp == NULL ) {
1460                 result = EAGAIN;
1461                 goto error_exit;
1462         }
1463         bzero( entryp, sizeof(*entryp) );
1464
1465         /* fill in the rest of the aio_workq_entry */
1466         entryp->procp = procp;
1467         entryp->uaiocbp = aiocbp;
1468         entryp->flags |= kindOfIO;
1469         entryp->aio_map = VM_MAP_NULL;
1470
1471         if ( !IS_64BIT_PROCESS(procp) ) {
1472                 struct aiocb aiocb32;
1473
1474                 result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
1475                 if ( result == 0 )
1476                         do_munge_aiocb( &aiocb32, &entryp->aiocb );
1477         } else
1478                 result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1479
1480         if ( result != 0 ) {
1481                 result = EAGAIN;
1482                 goto error_exit;
1483         }
1484
1485         /* do some more validation on the aiocb and embedded file descriptor */
1486         result = aio_validate( entryp );
1487         if ( result != 0 )
1488                 goto error_exit;
1489
1490         /* get a reference to the user land map in order to keep it around */
1491         entryp->aio_map = get_task_map( procp->task );
1492         vm_map_reference( entryp->aio_map );
1493
1494         AIO_LOCK;
1495
1496         if ( is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
1497                 AIO_UNLOCK;
1498                 result = EAGAIN;
1499                 goto error_exit;
1500         }
1501
1502         /* check our aio limits to throttle bad or rude user land behavior */
1503         if ( aio_get_all_queues_count( ) >= aio_max_requests ||
1504                  aio_get_process_count( procp ) >= aio_max_requests_per_process ) {
1505                 AIO_UNLOCK;
1506                 result = EAGAIN;
1507                 goto error_exit;
1508         }
1509
1510         /*
1511          * aio_fsync calls sync up all async IO requests queued at the time
1512          * the aio_fsync call was made.  So we mark each currently queued async
1513          * IO with a matching file descriptor as must complete before we do the
1514          * fsync.  We set the fsyncp field of each matching async IO
1515          * request with the aiocb pointer passed in on the aio_fsync call to
1516          * know which IOs must complete before we process the aio_fsync call.
1517          */
1518         if ( (kindOfIO & AIO_FSYNC) != 0 )
1519                 aio_mark_requests( entryp );
1520
1521         /* queue up on our aio asynchronous work queue */
1522         TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1523         aio_anchor.aio_async_workq_count++;
1524
1525         wakeup_one( (caddr_t) &aio_anchor.aio_async_workq );
1526         AIO_UNLOCK;
1527
1528         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
1529                           (int)procp, (int)aiocbp, 0, 0, 0 );
1530
1531         return( 0 );
1532
1533 error_exit:
1534         if ( entryp != NULL ) {
1535                 /* this entry has not been queued up so no worries about unlocked */
1536                 /* state and aio_map */
1537                 aio_free_request( entryp, entryp->aio_map );
1538         }
1539
1540         return( result );
1541
1542 } /* aio_queue_async_request */
1543
1544
1545 /*
1546  * lio_create_async_entry - allocate an aio_workq_entry and fill it in.
1547  * If all goes well return 0 and pass the aio_workq_entry pointer back to
1548  * our caller.  We get a reference to our caller's user land map in order to keep
1549  * it around while we are processing the request.
1550  * lio_listio calls behave differently at completion they do completion notification
1551  * when all async IO requests have completed.  We use group_tag to tag IO requests
1552  * that behave in the delay notification manner.
1553  */
1554
1555 static int
1556 lio_create_async_entry( struct proc *procp, user_addr_t aiocbp,
1557                                                  user_addr_t sigp, long group_tag,
1558                                                  aio_workq_entry **entrypp )
1559 {
1560         aio_workq_entry                         *entryp;
1561         int                                                     result;
1562
1563         entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1564         if ( entryp == NULL ) {
1565                 result = EAGAIN;
1566                 goto error_exit;
1567         }
1568         bzero( entryp, sizeof(*entryp) );
1569
1570         /* fill in the rest of the aio_workq_entry */
1571         entryp->procp = procp;
1572         entryp->uaiocbp = aiocbp;
1573         entryp->flags |= AIO_LIO;
1574         entryp->group_tag = group_tag;
1575         entryp->aio_map = VM_MAP_NULL;
1576
1577         if ( !IS_64BIT_PROCESS(procp) ) {
1578                 struct aiocb aiocb32;
1579
1580                 result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
1581                 if ( result == 0 )
1582                         do_munge_aiocb( &aiocb32, &entryp->aiocb );
1583         } else
1584                 result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1585
1586         if ( result != 0 ) {
1587                 result = EAGAIN;
1588                 goto error_exit;
1589         }
1590
1591         /* look for lio_listio LIO_NOP requests and ignore them. */
1592         /* Not really an error, but we need to free our aio_workq_entry.  */
1593         if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
1594                 result = 0;
1595                 goto error_exit;
1596         }
1597
1598         /* use sigevent passed in to lio_listio for each of our calls, but only */
1599         /* do completion notification after the last request completes. */
1600         if ( sigp != USER_ADDR_NULL ) {
1601                 if ( !IS_64BIT_PROCESS(procp) ) {
1602                         struct sigevent sigevent32;
1603
1604                         result = copyin( sigp, &sigevent32, sizeof(sigevent32) );
1605                         if ( result == 0 ) {
1606                                 /* also need to munge aio_sigevent since it contains pointers */
1607                                 /* special case here.  since we do not know if sigev_value is an */
1608                                 /* int or a ptr we do NOT cast the ptr to a user_addr_t.   This  */
1609                                 /* means if we send this info back to user space we need to remember */
1610                                 /* sigev_value was not expanded for the 32-bit case.  */
1611                                 /* NOTE - this does NOT affect us since we don't support sigev_value */
1612                                 /* yet in the aio context.  */
1613                                 //LP64
1614                                 entryp->aiocb.aio_sigevent.sigev_notify = sigevent32.sigev_notify;
1615                                 entryp->aiocb.aio_sigevent.sigev_signo = sigevent32.sigev_signo;
1616                                 entryp->aiocb.aio_sigevent.sigev_value.size_equivalent.sival_int =
1617                                         sigevent32.sigev_value.sival_int;
1618                                 entryp->aiocb.aio_sigevent.sigev_notify_function =
1619                                         CAST_USER_ADDR_T(sigevent32.sigev_notify_function);
1620                                 entryp->aiocb.aio_sigevent.sigev_notify_attributes =
1621                                         CAST_USER_ADDR_T(sigevent32.sigev_notify_attributes);
1622                         }
1623                 } else
1624                         result = copyin( sigp, &entryp->aiocb.aio_sigevent, sizeof(entryp->aiocb.aio_sigevent) );
1625
1626                 if ( result != 0 ) {
1627                         result = EAGAIN;
1628                         goto error_exit;
1629                 }
1630         }
1631
1632         /* do some more validation on the aiocb and embedded file descriptor */
1633         result = aio_validate( entryp );
1634         if ( result != 0 )
1635                 goto error_exit;
1636
1637         /* get a reference to the user land map in order to keep it around */
1638         entryp->aio_map = get_task_map( procp->task );
1639         vm_map_reference( entryp->aio_map );
1640
1641         *entrypp = entryp;
1642         return( 0 );
1643
1644 error_exit:
1645         if ( entryp != NULL )
1646                 zfree( aio_workq_zonep, entryp );
1647
1648         return( result );
1649
1650 } /* lio_create_async_entry */
1651
1652
1653 /*
1654  * aio_mark_requests - aio_fsync calls synchronize file data for all queued async IO
1655  * requests at the moment the aio_fsync call is queued.  We use aio_workq_entry.fsyncp
1656  * to mark each async IO that must complete before the fsync is done.  We use the uaiocbp
1657  * field from the aio_fsync call as the aio_workq_entry.fsyncp in marked requests.
1658  * NOTE - AIO_LOCK must be held by caller
1659  */
1660
1661 static void
1662 aio_mark_requests( aio_workq_entry *entryp )
1663 {
1664         aio_workq_entry                 *my_entryp;
1665
1666         TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1667                 if ( entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1668                         my_entryp->fsyncp = entryp->uaiocbp;
1669                 }
1670         }
1671
1672         TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
1673                 if ( entryp->procp == my_entryp->procp &&
1674                          entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1675                         my_entryp->fsyncp = entryp->uaiocbp;
1676                 }
1677         }
1678
1679 } /* aio_mark_requests */
1680
1681
1682 /*
1683  * lio_create_sync_entry - allocate an aio_workq_entry and fill it in.
1684  * If all goes well return 0 and pass the aio_workq_entry pointer back to
1685  * our caller.
1686  * lio_listio calls behave differently at completion they do completion notification
1687  * when all async IO requests have completed.  We use group_tag to tag IO requests
1688  * that behave in the delay notification manner.
1689  */
1690
1691 static int
1692 lio_create_sync_entry( struct proc *procp, user_addr_t aiocbp,
1693                                                 long group_tag, aio_workq_entry **entrypp )
1694 {
1695         aio_workq_entry                         *entryp;
1696         int                                                     result;
1697
1698         entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1699         if ( entryp == NULL ) {
1700                 result = EAGAIN;
1701                 goto error_exit;
1702         }
1703         bzero( entryp, sizeof(*entryp) );
1704
1705         /* fill in the rest of the aio_workq_entry */
1706         entryp->procp = procp;
1707         entryp->uaiocbp = aiocbp;
1708         entryp->flags |= AIO_LIO;
1709         entryp->group_tag = group_tag;
1710         entryp->aio_map = VM_MAP_NULL;
1711
1712         if ( !IS_64BIT_PROCESS(procp) ) {
1713                 struct aiocb aiocb32;
1714
1715                 result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
1716                 if ( result == 0 )
1717                         do_munge_aiocb( &aiocb32, &entryp->aiocb );
1718         } else
1719                 result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1720
1721         if ( result != 0 ) {
1722                 result = EAGAIN;
1723                 goto error_exit;
1724         }
1725
1726         /* look for lio_listio LIO_NOP requests and ignore them. */
1727         /* Not really an error, but we need to free our aio_workq_entry.  */
1728         if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
1729                 result = 0;
1730                 goto error_exit;
1731         }
1732
1733         result = aio_validate( entryp );
1734         if ( result != 0 ) {
1735                 goto error_exit;
1736         }
1737
1738         *entrypp = entryp;
1739         return( 0 );
1740
1741 error_exit:
1742         if ( entryp != NULL )
1743                 zfree( aio_workq_zonep, entryp );
1744
1745         return( result );
1746
1747 } /* lio_create_sync_entry */
1748
1749
1750 /*
1751  * aio_free_request - remove our reference on the user land map and
1752  * free the work queue entry resources.
1753  * We are not holding the lock here thus aio_map is passed in and
1754  * zeroed while we did have the lock.
1755  */
1756
1757 static int
1758 aio_free_request( aio_workq_entry *entryp, vm_map_t the_map )
1759 {
1760         /* remove our reference to the user land map. */
1761         if ( VM_MAP_NULL != the_map ) {
1762                 vm_map_deallocate( the_map );
1763         }
1764
1765         zfree( aio_workq_zonep, entryp );
1766
1767         return( 0 );
1768
1769 } /* aio_free_request */
1770
1771
1772 /* aio_validate - validate the aiocb passed in by one of the aio syscalls.
1773  */
1774
1775 static int
1776 aio_validate( aio_workq_entry *entryp )
1777 {
1778         struct fileproc                                 *fp;
1779         int                                                     flag;
1780         int                                                     result;
1781
1782         result = 0;
1783
1784         if ( (entryp->flags & AIO_LIO) != 0 ) {
1785                 if ( entryp->aiocb.aio_lio_opcode == LIO_READ )
1786                         entryp->flags |= AIO_READ;
1787                 else if ( entryp->aiocb.aio_lio_opcode == LIO_WRITE )
1788                         entryp->flags |= AIO_WRITE;
1789                 else if ( entryp->aiocb.aio_lio_opcode == LIO_NOP )
1790                         return( 0 );
1791                 else
1792                         return( EINVAL );
1793         }
1794
1795         flag = FREAD;
1796         if ( (entryp->flags & (AIO_WRITE | AIO_FSYNC)) != 0 ) {
1797                 flag = FWRITE;
1798         }
1799
1800         if ( (entryp->flags & (AIO_READ | AIO_WRITE)) != 0 ) {
1801                 // LP64todo - does max value for aio_nbytes need to grow?
1802                 if ( entryp->aiocb.aio_nbytes > INT_MAX         ||
1803                          entryp->aiocb.aio_buf == USER_ADDR_NULL ||
1804                          entryp->aiocb.aio_offset < 0 )
1805                         return( EINVAL );
1806         }
1807
1808         /* validate aiocb.aio_sigevent.  at this point we only support sigev_notify
1809          * equal to SIGEV_SIGNAL or SIGEV_NONE.  this means sigev_value,
1810          * sigev_notify_function, and sigev_notify_attributes are ignored.
1811          */
1812         if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ) {
1813                 int             signum;
1814                 /* make sure we have a valid signal number */
1815                 signum = entryp->aiocb.aio_sigevent.sigev_signo;
1816                 if ( signum <= 0 || signum >= NSIG ||
1817                          signum == SIGKILL || signum == SIGSTOP )
1818                         return (EINVAL);
1819         }
1820         else if ( entryp->aiocb.aio_sigevent.sigev_notify != SIGEV_NONE )
1821                 return (EINVAL);
1822
1823         /* validate the file descriptor and that the file was opened
1824          * for the appropriate read / write access.
1825          */
1826         proc_fdlock(entryp->procp);
1827
1828         result = fp_lookup( entryp->procp, entryp->aiocb.aio_fildes, &fp , 1);
1829         if ( result == 0 ) {
1830                 if ( (fp->f_fglob->fg_flag & flag) == 0 ) {
1831                         /* we don't have read or write access */
1832                         result = EBADF;
1833                 }
1834                 else if ( fp->f_fglob->fg_type != DTYPE_VNODE ) {
1835                         /* this is not a file */
1836                         result = ESPIPE;
1837                 } else
1838                         fp->f_flags |= FP_AIOISSUED;
1839
1840                 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp , 1);
1841         }
1842         else {
1843                 result = EBADF;
1844         }
1845
1846         proc_fdunlock(entryp->procp);
1847
1848         return( result );
1849
1850 } /* aio_validate */
1851
1852
1853 /*
1854  * aio_get_process_count - runs through our queues that hold outstanding
1855  * async IO reqests and totals up number of requests for the given
1856  * process.
1857  * NOTE - caller must hold aio lock!
1858  */
1859
1860 static int
1861 aio_get_process_count( struct proc *procp )
1862 {
1863         aio_workq_entry                         *entryp;
1864         int                                                     count;
1865
1866         /* begin with count of completed async IO requests for this process */
1867         count = procp->aio_done_count;
1868
1869         /* add in count of active async IO requests for this process */
1870         count += procp->aio_active_count;
1871
1872         /* look for matches on our queue of asynchronous todo work */
1873         TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
1874                 if ( procp == entryp->procp ) {
1875                         count++;
1876                 }
1877         }
1878
1879         /* look for matches on our queue of synchronous todo work */
1880         TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
1881                 if ( procp == entryp->procp ) {
1882                         count++;
1883                 }
1884         }
1885
1886         return( count );
1887
1888 } /* aio_get_process_count */
1889
1890
1891 /*
1892  * aio_get_all_queues_count - get total number of entries on all aio work queues.
1893  * NOTE - caller must hold aio lock!
1894  */
1895
1896 static int
1897 aio_get_all_queues_count( void )
1898 {
1899         int                                                     count;
1900
1901         count = aio_anchor.aio_async_workq_count;
1902         count += aio_anchor.lio_sync_workq_count;
1903         count += aio_anchor.aio_active_count;
1904         count += aio_anchor.aio_done_count;
1905
1906         return( count );
1907
1908 } /* aio_get_all_queues_count */
1909
1910
1911 /*
1912  * do_aio_completion.  Handle async IO completion.
1913  */
1914
1915 static void
1916 do_aio_completion( aio_workq_entry *entryp )
1917 {
1918         /* signal user land process if appropriate */
1919         if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
1920                  (entryp->flags & AIO_DISABLE) == 0 ) {
1921
1922                 /*
1923                  * if group_tag is non zero then make sure this is the last IO request
1924                  * in the group before we signal.
1925                  */
1926                 if ( entryp->group_tag == 0 ||
1927                          (entryp->group_tag != 0 && aio_last_group_io( entryp )) ) {
1928                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) | DBG_FUNC_NONE,
1929                                                   (int)entryp->procp, (int)entryp->uaiocbp,
1930                                                   entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 );
1931
1932                         psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );
1933                         return;
1934                 }
1935         }
1936
1937         /*
1938          * need to handle case where a process is trying to exit, exec, or close
1939          * and is currently waiting for active aio requests to complete.  If
1940          * AIO_WAITING is set then we need to look to see if there are any
1941          * other requests in the active queue for this process.  If there are
1942          * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel.  If
1943          * there are some still active then do nothing - we only want to wakeup
1944          * when all active aio requests for the process are complete.
1945          */
1946         if ( (entryp->flags & AIO_WAITING) != 0 ) {
1947                 int             active_requests;
1948
1949                 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
1950                                           (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1951
1952                 AIO_LOCK;
1953                 active_requests = aio_active_requests_for_process( entryp->procp );
1954                 //AIO_UNLOCK;
1955                 if ( active_requests < 1 ) {
1956                         /* no active aio requests for this process, continue exiting */
1957                         wakeup_one( (caddr_t) &entryp->procp->AIO_CLEANUP_SLEEP_CHAN );
1958
1959                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
1960                                                   (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1961                 }
1962                 AIO_UNLOCK;
1963                 return;
1964         }
1965
1966         /*
1967          * aio_suspend case when a signal was not requested.  In that scenario we
1968          * are sleeping on the AIO_SUSPEND_SLEEP_CHAN channel.
1969          * NOTE - the assumption here is that this wakeup call is inexpensive.
1970          * we really only need to do this when an aio_suspend call is pending.
1971          * If we find the wakeup call should be avoided we could mark the
1972          * async IO requests given in the list provided by aio_suspend and only
1973          * call wakeup for them.  If we do mark them we should unmark them after
1974          * the aio_suspend wakes up.
1975          */
1976         AIO_LOCK;
1977         wakeup_one( (caddr_t) &entryp->procp->AIO_SUSPEND_SLEEP_CHAN );
1978         AIO_UNLOCK;
1979
1980         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE,
1981                                   (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1982
1983         return;
1984
1985 } /* do_aio_completion */
1986
1987
1988 /*
1989  * aio_last_group_io - checks to see if this is the last unfinished IO request
1990  * for the given group_tag.  Returns TRUE if there are no other active IO
1991  * requests for this group or FALSE if the are active IO requests
1992  * NOTE - AIO_LOCK must be held by caller
1993  */
1994
1995 static boolean_t
1996 aio_last_group_io( aio_workq_entry *entryp )
1997 {
1998         aio_workq_entry                         *my_entryp;
1999
2000         /* look for matches on our queue of active async IO requests */
2001         TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
2002                 if ( my_entryp->group_tag == entryp->group_tag )
2003                         return( FALSE );
2004         }
2005
2006         /* look for matches on our queue of asynchronous todo work */
2007         TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
2008                 if ( my_entryp->group_tag == entryp->group_tag )
2009                         return( FALSE );
2010         }
2011
2012         /* look for matches on our queue of synchronous todo work */
2013         TAILQ_FOREACH( my_entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
2014                 if ( my_entryp->group_tag == entryp->group_tag )
2015                         return( FALSE );
2016         }
2017
2018         return( TRUE );
2019
2020 } /* aio_last_group_io */
2021
2022
2023 /*
2024  * do_aio_read
2025  */
2026 static int
2027 do_aio_read( aio_workq_entry *entryp )
2028 {
2029         struct fileproc                         *fp;
2030         int                                             error;
2031
2032         if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp , 0)) )
2033                 return(error);
2034         if ( (fp->f_fglob->fg_flag & FREAD) == 0 ) {
2035                 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2036                 return(EBADF);
2037         }
2038         if ( fp != NULL ) {
2039                 error = dofileread( entryp->procp, fp, entryp->aiocb.aio_fildes,
2040                                                         entryp->aiocb.aio_buf,
2041                                                         entryp->aiocb.aio_nbytes,
2042                                                         entryp->aiocb.aio_offset, FOF_OFFSET,
2043                                                         &entryp->returnval );
2044                 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2045         }
2046         else {
2047                 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2048                 error = EBADF;
2049         }
2050
2051         return( error );
2052
2053 } /* do_aio_read */
2054
2055
2056 /*
2057  * do_aio_write
2058  */
2059 static int
2060 do_aio_write( aio_workq_entry *entryp )
2061 {
2062         struct fileproc                 *fp;
2063         int                                             error;
2064
2065         if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp , 0)) )
2066                 return(error);
2067         if ( (fp->f_fglob->fg_flag & FWRITE) == 0 ) {
2068                 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2069                 return(EBADF);
2070         }
2071         if ( fp != NULL ) {
2072                 /* NB: tell dofilewrite the offset, and to use the proc cred */
2073                 error = dofilewrite( entryp->procp,
2074                                      fp,
2075                                      entryp->aiocb.aio_fildes,
2076                                      entryp->aiocb.aio_buf,
2077                                      entryp->aiocb.aio_nbytes,
2078                                      entryp->aiocb.aio_offset,
2079                                      FOF_OFFSET | FOF_PCRED,
2080                                      &entryp->returnval);
2081
2082                 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2083         }
2084         else {
2085                 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2086                 error = EBADF;
2087         }
2088
2089         return( error );
2090
2091 } /* do_aio_write */
2092
2093
2094 /*
2095  * aio_active_requests_for_process - return number of active async IO
2096  * requests for the given process.
2097  * NOTE - caller must hold aio lock!
2098  */
2099
2100 static int
2101 aio_active_requests_for_process( struct proc *procp )
2102 {
2103
2104         return( procp->aio_active_count );
2105
2106 } /* aio_active_requests_for_process */
2107
2108
2109 /*
2110  * do_aio_fsync
2111  */
2112 static int
2113 do_aio_fsync( aio_workq_entry *entryp )
2114 {
2115         struct vfs_context      context;
2116         struct vnode            *vp;
2117         struct fileproc         *fp;
2118         int                                     error;
2119
2120         /*
2121          * NOTE - we will not support AIO_DSYNC until fdatasync() is supported.
2122          * AIO_DSYNC is caught before we queue up a request and flagged as an error.
2123          * The following was shamelessly extracted from fsync() implementation.
2124          */
2125
2126         error = fp_getfvp( entryp->procp, entryp->aiocb.aio_fildes, &fp, &vp);
2127         if ( error == 0 ) {
2128                 if ( (error = vnode_getwithref(vp)) ) {
2129                         fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2130                         entryp->returnval = -1;
2131                         return(error);
2132                 }
2133                 context.vc_proc = entryp->procp;
2134                 context.vc_ucred = fp->f_fglob->fg_cred;
2135
2136                 error = VNOP_FSYNC( vp, MNT_WAIT, &context);
2137
2138                 (void)vnode_put(vp);
2139
2140                 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2141         }
2142         if ( error != 0 )
2143                 entryp->returnval = -1;
2144
2145         return( error );
2146
2147 } /* do_aio_fsync */
2148
2149
2150 /*
2151  * is_already_queued - runs through our queues to see if the given
2152  * aiocbp / process is there.  Returns TRUE if there is a match
2153  * on any of our aio queues.
2154  * NOTE - callers must hold aio lock!
2155  */
2156
2157 static boolean_t
2158 is_already_queued(      struct proc *procp,
2159                                         user_addr_t aiocbp )
2160 {
2161         aio_workq_entry                 *entryp;
2162         boolean_t                               result;
2163
2164         result = FALSE;
2165
2166         /* look for matches on our queue of async IO requests that have completed */
2167         TAILQ_FOREACH( entryp, &procp->aio_doneq, aio_workq_link ) {
2168                 if ( aiocbp == entryp->uaiocbp ) {
2169                         result = TRUE;
2170                         goto ExitThisRoutine;
2171                 }
2172         }
2173
2174         /* look for matches on our queue of active async IO requests */
2175         TAILQ_FOREACH( entryp, &procp->aio_activeq, aio_workq_link ) {
2176                 if ( aiocbp == entryp->uaiocbp ) {
2177                         result = TRUE;
2178                         goto ExitThisRoutine;
2179                 }
2180         }
2181
2182         /* look for matches on our queue of asynchronous todo work */
2183         TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
2184                 if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
2185                         result = TRUE;
2186                         goto ExitThisRoutine;
2187                 }
2188         }
2189
2190         /* look for matches on our queue of synchronous todo work */
2191         TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
2192                 if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
2193                         result = TRUE;
2194                         goto ExitThisRoutine;
2195                 }
2196         }
2197
2198 ExitThisRoutine:
2199         return( result );
2200
2201 } /* is_already_queued */
2202
2203
2204 /*
2205  * aio initialization
2206  */
2207 __private_extern__ void
2208 aio_init( void )
2209 {
2210         int                     i;
2211
2212         aio_lock_grp_attr = lck_grp_attr_alloc_init();
2213         aio_lock_grp = lck_grp_alloc_init("aio", aio_lock_grp_attr);
2214         aio_lock_attr = lck_attr_alloc_init();
2215
2216         aio_lock = lck_mtx_alloc_init(aio_lock_grp, aio_lock_attr);
2217
2218         AIO_LOCK;
2219         TAILQ_INIT( &aio_anchor.aio_async_workq );
2220         TAILQ_INIT( &aio_anchor.lio_sync_workq );
2221         aio_anchor.aio_async_workq_count = 0;
2222         aio_anchor.lio_sync_workq_count = 0;
2223         aio_anchor.aio_active_count = 0;
2224         aio_anchor.aio_done_count = 0;
2225         AIO_UNLOCK;
2226
2227         i = sizeof( aio_workq_entry );
2228         aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );
2229
2230         _aio_create_worker_threads( aio_worker_threads );
2231
2232         return;
2233
2234 } /* aio_init */
2235
2236
2237 /*
2238  * aio worker threads created here.
2239  */
2240 __private_extern__ void
2241 _aio_create_worker_threads( int num )
2242 {
2243         int                     i;
2244
2245         /* create some worker threads to handle the async IO requests */
2246         for ( i = 0; i < num; i++ ) {
2247                 thread_t                myThread;
2248
2249                 myThread = kernel_thread( kernel_task, aio_work_thread );
2250                 if ( THREAD_NULL == myThread ) {
2251                         printf( "%s - failed to create a work thread \n", __FUNCTION__ );
2252                 }
2253         }
2254
2255         return;
2256
2257 } /* _aio_create_worker_threads */
2258
2259 /*
2260  * Return the current activation utask
2261  */
2262 task_t
2263 get_aiotask(void)
2264 {
2265         return  ((struct uthread *)get_bsdthread_info(current_thread()))->uu_aio_task;
2266 }
2267
2268
2269 /*
2270  * In the case of an aiocb from a
2271  * 32-bit process we need to expand some longs and pointers to the correct
2272  * sizes in order to let downstream code always work on the same type of
2273  * aiocb (in our case that is a user_aiocb)
2274  */
2275 static void
2276 do_munge_aiocb( struct aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
2277 {
2278         the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2279         the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2280         the_user_aiocbp->aio_buf = CAST_USER_ADDR_T(my_aiocbp->aio_buf);
2281         the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2282         the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2283         the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
2284
2285         /* special case here.  since we do not know if sigev_value is an */
2286         /* int or a ptr we do NOT cast the ptr to a user_addr_t.   This  */
2287         /* means if we send this info back to user space we need to remember */
2288         /* sigev_value was not expanded for the 32-bit case.  */
2289         /* NOTE - this does NOT affect us since we don't support sigev_value */
2290         /* yet in the aio context.  */
2291         //LP64
2292         the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2293         the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
2294         the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2295                 my_aiocbp->aio_sigevent.sigev_value.sival_int;
2296         the_user_aiocbp->aio_sigevent.sigev_notify_function =
2297                 CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function);
2298         the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2299                 CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_attributes);
2300 }