bsd/kern/kern_aio.c (xnu-792.10.96)
1 /*
2 * Copyright (c) 2003-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22
23
24 /*
25 * todo:
26 * 1) ramesh is looking into how to replace taking a reference on
27 * the user's map (vm_map_reference()) since it is believed that
28 * it would not hold the process for us.
29 * 2) david is looking into a way for us to set the priority of the
30 * worker threads to match that of the user's thread when the
31 * async IO was queued.
32 */
33
34
35 /*
36 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
37 */
38
39 #include <sys/systm.h>
40 #include <sys/fcntl.h>
41 #include <sys/file_internal.h>
42 #include <sys/filedesc.h>
43 #include <sys/kernel.h>
44 #include <sys/vnode_internal.h>
45 #include <sys/malloc.h>
46 #include <sys/mount_internal.h>
47 #include <sys/param.h>
48 #include <sys/proc_internal.h>
49 #include <sys/sysctl.h>
50 #include <sys/unistd.h>
51 #include <sys/user.h>
52
53 #include <sys/aio_kern.h>
54 #include <sys/sysproto.h>
55
56 #include <machine/limits.h>
57
58 #include <mach/mach_types.h>
59 #include <kern/kern_types.h>
60 #include <kern/zalloc.h>
61 #include <kern/task.h>
62 #include <kern/sched_prim.h>
63
64 #include <vm/vm_map.h>
65
66 #include <sys/kdebug.h>
67 #define AIO_work_queued 1
68 #define AIO_worker_wake 2
69 #define AIO_completion_sig 3
70 #define AIO_completion_cleanup_wait 4
71 #define AIO_completion_cleanup_wake 5
72 #define AIO_completion_suspend_wake 6
73 #define AIO_fsync_delay 7
74 #define AIO_cancel 10
75 #define AIO_cancel_async_workq 11
76 #define AIO_cancel_sync_workq 12
77 #define AIO_cancel_activeq 13
78 #define AIO_cancel_doneq 14
79 #define AIO_fsync 20
80 #define AIO_read 30
81 #define AIO_write 40
82 #define AIO_listio 50
83 #define AIO_error 60
84 #define AIO_error_val 61
85 #define AIO_error_activeq 62
86 #define AIO_error_workq 63
87 #define AIO_return 70
88 #define AIO_return_val 71
89 #define AIO_return_activeq 72
90 #define AIO_return_workq 73
91 #define AIO_exec 80
92 #define AIO_exit 90
93 #define AIO_exit_sleep 91
94 #define AIO_close 100
95 #define AIO_close_sleep 101
96 #define AIO_suspend 110
97 #define AIO_suspend_sleep 111
98 #define AIO_worker_thread 120
99
100 #if 0
101 #undef KERNEL_DEBUG
102 #define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
103 #endif
104
105 /*
106 * aio requests queue up on the aio_async_workq or lio_sync_workq (for
107 * lio_listio LIO_WAIT). Requests then move to the per process aio_activeq
108 * (proc.aio_activeq) when one of our worker threads starts the IO.
109 * And finally, requests move to the per process aio_doneq (proc.aio_doneq)
110 * when the IO request completes. The request remains on aio_doneq until
111 * the user process calls aio_return or the process exits; either way, that is our
112 * trigger to release aio resources.
113 */
114 struct aio_anchor_cb
115 {
116 int aio_async_workq_count; /* entries on aio_async_workq */
117 int lio_sync_workq_count; /* entries on lio_sync_workq */
118 int aio_active_count; /* entries on all active queues (proc.aio_activeq) */
119 int aio_done_count; /* entries on all done queues (proc.aio_doneq) */
120 TAILQ_HEAD( , aio_workq_entry ) aio_async_workq;
121 TAILQ_HEAD( , aio_workq_entry ) lio_sync_workq;
122 };
123 typedef struct aio_anchor_cb aio_anchor_cb;
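
/*
 * Illustrative user-space sketch of the lifecycle described above: aio_read()
 * places a request on aio_async_workq, a worker thread moves it to the per
 * process aio_activeq while the IO runs, completion moves it to aio_doneq,
 * and aio_return() is the trigger that releases the aio resources.  The fd
 * and buf names below are assumptions made for the example.
 *
 *	struct aiocb cb;
 *	ssize_t nread;
 *
 *	memset( &cb, 0, sizeof(cb) );
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = sizeof(buf);
 *	cb.aio_offset = 0;
 *
 *	if ( aio_read( &cb ) == 0 ) {
 *		while ( aio_error( &cb ) == EINPROGRESS )
 *			;
 *		nread = aio_return( &cb );
 *	}
 */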
124
125
126 /*
127 * Notes on aio sleep / wake channels.
128 * We currently pick a couple of fields within the proc structure that give us
129 * sleep channels that currently do not collide with any other kernel routines.
130 * At this time, for binary compatibility reasons, we cannot create new proc fields.
131 */
132 #define AIO_SUSPEND_SLEEP_CHAN p_estcpu
133 #define AIO_CLEANUP_SLEEP_CHAN p_pctcpu
134
135
136 /*
137 * async IO locking macros used to protect critical sections.
138 */
139 #define AIO_LOCK lck_mtx_lock(aio_lock)
140 #define AIO_UNLOCK lck_mtx_unlock(aio_lock)
141
142
143 /*
144 * LOCAL PROTOTYPES
145 */
146 static int aio_active_requests_for_process( struct proc *procp );
147 static boolean_t aio_delay_fsync_request( aio_workq_entry *entryp );
148 static int aio_free_request( aio_workq_entry *entryp, vm_map_t the_map );
149 static int aio_get_all_queues_count( void );
150 static int aio_get_process_count( struct proc *procp );
151 static aio_workq_entry * aio_get_some_work( void );
152 static boolean_t aio_last_group_io( aio_workq_entry *entryp );
153 static void aio_mark_requests( aio_workq_entry *entryp );
154 static int aio_queue_async_request( struct proc *procp,
155 user_addr_t aiocbp,
156 int kindOfIO );
157 static int aio_validate( aio_workq_entry *entryp );
158 static void aio_work_thread( void );
159 static int do_aio_cancel( struct proc *p,
160 int fd,
161 user_addr_t aiocbp,
162 boolean_t wait_for_completion,
163 boolean_t disable_notification );
164 static void do_aio_completion( aio_workq_entry *entryp );
165 static int do_aio_fsync( aio_workq_entry *entryp );
166 static int do_aio_read( aio_workq_entry *entryp );
167 static int do_aio_write( aio_workq_entry *entryp );
168 static void do_munge_aiocb( struct aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
169 static boolean_t is_already_queued( struct proc *procp,
170 user_addr_t aiocbp );
171 static int lio_create_async_entry( struct proc *procp,
172 user_addr_t aiocbp,
173 user_addr_t sigp,
174 long group_tag,
175 aio_workq_entry **entrypp );
176 static int lio_create_sync_entry( struct proc *procp,
177 user_addr_t aiocbp,
178 long group_tag,
179 aio_workq_entry **entrypp );
180
181
182 /*
183 * EXTERNAL PROTOTYPES
184 */
185
186 /* in ...bsd/kern/sys_generic.c */
187 extern int dofileread( struct proc *p, struct fileproc *fp, int fd,
188 user_addr_t bufp, user_size_t nbyte,
189 off_t offset, int flags, user_ssize_t *retval );
190 extern int dofilewrite( struct proc *p, struct fileproc *fp, int fd,
191 user_addr_t bufp, user_size_t nbyte, off_t offset,
192 int flags, user_ssize_t *retval );
193
194 /*
195 * aio external global variables.
196 */
197 extern int aio_max_requests; /* AIO_MAX - configurable */
198 extern int aio_max_requests_per_process; /* AIO_PROCESS_MAX - configurable */
199 extern int aio_worker_threads; /* AIO_THREAD_COUNT - configurable */
200
201
202 /*
203 * aio static variables.
204 */
205 static aio_anchor_cb aio_anchor;
206 static lck_mtx_t * aio_lock;
207 static lck_grp_t * aio_lock_grp;
208 static lck_attr_t * aio_lock_attr;
209 static lck_grp_attr_t * aio_lock_grp_attr;
210 static struct zone *aio_workq_zonep;
211
212
213
214
215 /*
216 * aio_cancel - attempt to cancel one or more async IO requests currently
217 * outstanding against file descriptor uap->fd. If uap->aiocbp is not
218 * NULL then only one specific IO is cancelled (if possible). If uap->aiocbp
219 * is NULL then all outstanding async IO requests for the given file
220 * descriptor are cancelled (if possible).
221 */
222
223 int
224 aio_cancel( struct proc *p, struct aio_cancel_args *uap, int *retval )
225 {
226 struct user_aiocb my_aiocb;
227 int result;
228
229 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START,
230 (int)p, (int)uap->aiocbp, 0, 0, 0 );
231
232 /* quick check to see if there are any async IO requests queued up */
233 AIO_LOCK;
234 result = aio_get_all_queues_count( );
235 AIO_UNLOCK;
236 if ( result < 1 ) {
237 result = EBADF;
238 goto ExitRoutine;
239 }
240
241 *retval = -1;
242 if ( uap->aiocbp != USER_ADDR_NULL ) {
243 if ( !IS_64BIT_PROCESS(p) ) {
244 struct aiocb aiocb32;
245
246 result = copyin( uap->aiocbp, &aiocb32, sizeof(aiocb32) );
247 if ( result == 0 )
248 do_munge_aiocb( &aiocb32, &my_aiocb );
249 } else
250 result = copyin( uap->aiocbp, &my_aiocb, sizeof(my_aiocb) );
251
252 if ( result != 0 ) {
253 result = EAGAIN;
254 goto ExitRoutine;
255 }
256
257 /* NOTE - POSIX standard says a mismatch between the file */
258 /* descriptor passed in and the file descriptor embedded in */
259 /* the aiocb causes unspecified results. We return EBADF in */
260 /* that situation. */
261 if ( uap->fd != my_aiocb.aio_fildes ) {
262 result = EBADF;
263 goto ExitRoutine;
264 }
265 }
266 result = do_aio_cancel( p, uap->fd, uap->aiocbp, FALSE, FALSE );
267
268 if ( result != -1 ) {
269 *retval = result;
270 result = 0;
271 goto ExitRoutine;
272 }
273
274 result = EBADF;
275
276 ExitRoutine:
277 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END,
278 (int)p, (int)uap->aiocbp, result, 0, 0 );
279
280 return( result );
281
282 } /* aio_cancel */
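
/*
 * Illustrative user-space sketch of the two forms handled above: passing an
 * aiocb cancels that one request, passing NULL cancels everything outstanding
 * on the descriptor.  When the cancel succeeds, aio_error() reports ECANCELED
 * and aio_return() reaps the entry off the done queue.  The fd and cb names
 * are assumptions made for the example.
 *
 *	int res;
 *
 *	res = aio_cancel( fd, &cb );
 *	if ( res == AIO_CANCELED || res == AIO_ALLDONE ) {
 *		(void) aio_return( &cb );
 *	}
 *	else if ( res == AIO_NOTCANCELED ) {
 *		while ( aio_error( &cb ) == EINPROGRESS )
 *			;
 *		(void) aio_return( &cb );
 *	}
 *
 *	(void) aio_cancel( fd, NULL );
 */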
283
284
285 /*
286 * _aio_close - internal function used to clean up async IO requests for
287 * a file descriptor that is closing.
288 * THIS MAY BLOCK.
289 */
290
291 __private_extern__ void
292 _aio_close( struct proc *p, int fd )
293 {
294 int error, count;
295
296 /* quick check to see if there are any async IO requests queued up */
297 AIO_LOCK;
298 count = aio_get_all_queues_count( );
299 AIO_UNLOCK;
300 if ( count < 1 )
301 return;
302
303 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_START,
304 (int)p, fd, 0, 0, 0 );
305
306 /* cancel all async IO requests on our todo queues for this file descriptor */
307 error = do_aio_cancel( p, fd, 0, TRUE, FALSE );
308 if ( error == AIO_NOTCANCELED ) {
309 /*
310 * AIO_NOTCANCELED is returned when we find an aio request for this process
311 * and file descriptor on the active async IO queue. Active requests cannot
312 * be cancelled so we must wait for them to complete. We will get a special
313 * wake up call on our channel used to sleep for ALL active requests to
314 * complete. This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
315 * when we must wait for all active aio requests.
316 */
317
318 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) | DBG_FUNC_NONE,
319 (int)p, fd, 0, 0, 0 );
320
321 tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_close", 0 );
322 }
323
324 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_END,
325 (int)p, fd, 0, 0, 0 );
326
327 return;
328
329 } /* _aio_close */
330
331
332 /*
333 * aio_error - return the error status associated with the async IO
334 * request referred to by uap->aiocbp. The error status is the errno
335 * value that would be set by the corresponding IO request (read, write,
336 * fdatasync, or fsync).
337 */
338
339 int
340 aio_error( struct proc *p, struct aio_error_args *uap, int *retval )
341 {
342 aio_workq_entry *entryp;
343 int error;
344
345 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_START,
346 (int)p, (int)uap->aiocbp, 0, 0, 0 );
347
348 AIO_LOCK;
349
350 /* quick check to see if there are any async IO requests queued up */
351 if ( aio_get_all_queues_count( ) < 1 ) {
352 error = EINVAL;
353 goto ExitRoutine;
354 }
355
356 /* look for a match on our queue of async IO requests that have completed */
357 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
358 if ( entryp->uaiocbp == uap->aiocbp ) {
359 *retval = entryp->errorval;
360 error = 0;
361 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) | DBG_FUNC_NONE,
362 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
363 goto ExitRoutine;
364 }
365 }
366
367 /* look for a match on our queue of active async IO requests */
368 TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
369 if ( entryp->uaiocbp == uap->aiocbp ) {
370 *retval = EINPROGRESS;
371 error = 0;
372 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) | DBG_FUNC_NONE,
373 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
374 goto ExitRoutine;
375 }
376 }
377
378 /* look for a match on our queue of todo work */
379 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
380 if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
381 *retval = EINPROGRESS;
382 error = 0;
383 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_workq)) | DBG_FUNC_NONE,
384 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
385 goto ExitRoutine;
386 }
387 }
388 error = EINVAL;
389
390 ExitRoutine:
391 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_END,
392 (int)p, (int)uap->aiocbp, error, 0, 0 );
393 AIO_UNLOCK;
394
395 return( error );
396
397 } /* aio_error */
398
399
400 /*
401 * aio_fsync - asynchronously force all IO operations associated
402 * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
403 * queued at the time of the call to the synchronized completion state.
404 * NOTE - we do not support op O_DSYNC at this point since we do not support the
405 * fdatasync() call.
406 */
407
408 int
409 aio_fsync( struct proc *p, struct aio_fsync_args *uap, int *retval )
410 {
411 int error;
412 int fsync_kind;
413
414 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_START,
415 (int)p, (int)uap->aiocbp, uap->op, 0, 0 );
416
417 *retval = 0;
418 /* 0 := O_SYNC for binary backward compatibility with Panther */
419 if (uap->op == O_SYNC || uap->op == 0)
420 fsync_kind = AIO_FSYNC;
421 #if 0 // we don't support fdatasync() call yet
422 else if ( uap->op == O_DSYNC )
423 fsync_kind = AIO_DSYNC;
424 #endif
425 else {
426 *retval = -1;
427 error = EINVAL;
428 goto ExitRoutine;
429 }
430
431 error = aio_queue_async_request( p, uap->aiocbp, fsync_kind );
432 if ( error != 0 )
433 *retval = -1;
434
435 ExitRoutine:
436 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_END,
437 (int)p, (int)uap->aiocbp, error, 0, 0 );
438
439 return( error );
440
441 } /* aio_fsync */
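
/*
 * Illustrative user-space sketch of queueing the fsync handled above.  Only
 * O_SYNC (or 0, for Panther binary compatibility) is accepted as the op;
 * O_DSYNC fails with EINVAL since fdatasync() is not supported.  The fd and
 * cb names are assumptions made for the example.
 *
 *	struct aiocb cb;
 *
 *	memset( &cb, 0, sizeof(cb) );
 *	cb.aio_fildes = fd;
 *	if ( aio_fsync( O_SYNC, &cb ) == 0 ) {
 *		while ( aio_error( &cb ) == EINPROGRESS )
 *			;
 *		(void) aio_return( &cb );
 *	}
 */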
442
443
444 /* aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
445 * file descriptor (uap->aiocbp->aio_fildes) into the buffer
446 * (uap->aiocbp->aio_buf).
447 */
448
449 int
450 aio_read( struct proc *p, struct aio_read_args *uap, int *retval )
451 {
452 int error;
453
454 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_START,
455 (int)p, (int)uap->aiocbp, 0, 0, 0 );
456
457 *retval = 0;
458
459 error = aio_queue_async_request( p, uap->aiocbp, AIO_READ );
460 if ( error != 0 )
461 *retval = -1;
462
463 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_END,
464 (int)p, (int)uap->aiocbp, error, 0, 0 );
465
466 return( error );
467
468 } /* aio_read */
469
470
471 /*
472 * aio_return - return the return status associated with the async IO
473 * request referred to by uap->aiocbp. The return status is the value
474 * that would be returned by the corresponding IO request (read, write,
475 * fdatasync, or fsync). This is where we release the kernel resources
476 * held for the async IO call associated with the given aiocb pointer.
477 */
478
479 int
480 aio_return( struct proc *p, struct aio_return_args *uap, user_ssize_t *retval )
481 {
482 aio_workq_entry *entryp;
483 int error;
484 boolean_t lock_held;
485
486 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_START,
487 (int)p, (int)uap->aiocbp, 0, 0, 0 );
488
489 AIO_LOCK;
490 lock_held = TRUE;
491 *retval = 0;
492
493 /* quick check to see if there are any async IO requests queued up */
494 if ( aio_get_all_queues_count( ) < 1 ) {
495 error = EINVAL;
496 goto ExitRoutine;
497 }
498
499 /* look for a match on our queue of async IO requests that have completed */
500 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
501 if ( entryp->uaiocbp == uap->aiocbp ) {
502 TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
503 aio_anchor.aio_done_count--;
504 p->aio_done_count--;
505
506 *retval = entryp->returnval;
507
508 /* we cannot free requests that are still completing */
509 if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
510 vm_map_t my_map;
511
512 my_map = entryp->aio_map;
513 entryp->aio_map = VM_MAP_NULL;
514 AIO_UNLOCK;
515 lock_held = FALSE;
516 aio_free_request( entryp, my_map );
517 }
518 else
519 /* tell completion code to free this request */
520 entryp->flags |= AIO_DO_FREE;
521 error = 0;
522 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) | DBG_FUNC_NONE,
523 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
524 goto ExitRoutine;
525 }
526 }
527
528 /* look for a match on our queue of active async IO requests */
529 TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
530 if ( entryp->uaiocbp == uap->aiocbp ) {
531 error = EINPROGRESS;
532 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) | DBG_FUNC_NONE,
533 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
534 goto ExitRoutine;
535 }
536 }
537
538 /* look for a match on our queue of todo work */
539 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
540 if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
541 error = EINPROGRESS;
542 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_workq)) | DBG_FUNC_NONE,
543 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
544 goto ExitRoutine;
545 }
546 }
547 error = EINVAL;
548
549 ExitRoutine:
550 if ( lock_held )
551 AIO_UNLOCK;
552 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_END,
553 (int)p, (int)uap->aiocbp, error, 0, 0 );
554
555 return( error );
556
557 } /* aio_return */
558
559
560 /*
561 * _aio_exec - internal function used to clean up async IO requests for
562 * a process that is going away due to exec(). We cancel any async IOs
563 * we can and wait for those already active. We also disable signaling
564 * for cancelled or active aio requests that complete.
565 * This routine MAY block!
566 */
567
568 __private_extern__ void
569 _aio_exec( struct proc *p )
570 {
571
572 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_START,
573 (int)p, 0, 0, 0, 0 );
574
575 _aio_exit( p );
576
577 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_END,
578 (int)p, 0, 0, 0, 0 );
579
580 return;
581
582 } /* _aio_exec */
583
584
585 /*
586 * _aio_exit - internal function used to clean up async IO requests for
587 * a process that is terminating (via exit() or exec() ). We cancel any async IOs
588 * we can and wait for those already active. We also disable signaling
589 * for cancelled or active aio requests that complete. This routine MAY block!
590 */
591
592 __private_extern__ void
593 _aio_exit( struct proc *p )
594 {
595 int error, count;
596 aio_workq_entry *entryp;
597
598 /* quick check to see if there are any async IO requests queued up */
599 AIO_LOCK;
600 count = aio_get_all_queues_count( );
601 AIO_UNLOCK;
602 if ( count < 1 ) {
603 return;
604 }
605
606 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_START,
607 (int)p, 0, 0, 0, 0 );
608
609 /*
610 * cancel async IO requests on the todo work queue and wait for those
611 * already active to complete.
612 */
613 error = do_aio_cancel( p, 0, 0, TRUE, TRUE );
614 if ( error == AIO_NOTCANCELED ) {
615 /*
616 * AIO_NOTCANCELED is returned when we find an aio request for this process
617 * on the active async IO queue. Active requests cannot be cancelled so we
618 * must wait for them to complete. We will get a special wake up call on
619 * our channel used to sleep for ALL active requests to complete. This sleep
620 * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
621 * active aio requests.
622 */
623
624 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) | DBG_FUNC_NONE,
625 (int)p, 0, 0, 0, 0 );
626
627 tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_exit", 0 );
628 }
629
630 /* release all aio resources used by this process */
631 AIO_LOCK;
632 entryp = TAILQ_FIRST( &p->aio_doneq );
633 while ( entryp != NULL ) {
634 aio_workq_entry *next_entryp;
635
636 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
637 TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
638 aio_anchor.aio_done_count--;
639 p->aio_done_count--;
640
641 /* we cannot free requests that are still completing */
642 if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
643 vm_map_t my_map;
644
645 my_map = entryp->aio_map;
646 entryp->aio_map = VM_MAP_NULL;
647 AIO_UNLOCK;
648 aio_free_request( entryp, my_map );
649
650 /* need to start over since aio_doneq may have been */
651 /* changed while we were away. */
652 AIO_LOCK;
653 entryp = TAILQ_FIRST( &p->aio_doneq );
654 continue;
655 }
656 else
657 /* tell completion code to free this request */
658 entryp->flags |= AIO_DO_FREE;
659 entryp = next_entryp;
660 }
661 AIO_UNLOCK;
662
663 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END,
664 (int)p, 0, 0, 0, 0 );
665
666 return;
667
668 } /* _aio_exit */
669
670
671 /*
672 * do_aio_cancel - cancel async IO requests (if possible). We get called by
673 * aio_cancel, close, and at exit.
674 * There are three modes of operation: 1) cancel all async IOs for a process
675 * (fd is 0 and aiocbp is NULL); 2) cancel all async IOs for a file descriptor
676 * (fd is > 0 and aiocbp is NULL); 3) cancel the one async IO associated with
677 * the given aiocbp.
678 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
679 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
680 * target async IO requests, and AIO_ALLDONE if all target async IO requests
681 * were already complete.
682 * WARNING - do not dereference aiocbp in this routine; it may point to user
683 * land data that has not been copied in (when called from aio_cancel() )
684 */
685
686 static int
687 do_aio_cancel( struct proc *p, int fd, user_addr_t aiocbp,
688 boolean_t wait_for_completion, boolean_t disable_notification )
689 {
690 aio_workq_entry *entryp;
691 int result;
692
693 result = -1;
694
695 /* look for a match on our queue of async todo work. */
696 AIO_LOCK;
697 entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
698 while ( entryp != NULL ) {
699 aio_workq_entry *next_entryp;
700
701 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
702 if ( p == entryp->procp ) {
703 if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
704 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
705 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
706 /* we found a match so we remove the entry from the */
707 /* todo work queue and place it on the done queue */
708 TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
709 aio_anchor.aio_async_workq_count--;
710 entryp->errorval = ECANCELED;
711 entryp->returnval = -1;
712 if ( disable_notification )
713 entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
714 result = AIO_CANCELED;
715
716 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) | DBG_FUNC_NONE,
717 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
718
719 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
720 aio_anchor.aio_done_count++;
721 p->aio_done_count++;
722 entryp->flags |= AIO_COMPLETION;
723 AIO_UNLOCK;
724
725 /* do completion processing for this request */
726 do_aio_completion( entryp );
727
728 AIO_LOCK;
729 entryp->flags &= ~AIO_COMPLETION;
730 if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
731 vm_map_t my_map;
732
733 my_map = entryp->aio_map;
734 entryp->aio_map = VM_MAP_NULL;
735 AIO_UNLOCK;
736 aio_free_request( entryp, my_map );
737 }
738 else
739 AIO_UNLOCK;
740
741 if ( aiocbp != USER_ADDR_NULL ) {
742 return( result );
743 }
744
745 /* need to start over since aio_async_workq may have been */
746 /* changed while we were away doing completion processing. */
747 AIO_LOCK;
748 entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
749 continue;
750 }
751 }
752 entryp = next_entryp;
753 } /* while... */
754
755 /*
756 * look for a match on our queue of synchronous todo work. This will
757 * be a rare occurrence but could happen if a process is terminated while
758 * processing a lio_listio call.
759 */
760 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
761 while ( entryp != NULL ) {
762 aio_workq_entry *next_entryp;
763
764 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
765 if ( p == entryp->procp ) {
766 if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
767 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
768 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
769 /* we found a match so we remove the entry from the */
770 /* todo work queue and place it on the done queue */
771 TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
772 aio_anchor.lio_sync_workq_count--;
773 entryp->errorval = ECANCELED;
774 entryp->returnval = -1;
775 if ( disable_notification )
776 entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
777 result = AIO_CANCELED;
778
779 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_sync_workq)) | DBG_FUNC_NONE,
780 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
781
782 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
783 aio_anchor.aio_done_count++;
784 p->aio_done_count++;
785 if ( aiocbp != USER_ADDR_NULL ) {
786 AIO_UNLOCK;
787 return( result );
788 }
789 }
790 }
791 entryp = next_entryp;
792 } /* while... */
793
794 /*
795 * look for a match on our queue of active async IO requests and
796 * return AIO_NOTCANCELED result.
797 */
798 TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
799 if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
800 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
801 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
802 result = AIO_NOTCANCELED;
803
804 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE,
805 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
806
807 if ( wait_for_completion )
808 entryp->flags |= AIO_WAITING; /* flag for special completion processing */
809 if ( disable_notification )
810 entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
811 if ( aiocbp != USER_ADDR_NULL ) {
812 AIO_UNLOCK;
813 return( result );
814 }
815 }
816 }
817
818 /*
819 * if we didn't find any matches on the todo or active queues then look for a
820 * match on our queue of async IO requests that have completed and if found
821 * return AIO_ALLDONE result.
822 */
823 if ( result == -1 ) {
824 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
825 if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
826 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
827 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
828 result = AIO_ALLDONE;
829
830 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE,
831 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
832
833 if ( aiocbp != USER_ADDR_NULL ) {
834 AIO_UNLOCK;
835 return( result );
836 }
837 }
838 }
839 }
840 AIO_UNLOCK;
841
842 return( result );
843
844 } /* do_aio_cancel */
845
846
847 /*
848 * aio_suspend - suspend the calling thread until at least one of the async
849 * IO operations referenced by uap->aiocblist has completed, until a signal
850 * interrupts the function, or uap->timeoutp time interval (optional) has
851 * passed.
852 * Returns 0 if one or more async IOs have completed, else -1 and errno is
853 * set appropriately - EAGAIN if the timeout elapses or EINTR if a signal
854 * woke us up.
855 */
856
857 int
858 aio_suspend( struct proc *p, struct aio_suspend_args *uap, int *retval )
859 {
860 int error;
861 int i, count;
862 uint64_t abstime;
863 struct user_timespec ts;
864 aio_workq_entry *entryp;
865 user_addr_t *aiocbpp;
866
867 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START,
868 (int)p, uap->nent, 0, 0, 0 );
869
870 *retval = -1;
871 abstime = 0;
872 aiocbpp = NULL;
873
874 /* quick check to see if there are any async IO requests queued up */
875 AIO_LOCK;
876 count = aio_get_all_queues_count( );
877 AIO_UNLOCK;
878 if ( count < 1 ) {
879 error = EINVAL;
880 goto ExitThisRoutine;
881 }
882
883 if ( uap->nent < 1 || uap->nent > aio_max_requests_per_process ) {
884 error = EINVAL;
885 goto ExitThisRoutine;
886 }
887
888 if ( uap->timeoutp != USER_ADDR_NULL ) {
889 if ( proc_is64bit(p) ) {
890 error = copyin( uap->timeoutp, &ts, sizeof(ts) );
891 }
892 else {
893 struct timespec temp;
894 error = copyin( uap->timeoutp, &temp, sizeof(temp) );
895 if ( error == 0 ) {
896 ts.tv_sec = temp.tv_sec;
897 ts.tv_nsec = temp.tv_nsec;
898 }
899 }
900 if ( error != 0 ) {
901 error = EAGAIN;
902 goto ExitThisRoutine;
903 }
904
905 if ( ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000 ) {
906 error = EINVAL;
907 goto ExitThisRoutine;
908 }
909
910 nanoseconds_to_absolutetime( (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
911 &abstime );
912 clock_absolutetime_interval_to_deadline( abstime, &abstime );
913 }
914
915 /* we reserve enough space for largest possible pointer size */
916 MALLOC( aiocbpp, user_addr_t *, (uap->nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK );
917 if ( aiocbpp == NULL ) {
918 error = EAGAIN;
919 goto ExitThisRoutine;
920 }
921
922 /* copyin our aiocb pointers from list */
923 error = copyin( uap->aiocblist, aiocbpp,
924 proc_is64bit(p) ? (uap->nent * sizeof(user_addr_t))
925 : (uap->nent * sizeof(uintptr_t)) );
926 if ( error != 0 ) {
927 error = EAGAIN;
928 goto ExitThisRoutine;
929 }
930
931 /* we depend on a list of user_addr_t's so we need to munge and expand */
932 /* when these pointers came from a 32-bit process */
933 if ( !proc_is64bit(p) && sizeof(uintptr_t) < sizeof(user_addr_t) ) {
934 /* position to the last entry and work back from there */
935 uintptr_t *my_ptrp = ((uintptr_t *)aiocbpp) + (uap->nent - 1);
936 user_addr_t *my_addrp = aiocbpp + (uap->nent - 1);
937 for (i = 0; i < uap->nent; i++, my_ptrp--, my_addrp--) {
938 *my_addrp = (user_addr_t) (*my_ptrp);
939 }
940 }
941
942 /* check list of aio requests to see if any have completed */
943 AIO_LOCK;
944 for ( i = 0; i < uap->nent; i++ ) {
945 user_addr_t aiocbp;
946
947 /* NULL elements are legal so check for 'em */
948 aiocbp = *(aiocbpp + i);
949 if ( aiocbp == USER_ADDR_NULL )
950 continue;
951
952 /* return immediately if any aio request in the list is done */
953 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
954 if ( entryp->uaiocbp == aiocbp ) {
955 *retval = 0;
956 error = 0;
957 AIO_UNLOCK;
958 goto ExitThisRoutine;
959 }
960 }
961 } /* for ( ; i < uap->nent; ) */
962
963 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE,
964 (int)p, uap->nent, 0, 0, 0 );
965
966 /*
967 * wait for an async IO to complete or a signal fires or timeout expires.
968 * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
969 * interrupts us. If an async IO completes before a signal fires or our
970 * timeout expires, we get a wakeup call from aio_work_thread().
971 */
972 assert_wait_deadline( (event_t) &p->AIO_SUSPEND_SLEEP_CHAN, THREAD_ABORTSAFE, abstime );
973 AIO_UNLOCK;
974
975 error = thread_block( THREAD_CONTINUE_NULL );
976
977 if ( error == THREAD_AWAKENED ) {
978 /* got our wakeup call from aio_work_thread() */
979 *retval = 0;
980 error = 0;
981 }
982 else if ( error == THREAD_TIMED_OUT ) {
983 /* our timeout expired */
984 error = EAGAIN;
985 }
986 else {
987 /* we were interrupted */
988 error = EINTR;
989 }
990
991 ExitThisRoutine:
992 if ( aiocbpp != NULL )
993 FREE( aiocbpp, M_TEMP );
994
995 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_END,
996 (int)p, uap->nent, error, 0, 0 );
997
998 return( error );
999
1000 } /* aio_suspend */
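
/*
 * Illustrative user-space sketch of the wait implemented above: block on a
 * list of outstanding aiocbs until one completes, a signal arrives, or a
 * five second timeout expires.  The cb name and the two handler functions
 * are assumptions made for the example.
 *
 *	const struct aiocb *list[1] = { &cb };
 *	struct timespec ts = { 5, 0 };
 *
 *	if ( aio_suspend( list, 1, &ts ) == 0 )
 *		(void) aio_return( &cb );
 *	else if ( errno == EAGAIN )
 *		handle_timeout();
 *	else if ( errno == EINTR )
 *		handle_signal();
 */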
1001
1002
1003 /* aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
1004 * file descriptor (uap->aiocbp->aio_fildes) from the buffer
1005 * (uap->aiocbp->aio_buf).
1006 */
1007
1008 int
1009 aio_write( struct proc *p, struct aio_write_args *uap, int *retval )
1010 {
1011 int error;
1012
1013 *retval = 0;
1014
1015 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_START,
1016 (int)p, (int)uap->aiocbp, 0, 0, 0 );
1017
1018 error = aio_queue_async_request( p, uap->aiocbp, AIO_WRITE );
1019 if ( error != 0 )
1020 *retval = -1;
1021
1022 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_END,
1023 (int)p, (int)uap->aiocbp, error, 0, 0 );
1024
1025 return( error );
1026
1027 } /* aio_write */
1028
1029
1030 /*
1031 * lio_listio - initiate a list of IO requests. We process the list of aiocbs
1032 * either synchronously (mode == LIO_WAIT) or asynchronously (mode == LIO_NOWAIT).
1033 * The caller gets error and return status for each aiocb in the list via aio_error
1034 * and aio_return. We must keep completed requests until released by the
1035 * aio_return call.
1036 */
1037
1038 int
1039 lio_listio( struct proc *p, struct lio_listio_args *uap, int *retval )
1040 {
1041 int i;
1042 int call_result;
1043 int result;
1044 long group_tag;
1045 aio_workq_entry * *entryp_listp;
1046 user_addr_t *aiocbpp;
1047
1048 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START,
1049 (int)p, uap->nent, uap->mode, 0, 0 );
1050
1051 entryp_listp = NULL;
1052 aiocbpp = NULL;
1053 call_result = -1;
1054 *retval = -1;
1055 if ( !(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT) ) {
1056 call_result = EINVAL;
1057 goto ExitRoutine;
1058 }
1059
1060 if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
1061 call_result = EINVAL;
1062 goto ExitRoutine;
1063 }
1064
1065 /*
1066 * we use group_tag to mark IO requests for delayed completion processing
1067 * which means we wait until all IO requests in the group have completed
1068 * before we either return to the caller when mode is LIO_WAIT or signal
1069 * the user when mode is LIO_NOWAIT.
1070 */
1071 group_tag = random();
1072
1073 /*
1074 * allocate a list of aio_workq_entry pointers that we will use to queue
1075 * up all our requests at once while holding our lock.
1076 */
1077 MALLOC( entryp_listp, void *, (uap->nent * sizeof(aio_workq_entry *)), M_TEMP, M_WAITOK );
1078 if ( entryp_listp == NULL ) {
1079 call_result = EAGAIN;
1080 goto ExitRoutine;
1081 }
1082
1083 /* we reserve enough space for largest possible pointer size */
1084 MALLOC( aiocbpp, user_addr_t *, (uap->nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK );
1085 if ( aiocbpp == NULL ) {
1086 call_result = EAGAIN;
1087 goto ExitRoutine;
1088 }
1089
1090 /* copyin our aiocb pointers from list */
1091 result = copyin( uap->aiocblist, aiocbpp,
1092 IS_64BIT_PROCESS(p) ? (uap->nent * sizeof(user_addr_t))
1093 : (uap->nent * sizeof(uintptr_t)) );
1094 if ( result != 0 ) {
1095 call_result = EAGAIN;
1096 goto ExitRoutine;
1097 }
1098
1099 /* we depend on a list of user_addr_t's so we need to munge and expand */
1100 /* when these pointers came from a 32-bit process */
1101 if ( !IS_64BIT_PROCESS(p) && sizeof(uintptr_t) < sizeof(user_addr_t) ) {
1102 /* position to the last entry and work back from there */
1103 uintptr_t *my_ptrp = ((uintptr_t *)aiocbpp) + (uap->nent - 1);
1104 user_addr_t *my_addrp = aiocbpp + (uap->nent - 1);
1105 for (i = 0; i < uap->nent; i++, my_ptrp--, my_addrp--) {
1106 *my_addrp = (user_addr_t) (*my_ptrp);
1107 }
1108 }
1109
1110 /* process list of aio requests */
1111 for ( i = 0; i < uap->nent; i++ ) {
1112 user_addr_t my_aiocbp;
1113
1114 *(entryp_listp + i) = NULL;
1115 my_aiocbp = *(aiocbpp + i);
1116
1117 /* NULL elements are legal so check for 'em */
1118 if ( my_aiocbp == USER_ADDR_NULL )
1119 continue;
1120
1121 if ( uap->mode == LIO_NOWAIT )
1122 result = lio_create_async_entry( p, my_aiocbp, uap->sigp,
1123 group_tag, (entryp_listp + i) );
1124 else
1125 result = lio_create_sync_entry( p, my_aiocbp, group_tag,
1126 (entryp_listp + i) );
1127
1128 if ( result != 0 && call_result == -1 )
1129 call_result = result;
1130 }
1131
1132 /*
1133 * we need to protect this section since we do not want any of these grouped
1134 * IO requests to begin until we have them all on the queue.
1135 */
1136 AIO_LOCK;
1137 for ( i = 0; i < uap->nent; i++ ) {
1138 aio_workq_entry *entryp;
1139
1140 /* NULL elements are legal so check for 'em */
1141 entryp = *(entryp_listp + i);
1142 if ( entryp == NULL )
1143 continue;
1144
1145 /* check our aio limits to throttle bad or rude user land behavior */
1146 if ( aio_get_all_queues_count( ) >= aio_max_requests ||
1147 aio_get_process_count( entryp->procp ) >= aio_max_requests_per_process ||
1148 is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
1149 vm_map_t my_map;
1150
1151 my_map = entryp->aio_map;
1152 entryp->aio_map = VM_MAP_NULL;
1153 if ( call_result == -1 )
1154 call_result = EAGAIN;
1155 AIO_UNLOCK;
1156 aio_free_request( entryp, my_map );
1157 AIO_LOCK;
1158 continue;
1159 }
1160
1161 /* place the request on the appropriate queue */
1162 if ( uap->mode == LIO_NOWAIT ) {
1163 TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1164 aio_anchor.aio_async_workq_count++;
1165
1166 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
1167 (int)p, (int)entryp->uaiocbp, 0, 0, 0 );
1168 }
1169 else {
1170 TAILQ_INSERT_TAIL( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
1171 aio_anchor.lio_sync_workq_count++;
1172 }
1173 }
1174
1175 if ( uap->mode == LIO_NOWAIT ) {
1176 /* caller does not want to wait so we'll fire off a worker thread and return */
1177 wakeup_one( (caddr_t) &aio_anchor.aio_async_workq );
1178 }
1179 else {
1180 aio_workq_entry *entryp;
1181 int error;
1182
1183 /*
1184 * mode is LIO_WAIT - handle the IO requests now.
1185 */
1186 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
1187 while ( entryp != NULL ) {
1188 if ( p == entryp->procp && group_tag == entryp->group_tag ) {
1189
1190 TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
1191 aio_anchor.lio_sync_workq_count--;
1192 AIO_UNLOCK;
1193
1194 if ( (entryp->flags & AIO_READ) != 0 ) {
1195 error = do_aio_read( entryp );
1196 }
1197 else if ( (entryp->flags & AIO_WRITE) != 0 ) {
1198 error = do_aio_write( entryp );
1199 }
1200 else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1201 error = do_aio_fsync( entryp );
1202 }
1203 else {
1204 printf( "%s - unknown aio request - flags 0x%02X \n",
1205 __FUNCTION__, entryp->flags );
1206 error = EINVAL;
1207 }
1208 entryp->errorval = error;
1209 if ( error != 0 && call_result == -1 )
1210 call_result = EIO;
1211
1212 AIO_LOCK;
1213 /* we're done with the IO request so move it on the done queue */
1214 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
1215 aio_anchor.aio_done_count++;
1216 p->aio_done_count++;
1217
1218 /* need to start over since lio_sync_workq may have been changed while we */
1219 /* were away doing the IO. */
1220 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
1221 continue;
1222 } /* p == entryp->procp */
1223
1224 entryp = TAILQ_NEXT( entryp, aio_workq_link );
1225 } /* while ( entryp != NULL ) */
1226 } /* uap->mode == LIO_WAIT */
1227 AIO_UNLOCK;
1228
1229 /* call_result == -1 means we had no trouble queueing up requests */
1230 if ( call_result == -1 ) {
1231 call_result = 0;
1232 *retval = 0;
1233 }
1234
1235 ExitRoutine:
1236 if ( entryp_listp != NULL )
1237 FREE( entryp_listp, M_TEMP );
1238 if ( aiocbpp != NULL )
1239 FREE( aiocbpp, M_TEMP );
1240
1241 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END,
1242 (int)p, call_result, 0, 0, 0 );
1243
1244 return( call_result );
1245
1246 } /* lio_listio */
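
/*
 * Illustrative user-space sketch of submitting a batch through the code above.
 * With LIO_WAIT the requests are run synchronously off the lio_sync_workq
 * before the call returns; with LIO_NOWAIT they are queued on aio_async_workq
 * and the optional sigevent fires only after the whole group (tracked by
 * group_tag) has completed.  The fd, inbuf and outbuf names are assumptions
 * made for the example.
 *
 *	struct aiocb rd, wr;
 *	struct aiocb *batch[2] = { &rd, &wr };
 *
 *	memset( &rd, 0, sizeof(rd) );
 *	rd.aio_fildes = fd;
 *	rd.aio_buf = inbuf;
 *	rd.aio_nbytes = sizeof(inbuf);
 *	rd.aio_lio_opcode = LIO_READ;
 *
 *	memset( &wr, 0, sizeof(wr) );
 *	wr.aio_fildes = fd;
 *	wr.aio_buf = outbuf;
 *	wr.aio_nbytes = sizeof(outbuf);
 *	wr.aio_offset = 4096;
 *	wr.aio_lio_opcode = LIO_WRITE;
 *
 *	if ( lio_listio( LIO_WAIT, batch, 2, NULL ) == 0 ) {
 *		(void) aio_return( &rd );
 *		(void) aio_return( &wr );
 *	}
 */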
1247
1248
1249 /*
1250 * aio worker thread. this is where all the real work gets done.
1251 * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
1252 * after new work is queued up.
1253 */
1254
1255 static void
1256 aio_work_thread( void )
1257 {
1258 aio_workq_entry *entryp;
1259
1260 for( ;; ) {
1261 AIO_LOCK;
1262 entryp = aio_get_some_work();
1263 if ( entryp == NULL ) {
1264 /*
1265 * aio worker threads wait for some work to get queued up
1266 * by aio_queue_async_request. Once some work gets queued
1267 * it will wake up one of these worker threads just before
1268 * returning to our caller in user land.
1269 */
1270 assert_wait( (event_t) &aio_anchor.aio_async_workq, THREAD_UNINT );
1271 AIO_UNLOCK;
1272
1273 thread_block( (thread_continue_t)aio_work_thread );
1274 /* NOT REACHED */
1275 }
1276 else {
1277 int error;
1278 vm_map_t currentmap;
1279 vm_map_t oldmap = VM_MAP_NULL;
1280 task_t oldaiotask = TASK_NULL;
1281 struct uthread *uthreadp = NULL;
1282
1283 AIO_UNLOCK;
1284
1285 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START,
1286 (int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0 );
1287
1288 /*
1289 * Assume the target's address space identity for the duration
1290 * of the IO.
1291 */
1292 currentmap = get_task_map( (current_proc())->task );
1293 if ( currentmap != entryp->aio_map ) {
1294 uthreadp = (struct uthread *) get_bsdthread_info(current_thread());
1295 oldaiotask = uthreadp->uu_aio_task;
1296 uthreadp->uu_aio_task = entryp->procp->task;
1297 oldmap = vm_map_switch( entryp->aio_map );
1298 }
1299
1300 if ( (entryp->flags & AIO_READ) != 0 ) {
1301 error = do_aio_read( entryp );
1302 }
1303 else if ( (entryp->flags & AIO_WRITE) != 0 ) {
1304 error = do_aio_write( entryp );
1305 }
1306 else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1307 error = do_aio_fsync( entryp );
1308 }
1309 else {
1310 printf( "%s - unknown aio request - flags 0x%02X \n",
1311 __FUNCTION__, entryp->flags );
1312 error = EINVAL;
1313 }
1314 entryp->errorval = error;
1315 if ( currentmap != entryp->aio_map ) {
1316 (void) vm_map_switch( oldmap );
1317 uthreadp->uu_aio_task = oldaiotask;
1318 }
1319
1320 /* we're done with the IO request so pop it off the active queue and */
1321 /* push it on the done queue */
1322 AIO_LOCK;
1323 TAILQ_REMOVE( &entryp->procp->aio_activeq, entryp, aio_workq_link );
1324 aio_anchor.aio_active_count--;
1325 entryp->procp->aio_active_count--;
1326 TAILQ_INSERT_TAIL( &entryp->procp->aio_doneq, entryp, aio_workq_link );
1327 aio_anchor.aio_done_count++;
1328 entryp->procp->aio_done_count++;
1329 entryp->flags |= AIO_COMPLETION;
1330
1331 /* remove our reference to the user land map. */
1332 if ( VM_MAP_NULL != entryp->aio_map ) {
1333 vm_map_t my_map;
1334
1335 my_map = entryp->aio_map;
1336 entryp->aio_map = VM_MAP_NULL;
1337 AIO_UNLOCK; /* must unlock before calling vm_map_deallocate() */
1338 vm_map_deallocate( my_map );
1339 }
1340 else {
1341 AIO_UNLOCK;
1342 }
1343
1344 do_aio_completion( entryp );
1345
1346 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END,
1347 (int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval,
1348 entryp->returnval, 0 );
1349
1350 AIO_LOCK;
1351 entryp->flags &= ~AIO_COMPLETION;
1352 if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
1353 vm_map_t my_map;
1354
1355 my_map = entryp->aio_map;
1356 entryp->aio_map = VM_MAP_NULL;
1357 AIO_UNLOCK;
1358 aio_free_request( entryp, my_map );
1359 }
1360 else
1361 AIO_UNLOCK;
1362 }
1363 } /* for ( ;; ) */
1364
1365 /* NOT REACHED */
1366
1367 } /* aio_work_thread */
1368
1369
1370 /*
1371 * aio_get_some_work - get the next async IO request that is ready to be executed.
1372 * aio_fsync complicates matters a bit since we cannot do the fsync until all async
1373 * IO requests queued at the time the aio_fsync call came in have completed.
1374 * NOTE - AIO_LOCK must be held by caller
1375 */
1376
1377 static aio_workq_entry *
1378 aio_get_some_work( void )
1379 {
1380 aio_workq_entry *entryp;
1381
1382 /* pop some work off the work queue and add to our active queue */
1383 for ( entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
1384 entryp != NULL;
1385 entryp = TAILQ_NEXT( entryp, aio_workq_link ) ) {
1386
1387 if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1388 /* leave aio_fsync calls on the work queue if there are IO */
1389 /* requests on the active queue for the same file descriptor. */
1390 if ( aio_delay_fsync_request( entryp ) ) {
1391
1392 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) | DBG_FUNC_NONE,
1393 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1394 continue;
1395 }
1396 }
1397 break;
1398 }
1399
1400 if ( entryp != NULL ) {
1401 TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1402 aio_anchor.aio_async_workq_count--;
1403 TAILQ_INSERT_TAIL( &entryp->procp->aio_activeq, entryp, aio_workq_link );
1404 aio_anchor.aio_active_count++;
1405 entryp->procp->aio_active_count++;
1406 }
1407
1408 return( entryp );
1409
1410 } /* aio_get_some_work */
1411
1412
1413 /*
1414 * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed at
1415 * this time. Delay will happen when there are any active IOs for the same file
1416 * descriptor that were queued at the time the aio_fsync call was queued.
1417 * NOTE - AIO_LOCK must be held by caller
1418 */
1419 static boolean_t
1420 aio_delay_fsync_request( aio_workq_entry *entryp )
1421 {
1422 aio_workq_entry *my_entryp;
1423
1424 TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1425 if ( my_entryp->fsyncp != USER_ADDR_NULL &&
1426 entryp->uaiocbp == my_entryp->fsyncp &&
1427 entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1428 return( TRUE );
1429 }
1430 }
1431
1432 return( FALSE );
1433
1434 } /* aio_delay_fsync_request */
1435
1436
1437 /*
1438 * aio_queue_async_request - queue up an async IO request on our work queue then
1439 * wake up one of our worker threads to do the actual work. We get a reference
1440 * to our caller's user land map in order to keep it around while we are
1441 * processing the request.
1442 */
1443
1444 static int
1445 aio_queue_async_request( struct proc *procp, user_addr_t aiocbp, int kindOfIO )
1446 {
1447 aio_workq_entry *entryp;
1448 int result;
1449
1450 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1451 if ( entryp == NULL ) {
1452 result = EAGAIN;
1453 goto error_exit;
1454 }
1455 bzero( entryp, sizeof(*entryp) );
1456
1457 /* fill in the rest of the aio_workq_entry */
1458 entryp->procp = procp;
1459 entryp->uaiocbp = aiocbp;
1460 entryp->flags |= kindOfIO;
1461 entryp->aio_map = VM_MAP_NULL;
1462
1463 if ( !IS_64BIT_PROCESS(procp) ) {
1464 struct aiocb aiocb32;
1465
1466 result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
1467 if ( result == 0 )
1468 do_munge_aiocb( &aiocb32, &entryp->aiocb );
1469 } else
1470 result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1471
1472 if ( result != 0 ) {
1473 result = EAGAIN;
1474 goto error_exit;
1475 }
1476
1477 /* do some more validation on the aiocb and embedded file descriptor */
1478 result = aio_validate( entryp );
1479 if ( result != 0 )
1480 goto error_exit;
1481
1482 /* get a reference to the user land map in order to keep it around */
1483 entryp->aio_map = get_task_map( procp->task );
1484 vm_map_reference( entryp->aio_map );
1485
1486 AIO_LOCK;
1487
1488 if ( is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
1489 AIO_UNLOCK;
1490 result = EAGAIN;
1491 goto error_exit;
1492 }
1493
1494 /* check our aio limits to throttle bad or rude user land behavior */
1495 if ( aio_get_all_queues_count( ) >= aio_max_requests ||
1496 aio_get_process_count( procp ) >= aio_max_requests_per_process ) {
1497 AIO_UNLOCK;
1498 result = EAGAIN;
1499 goto error_exit;
1500 }
1501
1502 /*
1503 * aio_fsync calls sync up all async IO requests queued at the time
1504 * the aio_fsync call was made. So we mark each currently queued async
1505 * IO with a matching file descriptor as one that must complete before we do the
1506 * fsync. We set the fsyncp field of each matching async IO
1507 * request with the aiocb pointer passed in on the aio_fsync call to
1508 * know which IOs must complete before we process the aio_fsync call.
1509 */
1510 if ( (kindOfIO & AIO_FSYNC) != 0 )
1511 aio_mark_requests( entryp );
1512
1513 /* queue up on our aio asynchronous work queue */
1514 TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1515 aio_anchor.aio_async_workq_count++;
1516
1517 wakeup_one( (caddr_t) &aio_anchor.aio_async_workq );
1518 AIO_UNLOCK;
1519
1520 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
1521 (int)procp, (int)aiocbp, 0, 0, 0 );
1522
1523 return( 0 );
1524
1525 error_exit:
1526 if ( entryp != NULL ) {
1527 /* this entry has not been queued up so no worries about unlocked */
1528 /* state and aio_map */
1529 aio_free_request( entryp, entryp->aio_map );
1530 }
1531
1532 return( result );
1533
1534 } /* aio_queue_async_request */
1535
1536
1537 /*
1538 * lio_create_async_entry - allocate an aio_workq_entry and fill it in.
1539 * If all goes well return 0 and pass the aio_workq_entry pointer back to
1540 * our caller. We get a reference to our caller's user land map in order to keep
1541 * it around while we are processing the request.
1542 * lio_listio calls behave differently at completion: they do completion notification
1543 * when all async IO requests have completed. We use group_tag to tag IO requests
1544 * that behave in this delayed notification manner.
1545 */
1546
1547 static int
1548 lio_create_async_entry( struct proc *procp, user_addr_t aiocbp,
1549 user_addr_t sigp, long group_tag,
1550 aio_workq_entry **entrypp )
1551 {
1552 aio_workq_entry *entryp;
1553 int result;
1554
1555 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1556 if ( entryp == NULL ) {
1557 result = EAGAIN;
1558 goto error_exit;
1559 }
1560 bzero( entryp, sizeof(*entryp) );
1561
1562 /* fill in the rest of the aio_workq_entry */
1563 entryp->procp = procp;
1564 entryp->uaiocbp = aiocbp;
1565 entryp->flags |= AIO_LIO;
1566 entryp->group_tag = group_tag;
1567 entryp->aio_map = VM_MAP_NULL;
1568
1569 if ( !IS_64BIT_PROCESS(procp) ) {
1570 struct aiocb aiocb32;
1571
1572 result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
1573 if ( result == 0 )
1574 do_munge_aiocb( &aiocb32, &entryp->aiocb );
1575 } else
1576 result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1577
1578 if ( result != 0 ) {
1579 result = EAGAIN;
1580 goto error_exit;
1581 }
1582
1583 /* look for lio_listio LIO_NOP requests and ignore them. */
1584 /* Not really an error, but we need to free our aio_workq_entry. */
1585 if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
1586 result = 0;
1587 goto error_exit;
1588 }
1589
1590 /* use sigevent passed in to lio_listio for each of our calls, but only */
1591 /* do completion notification after the last request completes. */
1592 if ( sigp != USER_ADDR_NULL ) {
1593 if ( !IS_64BIT_PROCESS(procp) ) {
1594 struct sigevent sigevent32;
1595
1596 result = copyin( sigp, &sigevent32, sizeof(sigevent32) );
1597 if ( result == 0 ) {
1598 /* also need to munge aio_sigevent since it contains pointers */
1599 /* special case here. since we do not know if sigev_value is an */
1600 /* int or a ptr we do NOT cast the ptr to a user_addr_t. This */
1601 /* means if we send this info back to user space we need to remember */
1602 /* sigev_value was not expanded for the 32-bit case. */
1603 /* NOTE - this does NOT affect us since we don't support sigev_value */
1604 /* yet in the aio context. */
1605 //LP64
1606 entryp->aiocb.aio_sigevent.sigev_notify = sigevent32.sigev_notify;
1607 entryp->aiocb.aio_sigevent.sigev_signo = sigevent32.sigev_signo;
1608 entryp->aiocb.aio_sigevent.sigev_value.size_equivalent.sival_int =
1609 sigevent32.sigev_value.sival_int;
1610 entryp->aiocb.aio_sigevent.sigev_notify_function =
1611 CAST_USER_ADDR_T(sigevent32.sigev_notify_function);
1612 entryp->aiocb.aio_sigevent.sigev_notify_attributes =
1613 CAST_USER_ADDR_T(sigevent32.sigev_notify_attributes);
1614 }
1615 } else
1616 result = copyin( sigp, &entryp->aiocb.aio_sigevent, sizeof(entryp->aiocb.aio_sigevent) );
1617
1618 if ( result != 0 ) {
1619 result = EAGAIN;
1620 goto error_exit;
1621 }
1622 }
1623
1624 /* do some more validation on the aiocb and embedded file descriptor */
1625 result = aio_validate( entryp );
1626 if ( result != 0 )
1627 goto error_exit;
1628
1629 /* get a reference to the user land map in order to keep it around */
1630 entryp->aio_map = get_task_map( procp->task );
1631 vm_map_reference( entryp->aio_map );
1632
1633 *entrypp = entryp;
1634 return( 0 );
1635
1636 error_exit:
1637 if ( entryp != NULL )
1638 zfree( aio_workq_zonep, entryp );
1639
1640 return( result );
1641
1642 } /* lio_create_async_entry */
1643
1644
1645 /*
1646 * aio_mark_requests - aio_fsync calls synchronize file data for all queued async IO
1647 * requests at the moment the aio_fsync call is queued. We use aio_workq_entry.fsyncp
1648 * to mark each async IO that must complete before the fsync is done. We use the uaiocbp
1649 * field from the aio_fsync call as the aio_workq_entry.fsyncp in marked requests.
1650 * NOTE - AIO_LOCK must be held by caller
1651 */
1652
1653 static void
1654 aio_mark_requests( aio_workq_entry *entryp )
1655 {
1656 aio_workq_entry *my_entryp;
1657
1658 TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1659 if ( entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1660 my_entryp->fsyncp = entryp->uaiocbp;
1661 }
1662 }
1663
1664 TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
1665 if ( entryp->procp == my_entryp->procp &&
1666 entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1667 my_entryp->fsyncp = entryp->uaiocbp;
1668 }
1669 }
1670
1671 } /* aio_mark_requests */
1672
1673
1674 /*
1675 * lio_create_sync_entry - allocate an aio_workq_entry and fill it in.
1676 * If all goes well return 0 and pass the aio_workq_entry pointer back to
1677 * our caller.
1678 * lio_listio calls behave differently at completion: they do completion notification
1679 * when all async IO requests have completed. We use group_tag to tag IO requests
1680 * that behave in this delayed notification manner.
1681 */
1682
1683 static int
1684 lio_create_sync_entry( struct proc *procp, user_addr_t aiocbp,
1685 long group_tag, aio_workq_entry **entrypp )
1686 {
1687 aio_workq_entry *entryp;
1688 int result;
1689
1690 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1691 if ( entryp == NULL ) {
1692 result = EAGAIN;
1693 goto error_exit;
1694 }
1695 bzero( entryp, sizeof(*entryp) );
1696
1697 /* fill in the rest of the aio_workq_entry */
1698 entryp->procp = procp;
1699 entryp->uaiocbp = aiocbp;
1700 entryp->flags |= AIO_LIO;
1701 entryp->group_tag = group_tag;
1702 entryp->aio_map = VM_MAP_NULL;
1703
1704 if ( !IS_64BIT_PROCESS(procp) ) {
1705 struct aiocb aiocb32;
1706
1707 result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
1708 if ( result == 0 )
1709 do_munge_aiocb( &aiocb32, &entryp->aiocb );
1710 } else
1711 result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1712
1713 if ( result != 0 ) {
1714 result = EAGAIN;
1715 goto error_exit;
1716 }
1717
1718 /* look for lio_listio LIO_NOP requests and ignore them. */
1719 /* Not really an error, but we need to free our aio_workq_entry. */
1720 if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
1721 result = 0;
1722 goto error_exit;
1723 }
1724
1725 result = aio_validate( entryp );
1726 if ( result != 0 ) {
1727 goto error_exit;
1728 }
1729
1730 *entrypp = entryp;
1731 return( 0 );
1732
1733 error_exit:
1734 if ( entryp != NULL )
1735 zfree( aio_workq_zonep, entryp );
1736
1737 return( result );
1738
1739 } /* lio_create_sync_entry */
1740
1741
1742 /*
1743 * aio_free_request - remove our reference on the user land map and
1744 * free the work queue entry resources.
1745 * We are not holding the lock here; thus aio_map is passed in, having been
1746 * zeroed while we did hold the lock.
1747 */
1748
1749 static int
1750 aio_free_request( aio_workq_entry *entryp, vm_map_t the_map )
1751 {
1752 /* remove our reference to the user land map. */
1753 if ( VM_MAP_NULL != the_map ) {
1754 vm_map_deallocate( the_map );
1755 }
1756
1757 zfree( aio_workq_zonep, entryp );
1758
1759 return( 0 );
1760
1761 } /* aio_free_request */
1762
1763
1764 /* aio_validate - validate the aiocb passed in by one of the aio syscalls.
1765 */
1766
1767 static int
1768 aio_validate( aio_workq_entry *entryp )
1769 {
1770 struct fileproc *fp;
1771 int flag;
1772 int result;
1773
1774 result = 0;
1775
1776 if ( (entryp->flags & AIO_LIO) != 0 ) {
1777 if ( entryp->aiocb.aio_lio_opcode == LIO_READ )
1778 entryp->flags |= AIO_READ;
1779 else if ( entryp->aiocb.aio_lio_opcode == LIO_WRITE )
1780 entryp->flags |= AIO_WRITE;
1781 else if ( entryp->aiocb.aio_lio_opcode == LIO_NOP )
1782 return( 0 );
1783 else
1784 return( EINVAL );
1785 }
1786
1787 flag = FREAD;
1788 if ( (entryp->flags & (AIO_WRITE | AIO_FSYNC)) != 0 ) {
1789 flag = FWRITE;
1790 }
1791
1792 if ( (entryp->flags & (AIO_READ | AIO_WRITE)) != 0 ) {
1793 // LP64todo - does max value for aio_nbytes need to grow?
1794 if ( entryp->aiocb.aio_nbytes > INT_MAX ||
1795 entryp->aiocb.aio_buf == USER_ADDR_NULL ||
1796 entryp->aiocb.aio_offset < 0 )
1797 return( EINVAL );
1798 }
1799
1800 /* validate aiocb.aio_sigevent. At this point we only support sigev_notify
1801 * equal to SIGEV_SIGNAL or SIGEV_NONE. This means sigev_value,
1802 * sigev_notify_function, and sigev_notify_attributes are ignored.
1803 */
1804 if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ) {
1805 int signum;
1806 /* make sure we have a valid signal number */
1807 signum = entryp->aiocb.aio_sigevent.sigev_signo;
1808 if ( signum <= 0 || signum >= NSIG ||
1809 signum == SIGKILL || signum == SIGSTOP )
1810 return (EINVAL);
1811 }
1812 else if ( entryp->aiocb.aio_sigevent.sigev_notify != SIGEV_NONE )
1813 return (EINVAL);
1814
1815 /* validate the file descriptor and that the file was opened
1816 * for the appropriate read / write access.
1817 */
1818 proc_fdlock(entryp->procp);
1819
1820 result = fp_lookup( entryp->procp, entryp->aiocb.aio_fildes, &fp , 1);
1821 if ( result == 0 ) {
1822 if ( (fp->f_fglob->fg_flag & flag) == 0 ) {
1823 /* we don't have read or write access */
1824 result = EBADF;
1825 }
1826 else if ( fp->f_fglob->fg_type != DTYPE_VNODE ) {
1827 /* this is not a file */
1828 result = ESPIPE;
1829 } else
1830 fp->f_flags |= FP_AIOISSUED;
1831
1832 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp , 1);
1833 }
1834 else {
1835 result = EBADF;
1836 }
1837
1838 proc_fdunlock(entryp->procp);
1839
1840 return( result );
1841
1842 } /* aio_validate */
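/*
 * Illustrative user-space sketch (not part of this kernel source): an aiocb
 * set up to pass the checks in aio_validate() above - a descriptor opened
 * for reading, a non-NULL buffer, a byte count no larger than INT_MAX, a
 * non-negative offset, and sigev_notify limited to SIGEV_SIGNAL or
 * SIGEV_NONE.  The path and signal choice are hypothetical; the block is
 * kept under #if 0 so it is never compiled into the kernel.
 */
#if 0
#include <aio.h>
#include <fcntl.h>
#include <signal.h>
#include <string.h>

static int
example_queue_read( const char *path )
{
	static char		buf[8192];
	static struct aiocb	cb;
	int			fd;

	fd = open( path, O_RDONLY );
	if ( fd < 0 )
		return( -1 );

	memset( &cb, 0, sizeof(cb) );
	cb.aio_fildes = fd;		/* must be open with read access */
	cb.aio_buf = buf;		/* must not be NULL */
	cb.aio_nbytes = sizeof(buf);	/* must not exceed INT_MAX */
	cb.aio_offset = 0;		/* must not be negative */

	/* a catchable signal; SIGKILL or SIGSTOP would draw EINVAL */
	cb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
	cb.aio_sigevent.sigev_signo = SIGUSR1;

	return( aio_read( &cb ) );
}
#endif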
1843
1844
1845 /*
1846 * aio_get_process_count - runs through our queues that hold outstanding
1847 * async IO requests and totals up the number of requests for the given
1848 * process.
1849 * NOTE - caller must hold aio lock!
1850 */
1851
1852 static int
1853 aio_get_process_count( struct proc *procp )
1854 {
1855 aio_workq_entry *entryp;
1856 int count;
1857
1858 /* begin with count of completed async IO requests for this process */
1859 count = procp->aio_done_count;
1860
1861 /* add in count of active async IO requests for this process */
1862 count += procp->aio_active_count;
1863
1864 /* look for matches on our queue of asynchronous todo work */
1865 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
1866 if ( procp == entryp->procp ) {
1867 count++;
1868 }
1869 }
1870
1871 /* look for matches on our queue of synchronous todo work */
1872 TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
1873 if ( procp == entryp->procp ) {
1874 count++;
1875 }
1876 }
1877
1878 return( count );
1879
1880 } /* aio_get_process_count */
1881
1882
1883 /*
1884 * aio_get_all_queues_count - get the total number of entries on all aio work queues.
1885 * NOTE - caller must hold aio lock!
1886 */
1887
1888 static int
1889 aio_get_all_queues_count( void )
1890 {
1891 int count;
1892
1893 count = aio_anchor.aio_async_workq_count;
1894 count += aio_anchor.lio_sync_workq_count;
1895 count += aio_anchor.aio_active_count;
1896 count += aio_anchor.aio_done_count;
1897
1898 return( count );
1899
1900 } /* aio_get_all_queues_count */
1901
1902
1903 /*
1904 * do_aio_completion. Handle async IO completion.
1905 */
1906
1907 static void
1908 do_aio_completion( aio_workq_entry *entryp )
1909 {
1910 /* signal user land process if appropriate */
1911 if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
1912 (entryp->flags & AIO_DISABLE) == 0 ) {
1913
1914 /*
1915 * if group_tag is non-zero then make sure this is the last IO request
1916 * in the group before we signal.
1917 */
1918 if ( entryp->group_tag == 0 ||
1919 (entryp->group_tag != 0 && aio_last_group_io( entryp )) ) {
1920 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) | DBG_FUNC_NONE,
1921 (int)entryp->procp, (int)entryp->uaiocbp,
1922 entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 );
1923
1924 psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );
1925 return;
1926 }
1927 }
1928
1929 /*
1930 * need to handle case where a process is trying to exit, exec, or close
1931 * and is currently waiting for active aio requests to complete. If
1932 * AIO_WAITING is set then we need to look to see if there are any
1933 * other requests in the active queue for this process. If there are
1934 * none then wake up using the AIO_CLEANUP_SLEEP_CHAN tsleep channel. If
1935 * there are some still active then do nothing - we only want to wake up
1936 * when all active aio requests for the process are complete.
1937 */
1938 if ( (entryp->flags & AIO_WAITING) != 0 ) {
1939 int active_requests;
1940
1941 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
1942 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1943
1944 AIO_LOCK;
1945 active_requests = aio_active_requests_for_process( entryp->procp );
1946 //AIO_UNLOCK;
1947 if ( active_requests < 1 ) {
1948 /* no active aio requests for this process, continue exiting */
1949 wakeup_one( (caddr_t) &entryp->procp->AIO_CLEANUP_SLEEP_CHAN );
1950
1951 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
1952 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1953 }
1954 AIO_UNLOCK;
1955 return;
1956 }
1957
1958 /*
1959 * aio_suspend case when a signal was not requested. In that scenario we
1960 * are sleeping on the AIO_SUSPEND_SLEEP_CHAN channel.
1961 * NOTE - the assumption here is that this wakeup call is inexpensive.
1962 * We really only need to do this when an aio_suspend call is pending.
1963 * If we find the wakeup call should be avoided we could mark the
1964 * async IO requests given in the list provided by aio_suspend and only
1965 * call wakeup for them. If we do mark them we should unmark them after
1966 * the aio_suspend wakes up.
1967 */
1968 AIO_LOCK;
1969 wakeup_one( (caddr_t) &entryp->procp->AIO_SUSPEND_SLEEP_CHAN );
1970 AIO_UNLOCK;
1971
1972 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE,
1973 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1974
1975 return;
1976
1977 } /* do_aio_completion */
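/*
 * Illustrative user-space sketch (not part of this kernel source): the
 * no-signal case handled above - the caller requests SIGEV_NONE, blocks in
 * aio_suspend(), and is released by the wakeup do_aio_completion() posts on
 * AIO_SUSPEND_SLEEP_CHAN.  Descriptor and buffer are hypothetical; the
 * block is kept under #if 0 so it is never compiled into the kernel.
 */
#if 0
#include <aio.h>
#include <errno.h>
#include <string.h>
#include <sys/types.h>

static ssize_t
example_read_and_wait( int fd, char *buf, size_t len )
{
	struct aiocb		cb;
	const struct aiocb	*list[1];

	memset( &cb, 0, sizeof(cb) );
	cb.aio_fildes = fd;
	cb.aio_buf = buf;
	cb.aio_nbytes = len;
	cb.aio_offset = 0;
	cb.aio_sigevent.sigev_notify = SIGEV_NONE;	/* no completion signal */

	if ( aio_read( &cb ) != 0 )
		return( -1 );

	list[0] = &cb;
	while ( aio_error( &cb ) == EINPROGRESS ) {
		/* sleeps until the completion wakeup fires */
		(void) aio_suspend( list, 1, NULL );
	}

	return( aio_return( &cb ) );
}
#endif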
1978
1979
1980 /*
1981 * aio_last_group_io - checks to see if this is the last unfinished IO request
1982 * for the given group_tag. Returns TRUE if there are no other active IO
1983 * requests for this group or FALSE if there are other active IO requests.
1984 * NOTE - AIO_LOCK must be held by caller
1985 */
1986
1987 static boolean_t
1988 aio_last_group_io( aio_workq_entry *entryp )
1989 {
1990 aio_workq_entry *my_entryp;
1991
1992 /* look for matches on our queue of active async IO requests */
1993 TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1994 if ( my_entryp->group_tag == entryp->group_tag )
1995 return( FALSE );
1996 }
1997
1998 /* look for matches on our queue of asynchronous todo work */
1999 TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
2000 if ( my_entryp->group_tag == entryp->group_tag )
2001 return( FALSE );
2002 }
2003
2004 /* look for matches on our queue of synchronous todo work */
2005 TAILQ_FOREACH( my_entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
2006 if ( my_entryp->group_tag == entryp->group_tag )
2007 return( FALSE );
2008 }
2009
2010 return( TRUE );
2011
2012 } /* aio_last_group_io */
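/*
 * Illustrative user-space sketch (not part of this kernel source): the
 * LIO_NOWAIT case that gives rise to the group_tag bookkeeping above - one
 * signal is delivered only once the last request in the submitted group has
 * completed.  The signal number and buffer sizes are hypothetical; the
 * block is kept under #if 0 so it is never compiled into the kernel.
 */
#if 0
#include <aio.h>
#include <signal.h>
#include <string.h>

static int
example_lio_nowait( int fd )
{
	/* static because the call returns before the IO completes */
	static char		buf_a[4096], buf_b[4096];
	static struct aiocb	cb_a, cb_b;
	struct aiocb		*list[2] = { &cb_a, &cb_b };
	struct sigevent		sev;

	memset( &cb_a, 0, sizeof(cb_a) );
	cb_a.aio_fildes = fd;
	cb_a.aio_buf = buf_a;
	cb_a.aio_nbytes = sizeof(buf_a);
	cb_a.aio_offset = 0;
	cb_a.aio_lio_opcode = LIO_READ;

	memset( &cb_b, 0, sizeof(cb_b) );
	cb_b.aio_fildes = fd;
	cb_b.aio_buf = buf_b;
	cb_b.aio_nbytes = sizeof(buf_b);
	cb_b.aio_offset = sizeof(buf_a);
	cb_b.aio_lio_opcode = LIO_READ;

	/* one SIGUSR1 for the whole group, sent when the last member finishes */
	memset( &sev, 0, sizeof(sev) );
	sev.sigev_notify = SIGEV_SIGNAL;
	sev.sigev_signo = SIGUSR1;

	return( lio_listio( LIO_NOWAIT, list, 2, &sev ) );
}
#endif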
2013
2014
2015 /*
2016 * do_aio_read
2017 */
2018 static int
2019 do_aio_read( aio_workq_entry *entryp )
2020 {
2021 struct fileproc *fp;
2022 int error;
2023
2024 if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp , 0)) )
2025 return(error);
2026 if ( (fp->f_fglob->fg_flag & FREAD) == 0 ) {
2027 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2028 return(EBADF);
2029 }
2030 if ( fp != NULL ) {
2031 error = dofileread( entryp->procp, fp, entryp->aiocb.aio_fildes,
2032 entryp->aiocb.aio_buf,
2033 entryp->aiocb.aio_nbytes,
2034 entryp->aiocb.aio_offset, FOF_OFFSET,
2035 &entryp->returnval );
2036 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2037 }
2038 else {
2039 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2040 error = EBADF;
2041 }
2042
2043 return( error );
2044
2045 } /* do_aio_read */
2046
2047
2048 /*
2049 * do_aio_write
2050 */
2051 static int
2052 do_aio_write( aio_workq_entry *entryp )
2053 {
2054 struct fileproc *fp;
2055 int error;
2056
2057 if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp , 0)) )
2058 return(error);
2059 if ( (fp->f_fglob->fg_flag & FWRITE) == 0 ) {
2060 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2061 return(EBADF);
2062 }
2063 if ( fp != NULL ) {
2064 /* NB: tell dofilewrite the offset, and to use the proc cred */
2065 error = dofilewrite( entryp->procp,
2066 fp,
2067 entryp->aiocb.aio_fildes,
2068 entryp->aiocb.aio_buf,
2069 entryp->aiocb.aio_nbytes,
2070 entryp->aiocb.aio_offset,
2071 FOF_OFFSET | FOF_PCRED,
2072 &entryp->returnval);
2073
2074 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2075 }
2076 else {
2077 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2078 error = EBADF;
2079 }
2080
2081 return( error );
2082
2083 } /* do_aio_write */
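/*
 * Illustrative user-space sketch (not part of this kernel source): queueing
 * an asynchronous write.  As in do_aio_write() above, the IO is performed at
 * aio_offset (FOF_OFFSET) rather than at the descriptor's current file
 * position.  The parameters are hypothetical; the block is kept under #if 0
 * so it is never compiled into the kernel.
 */
#if 0
#include <aio.h>
#include <string.h>
#include <sys/types.h>

static int
example_queue_write( int fd, char *data, size_t len, off_t where )
{
	/* the aiocb and buffer must stay valid until the IO completes */
	static struct aiocb	cb;

	memset( &cb, 0, sizeof(cb) );
	cb.aio_fildes = fd;		/* must be open with write access */
	cb.aio_buf = data;
	cb.aio_nbytes = len;
	cb.aio_offset = where;		/* explicit offset, like FOF_OFFSET */
	cb.aio_sigevent.sigev_notify = SIGEV_NONE;

	return( aio_write( &cb ) );
}
#endif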
2084
2085
2086 /*
2087 * aio_active_requests_for_process - return number of active async IO
2088 * requests for the given process.
2089 * NOTE - caller must hold aio lock!
2090 */
2091
2092 static int
2093 aio_active_requests_for_process( struct proc *procp )
2094 {
2095
2096 return( procp->aio_active_count );
2097
2098 } /* aio_active_requests_for_process */
2099
2100
2101 /*
2102 * do_aio_fsync
2103 */
2104 static int
2105 do_aio_fsync( aio_workq_entry *entryp )
2106 {
2107 struct vfs_context context;
2108 struct vnode *vp;
2109 struct fileproc *fp;
2110 int error;
2111
2112 /*
2113 * NOTE - we will not support AIO_DSYNC until fdatasync() is supported.
2114 * AIO_DSYNC is caught before we queue up a request and flagged as an error.
2115 * The following was shamelessly extracted from the fsync() implementation.
2116 */
2117
2118 error = fp_getfvp( entryp->procp, entryp->aiocb.aio_fildes, &fp, &vp);
2119 if ( error == 0 ) {
2120 if ( (error = vnode_getwithref(vp)) ) {
2121 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2122 entryp->returnval = -1;
2123 return(error);
2124 }
2125 context.vc_proc = entryp->procp;
2126 context.vc_ucred = fp->f_fglob->fg_cred;
2127
2128 error = VNOP_FSYNC( vp, MNT_WAIT, &context);
2129
2130 (void)vnode_put(vp);
2131
2132 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2133 }
2134 if ( error != 0 )
2135 entryp->returnval = -1;
2136
2137 return( error );
2138
2139 } /* do_aio_fsync */
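/*
 * Illustrative user-space sketch (not part of this kernel source): queueing
 * an asynchronous fsync.  Per the note above, only O_SYNC is serviced here;
 * an O_DSYNC request is rejected before anything is queued.  The block is
 * kept under #if 0 so it is never compiled into the kernel.
 */
#if 0
#include <aio.h>
#include <fcntl.h>
#include <string.h>

static int
example_queue_fsync( int fd )
{
	static struct aiocb	cb;

	memset( &cb, 0, sizeof(cb) );
	cb.aio_fildes = fd;		/* must be open with write access */
	cb.aio_sigevent.sigev_notify = SIGEV_NONE;

	/* completion is reaped with aio_error()/aio_return() as usual */
	return( aio_fsync( O_SYNC, &cb ) );
}
#endif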
2140
2141
2142 /*
2143 * is_already_queued - runs through our queues to see if the given
2144 * aiocbp / process is there. Returns TRUE if there is a match
2145 * on any of our aio queues.
2146 * NOTE - callers must hold aio lock!
2147 */
2148
2149 static boolean_t
2150 is_already_queued( struct proc *procp,
2151 user_addr_t aiocbp )
2152 {
2153 aio_workq_entry *entryp;
2154 boolean_t result;
2155
2156 result = FALSE;
2157
2158 /* look for matches on our queue of async IO requests that have completed */
2159 TAILQ_FOREACH( entryp, &procp->aio_doneq, aio_workq_link ) {
2160 if ( aiocbp == entryp->uaiocbp ) {
2161 result = TRUE;
2162 goto ExitThisRoutine;
2163 }
2164 }
2165
2166 /* look for matches on our queue of active async IO requests */
2167 TAILQ_FOREACH( entryp, &procp->aio_activeq, aio_workq_link ) {
2168 if ( aiocbp == entryp->uaiocbp ) {
2169 result = TRUE;
2170 goto ExitThisRoutine;
2171 }
2172 }
2173
2174 /* look for matches on our queue of asynchronous todo work */
2175 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
2176 if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
2177 result = TRUE;
2178 goto ExitThisRoutine;
2179 }
2180 }
2181
2182 /* look for matches on our queue of synchronous todo work */
2183 TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
2184 if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
2185 result = TRUE;
2186 goto ExitThisRoutine;
2187 }
2188 }
2189
2190 ExitThisRoutine:
2191 return( result );
2192
2193 } /* is_already_queued */
2194
2195
2196 /*
2197 * aio initialization
2198 */
2199 __private_extern__ void
2200 aio_init( void )
2201 {
2202 int i;
2203
2204 aio_lock_grp_attr = lck_grp_attr_alloc_init();
2205 aio_lock_grp = lck_grp_alloc_init("aio", aio_lock_grp_attr);
2206 aio_lock_attr = lck_attr_alloc_init();
2207
2208 aio_lock = lck_mtx_alloc_init(aio_lock_grp, aio_lock_attr);
2209
2210 AIO_LOCK;
2211 TAILQ_INIT( &aio_anchor.aio_async_workq );
2212 TAILQ_INIT( &aio_anchor.lio_sync_workq );
2213 aio_anchor.aio_async_workq_count = 0;
2214 aio_anchor.lio_sync_workq_count = 0;
2215 aio_anchor.aio_active_count = 0;
2216 aio_anchor.aio_done_count = 0;
2217 AIO_UNLOCK;
2218
2219 i = sizeof( aio_workq_entry );
2220 aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );
2221
2222 _aio_create_worker_threads( aio_worker_threads );
2223
2224 return;
2225
2226 } /* aio_init */
2227
2228
2229 /*
2230 * aio worker threads are created here.
2231 */
2232 __private_extern__ void
2233 _aio_create_worker_threads( int num )
2234 {
2235 int i;
2236
2237 /* create some worker threads to handle the async IO requests */
2238 for ( i = 0; i < num; i++ ) {
2239 thread_t myThread;
2240
2241 myThread = kernel_thread( kernel_task, aio_work_thread );
2242 if ( THREAD_NULL == myThread ) {
2243 printf( "%s - failed to create a work thread \n", __FUNCTION__ );
2244 }
2245 }
2246
2247 return;
2248
2249 } /* _aio_create_worker_threads */
2250
2251 /*
2252 * Return the current activation utask
2253 */
2254 task_t
2255 get_aiotask(void)
2256 {
2257 return ((struct uthread *)get_bsdthread_info(current_thread()))->uu_aio_task;
2258 }
2259
2260
2261 /*
2262 * In the case of an aiocb from a 32-bit process, we need to expand some
2263 * longs and pointers to the correct sizes in order to let downstream
2264 * code always work on the same type of aiocb (in our case that is a
2265 * user_aiocb).
2266 */
2267 static void
2268 do_munge_aiocb( struct aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
2269 {
2270 the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2271 the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2272 the_user_aiocbp->aio_buf = CAST_USER_ADDR_T(my_aiocbp->aio_buf);
2273 the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2274 the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2275 the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
2276
2277 /* special case here. since we do not know if sigev_value is an */
2278 /* int or a ptr we do NOT cast the ptr to a user_addr_t. This */
2279 /* means if we send this info back to user space we need to remember */
2280 /* sigev_value was not expanded for the 32-bit case. */
2281 /* NOTE - this does NOT affect us since we don't support sigev_value */
2282 /* yet in the aio context. */
2283 //LP64
2284 the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2285 the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
2286 the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2287 my_aiocbp->aio_sigevent.sigev_value.sival_int;
2288 the_user_aiocbp->aio_sigevent.sigev_notify_function =
2289 CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function);
2290 the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2291 CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_attributes);
2292 }