]> git.saurik.com Git - apple/xnu.git/blob - bsd/kern/kern_aio.c
xnu-1228.7.58.tar.gz
[apple/xnu.git] / bsd / kern / kern_aio.c
1 /*
2 * Copyright (c) 2003-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29
30 /*
31 * todo:
32 * 1) ramesh is looking into how to replace taking a reference on
33 * the user's map (vm_map_reference()) since it is believed that
34 * would not hold the process for us.
35 * 2) david is looking into a way for us to set the priority of the
36 * worker threads to match that of the user's thread when the
37 * async IO was queued.
38 */
39
40
41 /*
42 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
43 */
44
45 #include <sys/systm.h>
46 #include <sys/fcntl.h>
47 #include <sys/file_internal.h>
48 #include <sys/filedesc.h>
49 #include <sys/kernel.h>
50 #include <sys/vnode_internal.h>
51 #include <sys/malloc.h>
52 #include <sys/mount_internal.h>
53 #include <sys/param.h>
54 #include <sys/proc_internal.h>
55 #include <sys/sysctl.h>
56 #include <sys/unistd.h>
57 #include <sys/user.h>
58
59 #include <sys/aio_kern.h>
60 #include <sys/sysproto.h>
61
62 #include <machine/limits.h>
63
64 #include <mach/mach_types.h>
65 #include <kern/kern_types.h>
66 #include <kern/zalloc.h>
67 #include <kern/task.h>
68 #include <kern/sched_prim.h>
69
70 #include <vm/vm_map.h>
71
72 #include <sys/kdebug.h>
73 #define AIO_work_queued 1
74 #define AIO_worker_wake 2
75 #define AIO_completion_sig 3
76 #define AIO_completion_cleanup_wait 4
77 #define AIO_completion_cleanup_wake 5
78 #define AIO_completion_suspend_wake 6
79 #define AIO_fsync_delay 7
80 #define AIO_cancel 10
81 #define AIO_cancel_async_workq 11
82 #define AIO_cancel_sync_workq 12
83 #define AIO_cancel_activeq 13
84 #define AIO_cancel_doneq 14
85 #define AIO_fsync 20
86 #define AIO_read 30
87 #define AIO_write 40
88 #define AIO_listio 50
89 #define AIO_error 60
90 #define AIO_error_val 61
91 #define AIO_error_activeq 62
92 #define AIO_error_workq 63
93 #define AIO_return 70
94 #define AIO_return_val 71
95 #define AIO_return_activeq 72
96 #define AIO_return_workq 73
97 #define AIO_exec 80
98 #define AIO_exit 90
99 #define AIO_exit_sleep 91
100 #define AIO_close 100
101 #define AIO_close_sleep 101
102 #define AIO_suspend 110
103 #define AIO_suspend_sleep 111
104 #define AIO_worker_thread 120
105
106 #if 0
107 #undef KERNEL_DEBUG
108 #define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
109 #endif
110
111 /*
112 * aio requests queue up on the aio_async_workq or lio_sync_workq (for
113 * lio_listio LIO_WAIT). Requests then move to the per process aio_activeq
114 * (proc.aio_activeq) when one of our worker threads start the IO.
115 * And finally, requests move to the per process aio_doneq (proc.aio_doneq)
116 * when the IO request completes. The request remains on aio_doneq until
117 * user process calls aio_return or the process exits, either way that is our
118 * trigger to release aio resources.
119 */
/*
 * Global anchor for all async IO state in the system.  The counts mirror
 * the lengths of the corresponding queues; per-process active/done entries
 * live on proc.aio_activeq / proc.aio_doneq, only their totals are kept
 * here.  NOTE(review): all fields appear to be protected by aio_lock
 * (taken via AIO_LOCK / AIO_UNLOCK) — every access in this file holds it.
 */
struct aio_anchor_cb
{
	int		aio_async_workq_count; 	/* entries on aio_async_workq */
	int		lio_sync_workq_count; 	/* entries on lio_sync_workq */
	int		aio_active_count; 	/* entries on all active queues (proc.aio_activeq) */
	int		aio_done_count; 	/* entries on all done queues (proc.aio_doneq) */
	TAILQ_HEAD( , aio_workq_entry ) aio_async_workq;	/* todo queue serviced by worker threads */
	TAILQ_HEAD( , aio_workq_entry ) lio_sync_workq;		/* todo queue for lio_listio( LIO_WAIT ) */
};
typedef struct aio_anchor_cb aio_anchor_cb;
130
131
132 /*
133 * Notes on aio sleep / wake channels.
134 * We currently pick a couple fields within the proc structure that will allow
135 * us sleep channels that currently do not collide with any other kernel routines.
136 * At this time, for binary compatibility reasons, we cannot create new proc fields.
137 */
138 #define AIO_SUSPEND_SLEEP_CHAN aio_active_count
139 #define AIO_CLEANUP_SLEEP_CHAN aio_done_count
140
141
142 /*
143 * aysnc IO locking macros used to protect critical sections.
144 */
145 #define AIO_LOCK lck_mtx_lock(aio_lock)
146 #define AIO_UNLOCK lck_mtx_unlock(aio_lock)
147
148
149 /*
150 * LOCAL PROTOTYPES
151 */
152 static int aio_active_requests_for_process(proc_t procp );
153 static boolean_t aio_delay_fsync_request( aio_workq_entry *entryp );
154 static int aio_free_request( aio_workq_entry *entryp, vm_map_t the_map );
155 static int aio_get_all_queues_count( void );
156 static int aio_get_process_count(proc_t procp );
157 static aio_workq_entry * aio_get_some_work( void );
158 static boolean_t aio_last_group_io( aio_workq_entry *entryp );
159 static void aio_mark_requests( aio_workq_entry *entryp );
160 static int aio_queue_async_request(proc_t procp,
161 user_addr_t aiocbp,
162 int kindOfIO );
163 static int aio_validate( aio_workq_entry *entryp );
164 static void aio_work_thread( void );
165 static int do_aio_cancel(proc_t p,
166 int fd,
167 user_addr_t aiocbp,
168 boolean_t wait_for_completion,
169 boolean_t disable_notification );
170 static void do_aio_completion( aio_workq_entry *entryp );
171 static int do_aio_fsync( aio_workq_entry *entryp );
172 static int do_aio_read( aio_workq_entry *entryp );
173 static int do_aio_write( aio_workq_entry *entryp );
174 static void do_munge_aiocb( struct aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
175 static boolean_t is_already_queued(proc_t procp,
176 user_addr_t aiocbp );
177 static int lio_create_async_entry(proc_t procp,
178 user_addr_t aiocbp,
179 user_addr_t sigp,
180 long group_tag,
181 aio_workq_entry **entrypp );
182 static int lio_create_sync_entry(proc_t procp,
183 user_addr_t aiocbp,
184 long group_tag,
185 aio_workq_entry **entrypp );
186
187
188 /*
189 * EXTERNAL PROTOTYPES
190 */
191
192 /* in ...bsd/kern/sys_generic.c */
193 extern int dofileread(vfs_context_t ctx, struct fileproc *fp,
194 user_addr_t bufp, user_size_t nbyte,
195 off_t offset, int flags, user_ssize_t *retval );
196 extern int dofilewrite(vfs_context_t ctx, struct fileproc *fp,
197 user_addr_t bufp, user_size_t nbyte, off_t offset,
198 int flags, user_ssize_t *retval );
199
200 /*
201 * aio external global variables.
202 */
203 extern int aio_max_requests; /* AIO_MAX - configurable */
204 extern int aio_max_requests_per_process; /* AIO_PROCESS_MAX - configurable */
205 extern int aio_worker_threads; /* AIO_THREAD_COUNT - configurable */
206
207
208 /*
209 * aio static variables.
210 */
211 static aio_anchor_cb aio_anchor;
212 static lck_mtx_t * aio_lock;
213 static lck_grp_t * aio_lock_grp;
214 static lck_attr_t * aio_lock_attr;
215 static lck_grp_attr_t * aio_lock_grp_attr;
216 static struct zone *aio_workq_zonep;
217
218
219
220
221 /*
222 * aio_cancel - attempt to cancel one or more async IO requests currently
223 * outstanding against file descriptor uap->fd. If uap->aiocbp is not
224 * NULL then only one specific IO is cancelled (if possible). If uap->aiocbp
225 * is NULL then all outstanding async IO request for the given file
226 * descriptor are cancelled (if possible).
227 */
228
int
aio_cancel(proc_t p, struct aio_cancel_args *uap, int *retval )
{
	struct user_aiocb		my_aiocb;
	int						result;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START,
		     	  (int)p, (int)uap->aiocbp, 0, 0, 0 );

	/* quick check to see if there are any async IO requests queued up */
	AIO_LOCK;
	result = aio_get_all_queues_count( );
	AIO_UNLOCK;
	if ( result < 1 ) {
		/* nothing queued anywhere in the system - POSIX says AIO_ALLDONE */
		result = 0;
		*retval = AIO_ALLDONE;
		goto ExitRoutine;
	}

	*retval = -1;
	if ( uap->aiocbp != USER_ADDR_NULL ) {
		/*
		 * single-request cancel: copy in the user's aiocb so we can
		 * cross-check its embedded file descriptor.  32-bit callers
		 * need their aiocb munged up to the 64-bit kernel layout.
		 */
		if ( !IS_64BIT_PROCESS(p) ) {
			struct aiocb aiocb32;

			result = copyin( uap->aiocbp, &aiocb32, sizeof(aiocb32) );
			if ( result == 0 )
				do_munge_aiocb( &aiocb32, &my_aiocb );
		} else
			result = copyin( uap->aiocbp, &my_aiocb, sizeof(my_aiocb) );

		if ( result != 0 ) {
			/* copyin failure is reported as EAGAIN, not EFAULT */
			result = EAGAIN;
			goto ExitRoutine;
		}

		/* NOTE - POSIX standard says a mismatch between the file */
		/* descriptor passed in and the file descriptor embedded in */
		/* the aiocb causes unspecified results.  We return EBADF in */
		/* that situation. */
		if ( uap->fd != my_aiocb.aio_fildes ) {
			result = EBADF;
			goto ExitRoutine;
		}
	}
	/* cancel without waiting for completion and with notification enabled */
	result = do_aio_cancel( p, uap->fd, uap->aiocbp, FALSE, FALSE );

	if ( result != -1 ) {
		/* do_aio_cancel found a match; its AIO_* result goes to the user */
		*retval = result;
		result = 0;
		goto ExitRoutine;
	}

	/* no matching request was found on any queue */
	result = EBADF;

ExitRoutine:
	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END,
		     	  (int)p, (int)uap->aiocbp, result, 0, 0 );

	return( result );

} /* aio_cancel */
290
291
292 /*
293 * _aio_close - internal function used to clean up async IO requests for
294 * a file descriptor that is closing.
295 * THIS MAY BLOCK.
296 */
297
298 __private_extern__ void
299 _aio_close(proc_t p, int fd )
300 {
301 int error, count;
302
303 /* quick check to see if there are any async IO requests queued up */
304 AIO_LOCK;
305 count = aio_get_all_queues_count( );
306 AIO_UNLOCK;
307 if ( count < 1 )
308 return;
309
310 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_START,
311 (int)p, fd, 0, 0, 0 );
312
313 /* cancel all async IO requests on our todo queues for this file descriptor */
314 error = do_aio_cancel( p, fd, 0, TRUE, FALSE );
315 if ( error == AIO_NOTCANCELED ) {
316 /*
317 * AIO_NOTCANCELED is returned when we find an aio request for this process
318 * and file descriptor on the active async IO queue. Active requests cannot
319 * be cancelled so we must wait for them to complete. We will get a special
320 * wake up call on our channel used to sleep for ALL active requests to
321 * complete. This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
322 * when we must wait for all active aio requests.
323 */
324
325 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) | DBG_FUNC_NONE,
326 (int)p, fd, 0, 0, 0 );
327
328 tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_close", 0 );
329 }
330
331 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_END,
332 (int)p, fd, 0, 0, 0 );
333
334 return;
335
336 } /* _aio_close */
337
338
339 /*
340 * aio_error - return the error status associated with the async IO
341 * request referred to by uap->aiocbp. The error status is the errno
342 * value that would be set by the corresponding IO request (read, wrtie,
343 * fdatasync, or sync).
344 */
345
/*
 * aio_error - return the error status associated with the async IO
 * request referred to by uap->aiocbp.  The error status is the errno
 * value that would be set by the corresponding IO request (read, write,
 * fdatasync, or sync).
 *
 * Returns 0 with *retval set to the request's error status (or
 * EINPROGRESS for requests still on the active or todo queues), else
 * EINVAL when no matching request is found.
 */
int
aio_error(proc_t p, struct aio_error_args *uap, int *retval )
{
	aio_workq_entry		 		*entryp;
	int							error;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_START,
		     	  (int)p, (int)uap->aiocbp, 0, 0, 0 );

	/* lock is held across ALL queue scans; released at ExitRoutine */
	AIO_LOCK;

	/* quick check to see if there are any async IO requests queued up */
	if ( aio_get_all_queues_count( ) < 1 ) {
		error = EINVAL;
		goto ExitRoutine;
	}

	/* look for a match on our queue of async IO requests that have completed */
	TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
		if ( entryp->uaiocbp == uap->aiocbp ) {
			*retval = entryp->errorval;
			error = 0;
			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) | DBG_FUNC_NONE,
		     	  		  (int)p, (int)uap->aiocbp, *retval, 0, 0 );
			goto ExitRoutine;
		}
	}

	/* look for a match on our queue of active async IO requests */
	TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
		if ( entryp->uaiocbp == uap->aiocbp ) {
			*retval = EINPROGRESS;
			error = 0;
			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) | DBG_FUNC_NONE,
		     	  		  (int)p, (int)uap->aiocbp, *retval, 0, 0 );
			goto ExitRoutine;
		}
	}

	/* look for a match on our queue of todo work - the workq is global,
	 * so also match on the owning process */
	TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
		if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
			*retval = EINPROGRESS;
			error = 0;
			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_workq)) | DBG_FUNC_NONE,
		     	  		  (int)p, (int)uap->aiocbp, *retval, 0, 0 );
			goto ExitRoutine;
		}
	}
	error = EINVAL;

ExitRoutine:
	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_END,
		     	  (int)p, (int)uap->aiocbp, error, 0, 0 );
	AIO_UNLOCK;

	return( error );

} /* aio_error */
405
406
407 /*
408 * aio_fsync - asynchronously force all IO operations associated
409 * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
410 * queued at the time of the call to the synchronized completion state.
411 * NOTE - we do not support op O_DSYNC at this point since we do not support the
412 * fdatasync() call.
413 */
414
415 int
416 aio_fsync(proc_t p, struct aio_fsync_args *uap, int *retval )
417 {
418 int error;
419 int fsync_kind;
420
421 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_START,
422 (int)p, (int)uap->aiocbp, uap->op, 0, 0 );
423
424 *retval = 0;
425 /* 0 := O_SYNC for binary backward compatibility with Panther */
426 if (uap->op == O_SYNC || uap->op == 0)
427 fsync_kind = AIO_FSYNC;
428 #if 0 // we don't support fdatasync() call yet
429 else if ( uap->op == O_DSYNC )
430 fsync_kind = AIO_DSYNC;
431 #endif
432 else {
433 *retval = -1;
434 error = EINVAL;
435 goto ExitRoutine;
436 }
437
438 error = aio_queue_async_request( p, uap->aiocbp, fsync_kind );
439 if ( error != 0 )
440 *retval = -1;
441
442 ExitRoutine:
443 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_END,
444 (int)p, (int)uap->aiocbp, error, 0, 0 );
445
446 return( error );
447
448 } /* aio_fsync */
449
450
451 /* aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
452 * file descriptor (uap->aiocbp->aio_fildes) into the buffer
453 * (uap->aiocbp->aio_buf).
454 */
455
456 int
457 aio_read(proc_t p, struct aio_read_args *uap, int *retval )
458 {
459 int error;
460
461 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_START,
462 (int)p, (int)uap->aiocbp, 0, 0, 0 );
463
464 *retval = 0;
465
466 error = aio_queue_async_request( p, uap->aiocbp, AIO_READ );
467 if ( error != 0 )
468 *retval = -1;
469
470 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_END,
471 (int)p, (int)uap->aiocbp, error, 0, 0 );
472
473 return( error );
474
475 } /* aio_read */
476
477
478 /*
479 * aio_return - return the return status associated with the async IO
480 * request referred to by uap->aiocbp. The return status is the value
481 * that would be returned by corresponding IO request (read, wrtie,
482 * fdatasync, or sync). This is where we release kernel resources
483 * held for async IO call associated with the given aiocb pointer.
484 */
485
/*
 * aio_return - return the return status associated with the async IO
 * request referred to by uap->aiocbp.  The return status is the value
 * that would be returned by corresponding IO request (read, write,
 * fdatasync, or sync).  This is where we release kernel resources
 * held for async IO call associated with the given aiocb pointer.
 */
int
aio_return(proc_t p, struct aio_return_args *uap, user_ssize_t *retval )
{
	aio_workq_entry		 		*entryp;
	int							error;
	boolean_t					lock_held;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_START,
		     	  (int)p, (int)uap->aiocbp, 0, 0, 0 );

	AIO_LOCK;
	/* aio_free_request() must be called unlocked, so track lock state */
	lock_held = TRUE;
	*retval = 0;

	/* quick check to see if there are any async IO requests queued up */
	if ( aio_get_all_queues_count( ) < 1 ) {
		error = EINVAL;
		goto ExitRoutine;
	}

	/* look for a match on our queue of async IO requests that have completed */
	TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
		if ( entryp->uaiocbp == uap->aiocbp ) {
			/* unlink and decrement counts while still holding the lock */
			TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
			aio_anchor.aio_done_count--;
			p->aio_done_count--;

			*retval = entryp->returnval;

			/* we cannot free requests that are still completing */
			if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
				vm_map_t 		my_map;

				/* take ownership of the map reference, then drop the
				 * lock before freeing (free may block) */
				my_map = entryp->aio_map;
				entryp->aio_map = VM_MAP_NULL;
				AIO_UNLOCK;
				lock_held = FALSE;
				aio_free_request( entryp, my_map );
			}
			else
				/* tell completion code to free this request */
				entryp->flags |= AIO_DO_FREE;
			error = 0;
			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) | DBG_FUNC_NONE,
		     	  		  (int)p, (int)uap->aiocbp, *retval, 0, 0 );
			goto ExitRoutine;
		}
	}

	/* look for a match on our queue of active async IO requests */
	TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
		if ( entryp->uaiocbp == uap->aiocbp ) {
			error = EINPROGRESS;
			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) | DBG_FUNC_NONE,
		     	  		  (int)p, (int)uap->aiocbp, *retval, 0, 0 );
			goto ExitRoutine;
		}
	}

	/* look for a match on our queue of todo work (global, so match procp too) */
	TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
		if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
			error = EINPROGRESS;
			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_workq)) | DBG_FUNC_NONE,
		     	  		  (int)p, (int)uap->aiocbp, *retval, 0, 0 );
			goto ExitRoutine;
		}
	}
	error = EINVAL;

ExitRoutine:
	if ( lock_held )
		AIO_UNLOCK;
	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_END,
		     	  (int)p, (int)uap->aiocbp, error, 0, 0 );

	return( error );

} /* aio_return */
565
566
567 /*
568 * _aio_exec - internal function used to clean up async IO requests for
569 * a process that is going away due to exec(). We cancel any async IOs
570 * we can and wait for those already active. We also disable signaling
571 * for cancelled or active aio requests that complete.
572 * This routine MAY block!
573 */
574
575 __private_extern__ void
576 _aio_exec(proc_t p )
577 {
578
579 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_START,
580 (int)p, 0, 0, 0, 0 );
581
582 _aio_exit( p );
583
584 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_END,
585 (int)p, 0, 0, 0, 0 );
586
587 return;
588
589 } /* _aio_exec */
590
591
592 /*
593 * _aio_exit - internal function used to clean up async IO requests for
594 * a process that is terminating (via exit() or exec() ). We cancel any async IOs
595 * we can and wait for those already active. We also disable signaling
596 * for cancelled or active aio requests that complete. This routine MAY block!
597 */
598
/*
 * _aio_exit - internal function used to clean up async IO requests for
 * a process that is terminating (via exit() or exec() ).  We cancel any async IOs
 * we can and wait for those already active.  We also disable signaling
 * for cancelled or active aio requests that complete.  This routine MAY block!
 */
__private_extern__ void
_aio_exit(proc_t p )
{
	int 						error, count;
	aio_workq_entry 			*entryp;

	/* quick check to see if there are any async IO requests queued up */
	AIO_LOCK;
	count = aio_get_all_queues_count( );
	AIO_UNLOCK;
	if ( count < 1 ) {
		return;
	}

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_START,
		     	  (int)p, 0, 0, 0, 0 );

	/*
	 * cancel async IO requests on the todo work queue and wait for those
	 * already active to complete.  fd == 0 and aiocbp == 0 means
	 * "everything belonging to this process"; notification is disabled.
	 */
	error = do_aio_cancel( p, 0, 0, TRUE, TRUE );
	if ( error == AIO_NOTCANCELED ) {
		/*
		 * AIO_NOTCANCELED is returned when we find an aio request for this process
		 * on the active async IO queue.  Active requests cannot be cancelled so we
		 * must wait for them to complete.  We will get a special wake up call on
		 * our channel used to sleep for ALL active requests to complete.  This sleep
		 * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
		 * active aio requests.
		 */

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) | DBG_FUNC_NONE,
		     	  	  (int)p, 0, 0, 0, 0 );

		tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_exit", 0 );
	}

	/* release all aio resources used by this process */
	AIO_LOCK;
	entryp = TAILQ_FIRST( &p->aio_doneq );
	while ( entryp != NULL ) {
		aio_workq_entry 			*next_entryp;

		next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
		TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
		aio_anchor.aio_done_count--;
		p->aio_done_count--;

		/* we cannot free requests that are still completing */
		if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
			vm_map_t 		my_map;

			/* take the map reference, drop the lock, then free -
			 * aio_free_request() may block */
			my_map = entryp->aio_map;
			entryp->aio_map = VM_MAP_NULL;
			AIO_UNLOCK;
			aio_free_request( entryp, my_map );

			/* need to start over since aio_doneq may have been */
			/* changed while we were away.  */
			AIO_LOCK;
			entryp = TAILQ_FIRST( &p->aio_doneq );
			continue;
		}
		else
			/* tell completion code to free this request */
			entryp->flags |= AIO_DO_FREE;
		entryp = next_entryp;
	}
	AIO_UNLOCK;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END,
		     	  (int)p, 0, 0, 0, 0 );

	return;

} /* _aio_exit */
676
677
678 /*
679 * do_aio_cancel - cancel async IO requests (if possible). We get called by
680 * aio_cancel, close, and at exit.
681 * There are three modes of operation: 1) cancel all async IOs for a process -
682 * fd is 0 and aiocbp is NULL 2) cancel all async IOs for file descriptor - fd
683 * is > 0 and aiocbp is NULL 3) cancel one async IO associated with the given
684 * aiocbp.
685 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
686 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
687 * target async IO requests, and AIO_ALLDONE if all target async IO requests
688 * were already complete.
689 * WARNING - do not deference aiocbp in this routine, it may point to user
690 * land data that has not been copied in (when called from aio_cancel() )
691 */
692
/*
 * do_aio_cancel - cancel async IO requests (if possible).  We get called by
 * aio_cancel, close, and at exit.
 * There are three modes of operation: 1) cancel all async IOs for a process -
 * fd is 0 and aiocbp is NULL 2) cancel all async IOs for file descriptor - fd
 * is > 0 and aiocbp is NULL 3) cancel one async IO associated with the given
 * aiocbp.
 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
 * target async IO requests, and AIO_ALLDONE if all target async IO requests
 * were already complete.
 * WARNING - do not deference aiocbp in this routine, it may point to user
 * land data that has not been copied in (when called from aio_cancel() )
 */
static int
do_aio_cancel(proc_t p, int fd, user_addr_t aiocbp,
			  boolean_t wait_for_completion, boolean_t disable_notification )
{
	aio_workq_entry		 		*entryp;
	int							result;

	result = -1;

	/* look for a match on our queue of async todo work. */
	AIO_LOCK;
	entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
	while ( entryp != NULL ) {
		aio_workq_entry		 		*next_entryp;

		next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
		if ( p == entryp->procp ) {
			/* the three match modes described in the header comment */
			if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
				 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
				 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
				/* we found a match so we remove the entry from the */
				/* todo work queue and place it on the done queue */
				TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
				aio_anchor.aio_async_workq_count--;
				entryp->errorval = ECANCELED;
				entryp->returnval = -1;
				if ( disable_notification )
					entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
				result = AIO_CANCELED;

				KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) | DBG_FUNC_NONE,
		     	  			  (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

				TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
				aio_anchor.aio_done_count++;
				p->aio_done_count++;
				/* AIO_COMPLETION guards the entry from being freed under
				 * us while we run completion with the lock dropped */
				entryp->flags |= AIO_COMPLETION;
				AIO_UNLOCK;

				/* do completion processing for this request */
				do_aio_completion( entryp );

				AIO_LOCK;
				entryp->flags &= ~AIO_COMPLETION;
				/* AIO_DO_FREE is set by aio_return()/_aio_exit() if they
				 * saw the entry while we were completing it */
				if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
					vm_map_t 		my_map;

					my_map = entryp->aio_map;
					entryp->aio_map = VM_MAP_NULL;
					AIO_UNLOCK;
					aio_free_request( entryp, my_map );
				}
				else
					AIO_UNLOCK;

				if ( aiocbp != USER_ADDR_NULL ) {
					/* single-request mode: we're done */
					return( result );
				}

				/* need to start over since aio_async_workq may have been */
				/* changed while we were away doing completion processing. */
				AIO_LOCK;
				entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
				continue;
			}
		}
		entryp = next_entryp;
	} /* while... */

	/*
	 * look for a match on our queue of synchronous todo work.  This will
	 * be a rare occurrence but could happen if a process is terminated while
	 * processing a lio_listio call.
	 */
	entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
	while ( entryp != NULL ) {
		aio_workq_entry		 		*next_entryp;

		next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
		if ( p == entryp->procp ) {
			if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
				 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
				 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
				/* we found a match so we remove the entry from the */
				/* todo work queue and place it on the done queue */
				TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
				aio_anchor.lio_sync_workq_count--;
				entryp->errorval = ECANCELED;
				entryp->returnval = -1;
				if ( disable_notification )
					entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
				result = AIO_CANCELED;

				KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_sync_workq)) | DBG_FUNC_NONE,
		     	  			  (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

				/* NOTE: no do_aio_completion() here - lio sync entries get
				 * their results via the waiting lio_listio() caller */
				TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
				aio_anchor.aio_done_count++;
				p->aio_done_count++;
				if ( aiocbp != USER_ADDR_NULL ) {
					AIO_UNLOCK;
					return( result );
				}
			}
		}
		entryp = next_entryp;
	} /* while... */

	/*
	 * look for a match on our queue of active async IO requests and
	 * return AIO_NOTCANCELED result.
	 */
	TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
		if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
			 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
			 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
			result = AIO_NOTCANCELED;

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE,
		     	  		  (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

			if ( wait_for_completion )
				entryp->flags |= AIO_WAITING; /* flag for special completion processing */
			if ( disable_notification )
				entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
			if ( aiocbp != USER_ADDR_NULL ) {
				AIO_UNLOCK;
				return( result );
			}
		}
	}

	/*
	 * if we didn't find any matches on the todo or active queues then look for a
	 * match on our queue of async IO requests that have completed and if found
	 * return AIO_ALLDONE result.
	 */
	if ( result == -1 ) {
		TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
			if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
				 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
				 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
				result = AIO_ALLDONE;

				KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE,
		     	  			  (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

				if ( aiocbp != USER_ADDR_NULL ) {
					AIO_UNLOCK;
					return( result );
				}
			}
		}
	}
	AIO_UNLOCK;

	return( result );

} /* do_aio_cancel */
852
853
854 /*
855 * aio_suspend - suspend the calling thread until at least one of the async
856 * IO operations referenced by uap->aiocblist has completed, until a signal
857 * interrupts the function, or uap->timeoutp time interval (optional) has
858 * passed.
859 * Returns 0 if one or more async IOs have completed else -1 and errno is
860 * set appropriately - EAGAIN if timeout elapses or EINTR if an interrupt
861 * woke us up.
862 */
/*
 * aio_suspend - cancellation-point wrapper: note that this thread is at a
 * pthread cancellation point, then do the real work in
 * aio_suspend_nocancel().  The arg structs are layout-compatible, hence
 * the cast.
 */
int
aio_suspend(proc_t p, struct aio_suspend_args *uap, int *retval )
{
	__pthread_testcancel(1);
	return(aio_suspend_nocancel(p, (struct aio_suspend_nocancel_args *)uap, retval));
}
869
870
/*
 * aio_suspend_nocancel - block until at least one of the async IO requests
 * in uap->aiocblist completes, a signal arrives, or the optional
 * uap->timeoutp interval expires.  Returns 0 (with *retval = 0) when one
 * of the requests is found on the done queue, EAGAIN on timeout, EINTR on
 * interrupt, EINVAL for bad arguments / no queued IO.
 */
int
aio_suspend_nocancel(proc_t p, struct aio_suspend_nocancel_args *uap, int *retval )
{
	int					error;
	int 				i, count;
	uint64_t			abstime;
	struct user_timespec ts;
	aio_workq_entry 	*entryp;
	user_addr_t			*aiocbpp;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START,
		     	  (int)p, uap->nent, 0, 0, 0 );

	*retval = -1;
	abstime = 0;		/* 0 deadline == wait forever */
	aiocbpp = NULL;

	/* quick check to see if there are any async IO requests queued up */
	AIO_LOCK;
	count = aio_get_all_queues_count( );
	AIO_UNLOCK;
	if ( count < 1 ) {
		error = EINVAL;
		goto ExitThisRoutine;
	}

	if ( uap->nent < 1 || uap->nent > aio_max_requests_per_process ) {
		error = EINVAL;
		goto ExitThisRoutine;
	}

	if ( uap->timeoutp != USER_ADDR_NULL ) {
		/* 32-bit callers pass a narrower timespec; widen it by hand */
		if ( proc_is64bit(p) ) {
			error = copyin( uap->timeoutp, &ts, sizeof(ts) );
		}
		else {
			struct timespec temp;
			error = copyin( uap->timeoutp, &temp, sizeof(temp) );
			if ( error == 0 ) {
				ts.tv_sec = temp.tv_sec;
				ts.tv_nsec = temp.tv_nsec;
			}
		}
		if ( error != 0 ) {
			error = EAGAIN;
			goto ExitThisRoutine;
		}

		if ( ts.tv_sec < 0 || ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000 ) {
			error = EINVAL;
			goto ExitThisRoutine;
		}

		/* convert relative interval to an absolute wakeup deadline */
		nanoseconds_to_absolutetime( (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
									 &abstime );
		clock_absolutetime_interval_to_deadline( abstime, &abstime );
	}

	/* we reserve enough space for largest possible pointer size */
	MALLOC( aiocbpp, user_addr_t *, (uap->nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK );
	if ( aiocbpp == NULL ) {
		error = EAGAIN;
		goto ExitThisRoutine;
	}

	/* copyin our aiocb pointers from list */
	error = copyin( uap->aiocblist, aiocbpp,
					proc_is64bit(p) ? (uap->nent * sizeof(user_addr_t))
									: (uap->nent * sizeof(uintptr_t)) );
	if ( error != 0 ) {
		error = EAGAIN;
		goto ExitThisRoutine;
	}

	/* we depend on a list of user_addr_t's so we need to munge and expand */
	/* when these pointers came from a 32-bit process */
	if ( !proc_is64bit(p) && sizeof(uintptr_t) < sizeof(user_addr_t) ) {
		/* position to the last entry and work back from there */
		/* (expanding in place, so back-to-front avoids clobbering) */
		uintptr_t	*my_ptrp = ((uintptr_t *)aiocbpp) + (uap->nent - 1);
		user_addr_t *my_addrp = aiocbpp + (uap->nent - 1);
		for (i = 0; i < uap->nent; i++, my_ptrp--, my_addrp--) {
			*my_addrp = (user_addr_t) (*my_ptrp);
		}
	}

	/* check list of aio requests to see if any have completed */
check_for_our_aiocbp:
	AIO_LOCK;
	for ( i = 0; i < uap->nent; i++ ) {
		user_addr_t	aiocbp;

		/* NULL elements are legal so check for 'em */
		aiocbp = *(aiocbpp + i);
		if ( aiocbp == USER_ADDR_NULL )
			continue;

		/* return immediately if any aio request in the list is done */
		TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
			if ( entryp->uaiocbp == aiocbp ) {
				*retval = 0;
				error = 0;
				AIO_UNLOCK;
				goto ExitThisRoutine;
			}
		}
	} /* for ( ; i < uap->nent; ) */

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE,
		     	  (int)p, uap->nent, 0, 0, 0 );

	/*
	 * wait for an async IO to complete or a signal fires or timeout expires.
	 * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
	 * interrupts us.  If an async IO completes before a signal fires or our
	 * timeout expires, we get a wakeup call from aio_work_thread().
	 * NOTE: assert_wait must happen while the lock is still held so a
	 * wakeup cannot slip between our queue scan and the block.
	 */
	assert_wait_deadline( (event_t) &p->AIO_SUSPEND_SLEEP_CHAN, THREAD_ABORTSAFE, abstime );
	AIO_UNLOCK;

	error = thread_block( THREAD_CONTINUE_NULL );

	if ( error == THREAD_AWAKENED ) {
		/*
		 * got our wakeup call from aio_work_thread().
		 * Since we can get a wakeup on this channel from another thread in the
		 * same process we head back up to make sure this is for the correct aiocbp.
		 * If it is the correct aiocbp we will return from where we do the check
		 * (see entryp->uaiocbp == aiocbp after check_for_our_aiocbp label)
		 * else we will fall out and just sleep again.
		 */
		goto check_for_our_aiocbp;
	}
	else if ( error == THREAD_TIMED_OUT ) {
		/* our timeout expired */
		error = EAGAIN;
	}
	else {
		/* we were interrupted */
		error = EINTR;
	}

ExitThisRoutine:
	if ( aiocbpp != NULL )
		FREE( aiocbpp, M_TEMP );

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_END,
		     	  (int)p, uap->nent, error, 0, 0 );

	return( error );

} /* aio_suspend */
1022
1023
1024 /* aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
1025 * file descriptor (uap->aiocbp->aio_fildes) from the buffer
1026 * (uap->aiocbp->aio_buf).
1027 */
1028
1029 int
1030 aio_write(proc_t p, struct aio_write_args *uap, int *retval )
1031 {
1032 int error;
1033
1034 *retval = 0;
1035
1036 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_START,
1037 (int)p, (int)uap->aiocbp, 0, 0, 0 );
1038
1039 error = aio_queue_async_request( p, uap->aiocbp, AIO_WRITE );
1040 if ( error != 0 )
1041 *retval = -1;
1042
1043 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_END,
1044 (int)p, (int)uap->aiocbp, error, 0, 0 );
1045
1046 return( error );
1047
1048 } /* aio_write */
1049
1050
/*
 * lio_listio - initiate a list of IO requests.  We process the list of aiocbs
 * either synchronously (mode == LIO_WAIT) or asynchronously (mode == LIO_NOWAIT).
 * The caller gets error and return status for each aiocb in the list via aio_error
 * and aio_return.  We must keep completed requests until released by the
 * aio_return call.
 * Returns 0 when every non-NULL aiocb was queued (and, for LIO_WAIT,
 * processed); otherwise returns the first queueing error encountered, or
 * EIO if any synchronous IO failed.
 */

int
lio_listio(proc_t p, struct lio_listio_args *uap, int *retval )
{
	int				i;
	int				call_result;	/* -1 means "no error recorded yet" */
	int				result;
	long				group_tag;	/* ties the list together for delayed completion */
	aio_workq_entry *	 	*entryp_listp;	/* one slot per aiocb in the caller's list */
	user_addr_t			*aiocbpp;	/* aiocb pointers, expanded to user_addr_t */

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START,
		     (int)p, uap->nent, uap->mode, 0, 0 );

	entryp_listp = NULL;
	aiocbpp = NULL;
	call_result = -1;
	*retval = -1;	/* assume failure until all requests are queued cleanly */
	if ( !(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT) ) {
		call_result = EINVAL;
		goto ExitRoutine;
	}

	if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
		call_result = EINVAL;
		goto ExitRoutine;
	}

	/*
	 * we use group_tag to mark IO requests for delayed completion processing
	 * which means we wait until all IO requests in the group have completed
	 * before we either return to the caller when mode is LIO_WAIT or signal
	 * user when mode is LIO_NOWAIT.
	 */
	group_tag = random();

	/*
	 * allocate a list of aio_workq_entry pointers that we will use to queue
	 * up all our requests at once while holding our lock.
	 */
	MALLOC( entryp_listp, void *, (uap->nent * sizeof(aio_workq_entry *)), M_TEMP, M_WAITOK );
	if ( entryp_listp == NULL ) {
		call_result = EAGAIN;
		goto ExitRoutine;
	}

	/* we reserve enough space for largest possible pointer size */
	MALLOC( aiocbpp, user_addr_t *, (uap->nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK );
	if ( aiocbpp == NULL ) {
		call_result = EAGAIN;
		goto ExitRoutine;
	}

	/* copyin our aiocb pointers from list; 32-bit callers hand us smaller pointers */
	result = copyin( uap->aiocblist, aiocbpp,
			IS_64BIT_PROCESS(p) ? (uap->nent * sizeof(user_addr_t))
					    : (uap->nent * sizeof(uintptr_t)) );
	if ( result != 0 ) {
		call_result = EAGAIN;
		goto ExitRoutine;
	}

	/* we depend on a list of user_addr_t's so we need to munge and expand */
	/* when these pointers came from a 32-bit process */
	if ( !IS_64BIT_PROCESS(p) && sizeof(uintptr_t) < sizeof(user_addr_t) ) {
		/* position to the last entry and work back from there */
		/* (walking backward lets us expand in place without clobbering */
		/* entries we have not read yet) */
		uintptr_t	*my_ptrp = ((uintptr_t *)aiocbpp) + (uap->nent - 1);
		user_addr_t	*my_addrp = aiocbpp + (uap->nent - 1);
		for (i = 0; i < uap->nent; i++, my_ptrp--, my_addrp--) {
			*my_addrp = (user_addr_t) (*my_ptrp);
		}
	}

	/* process list of aio requests - create a work queue entry for each one */
	for ( i = 0; i < uap->nent; i++ ) {
		user_addr_t my_aiocbp;

		*(entryp_listp + i) = NULL;
		my_aiocbp = *(aiocbpp + i);

		/* NULL elements are legal so check for 'em */
		if ( my_aiocbp == USER_ADDR_NULL )
			continue;

		if ( uap->mode == LIO_NOWAIT )
			result = lio_create_async_entry( p, my_aiocbp, uap->sigp,
							 group_tag, (entryp_listp + i) );
		else
			result = lio_create_sync_entry( p, my_aiocbp, group_tag,
							(entryp_listp + i) );

		/* remember only the first failure; keep creating the remaining entries */
		if ( result != 0 && call_result == -1 )
			call_result = result;
	}

	/*
	 * we need to protect this section since we do not want any of these grouped
	 * IO requests to begin until we have them all on the queue.
	 */
	AIO_LOCK;
	for ( i = 0; i < uap->nent; i++ ) {
		aio_workq_entry *entryp;

		/* NULL elements are legal so check for 'em */
		entryp = *(entryp_listp + i);
		if ( entryp == NULL )
			continue;

		/* check our aio limits to throttle bad or rude user land behavior */
		if ( aio_get_all_queues_count( ) >= aio_max_requests ||
		     aio_get_process_count( entryp->procp ) >= aio_max_requests_per_process ||
		     is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
			vm_map_t my_map;

			/* over limit or duplicate: drop this entry without queueing it. */
			/* zero aio_map while locked; aio_free_request needs the lock dropped. */
			my_map = entryp->aio_map;
			entryp->aio_map = VM_MAP_NULL;
			if ( call_result == -1 )
				call_result = EAGAIN;
			AIO_UNLOCK;
			aio_free_request( entryp, my_map );
			AIO_LOCK;
			continue;
		}

		/* place the request on the appropriate queue */
		if ( uap->mode == LIO_NOWAIT ) {
			TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
			aio_anchor.aio_async_workq_count++;

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
				     (int)p, (int)entryp->uaiocbp, 0, 0, 0 );
		}
		else {
			TAILQ_INSERT_TAIL( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
			aio_anchor.lio_sync_workq_count++;
		}
	}

	if ( uap->mode == LIO_NOWAIT ) {
		/* caller does not want to wait so we'll fire off a worker thread and return */
		wakeup_one( (caddr_t) &aio_anchor.aio_async_workq );
	}
	else {
		aio_workq_entry *entryp;
		int error;

		/*
		 * mode is LIO_WAIT - handle the IO requests now.
		 * The lock is dropped around each IO, so the queue can change
		 * under us; we rescan from the head after every completed IO.
		 */
		entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
		while ( entryp != NULL ) {
			if ( p == entryp->procp && group_tag == entryp->group_tag ) {

				TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
				aio_anchor.lio_sync_workq_count--;
				AIO_UNLOCK;

				if ( (entryp->flags & AIO_READ) != 0 ) {
					error = do_aio_read( entryp );
				}
				else if ( (entryp->flags & AIO_WRITE) != 0 ) {
					error = do_aio_write( entryp );
				}
				else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
					error = do_aio_fsync( entryp );
				}
				else {
					printf( "%s - unknown aio request - flags 0x%02X \n",
						__FUNCTION__, entryp->flags );
					error = EINVAL;
				}
				entryp->errorval = error;
				if ( error != 0 && call_result == -1 )
					call_result = EIO;

				AIO_LOCK;
				/* we're done with the IO request so move it on the done queue */
				TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
				aio_anchor.aio_done_count++;
				p->aio_done_count++;

				/* need to start over since lio_sync_workq may have been changed while we */
				/* were away doing the IO. */
				entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
				continue;
			} /* p == entryp->procp */

			entryp = TAILQ_NEXT( entryp, aio_workq_link );
		} /* while ( entryp != NULL ) */
	} /* uap->mode == LIO_WAIT */
	AIO_UNLOCK;

	/* call_result == -1 means we had no trouble queueing up requests */
	if ( call_result == -1 ) {
		call_result = 0;
		*retval = 0;
	}

ExitRoutine:
	if ( entryp_listp != NULL )
		FREE( entryp_listp, M_TEMP );
	if ( aiocbpp != NULL )
		FREE( aiocbpp, M_TEMP );

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END,
		     (int)p, call_result, 0, 0, 0 );

	return( call_result );

} /* lio_listio */
1268
1269
/*
 * aio worker thread.  this is where all the real work gets done.
 * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
 * after new work is queued up.
 * Each loop iteration: take one entry off the work queue, perform the IO
 * while impersonating the requesting process' address space, then move
 * the entry to that process' done queue and run completion processing.
 */

static void
aio_work_thread( void )
{
	aio_workq_entry		*entryp;

	for( ;; ) {
		AIO_LOCK;
		entryp = aio_get_some_work();
		if ( entryp == NULL ) {
			/*
			 * aio worker threads wait for some work to get queued up
			 * by aio_queue_async_request.  Once some work gets queued
			 * it will wake up one of these worker threads just before
			 * returning to our caller in user land.
			 */
			assert_wait( (event_t) &aio_anchor.aio_async_workq, THREAD_UNINT );
			AIO_UNLOCK;

			/* restart from the top of this function when woken */
			thread_block( (thread_continue_t)aio_work_thread );
			/* NOT REACHED */
		}
		else {
			int			error;
			vm_map_t 		currentmap;
			vm_map_t 		oldmap = VM_MAP_NULL;
			task_t			oldaiotask = TASK_NULL;
			struct uthread	*uthreadp = NULL;

			AIO_UNLOCK;

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START,
				     (int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0 );

			/*
			 * Assume the target's address space identity for the duration
			 * of the IO.  Skipped when the requester shares our map
			 * (i.e. the request came from our own process).
			 */
			currentmap = get_task_map( (current_proc())->task );
			if ( currentmap != entryp->aio_map ) {
				uthreadp = (struct uthread *) get_bsdthread_info(current_thread());
				oldaiotask = uthreadp->uu_aio_task;
				uthreadp->uu_aio_task = entryp->procp->task;
				oldmap = vm_map_switch( entryp->aio_map );
			}

			if ( (entryp->flags & AIO_READ) != 0 ) {
				error = do_aio_read( entryp );
			}
			else if ( (entryp->flags & AIO_WRITE) != 0 ) {
				error = do_aio_write( entryp );
			}
			else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
				error = do_aio_fsync( entryp );
			}
			else {
				printf( "%s - unknown aio request - flags 0x%02X \n",
					__FUNCTION__, entryp->flags );
				error = EINVAL;
			}
			entryp->errorval = error;
			/* restore our own address space identity if we switched above */
			if ( currentmap != entryp->aio_map ) {
				(void) vm_map_switch( oldmap );
				uthreadp->uu_aio_task = oldaiotask;
			}

			/* we're done with the IO request so pop it off the active queue and */
			/* push it on the done queue */
			AIO_LOCK;
			TAILQ_REMOVE( &entryp->procp->aio_activeq, entryp, aio_workq_link );
			aio_anchor.aio_active_count--;
			entryp->procp->aio_active_count--;
			TAILQ_INSERT_TAIL( &entryp->procp->aio_doneq, entryp, aio_workq_link );
			aio_anchor.aio_done_count++;
			entryp->procp->aio_done_count++;
			/* AIO_COMPLETION keeps the entry alive while we run completion */
			/* processing below without holding the lock */
			entryp->flags |= AIO_COMPLETION;

			/* remove our reference to the user land map. */
			if ( VM_MAP_NULL != entryp->aio_map ) {
				vm_map_t 		my_map;

				my_map = entryp->aio_map;
				entryp->aio_map = VM_MAP_NULL;
				AIO_UNLOCK;  /* must unlock before calling vm_map_deallocate() */
				vm_map_deallocate( my_map );
			}
			else {
				AIO_UNLOCK;
			}

			/* signal / wake up anyone waiting on this request */
			do_aio_completion( entryp );

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END,
				     (int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval,
				     entryp->returnval, 0 );

			AIO_LOCK;
			entryp->flags &= ~AIO_COMPLETION;
			/* AIO_DO_FREE set while we were busy means someone (exit/exec/ */
			/* close cleanup) wants this entry gone - free it now */
			if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
				vm_map_t 		my_map;

				my_map = entryp->aio_map;
				entryp->aio_map = VM_MAP_NULL;
				AIO_UNLOCK;
				aio_free_request( entryp, my_map );
			}
			else
				AIO_UNLOCK;
		}
	} /* for ( ;; ) */

	/* NOT REACHED */

} /* aio_work_thread */
1389
1390
1391 /*
1392 * aio_get_some_work - get the next async IO request that is ready to be executed.
1393 * aio_fsync complicates matters a bit since we cannot do the fsync until all async
1394 * IO requests at the time the aio_fsync call came in have completed.
1395 * NOTE - AIO_LOCK must be held by caller
1396 */
1397
1398 static aio_workq_entry *
1399 aio_get_some_work( void )
1400 {
1401 aio_workq_entry *entryp;
1402
1403 /* pop some work off the work queue and add to our active queue */
1404 for ( entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
1405 entryp != NULL;
1406 entryp = TAILQ_NEXT( entryp, aio_workq_link ) ) {
1407
1408 if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1409 /* leave aio_fsync calls on the work queue if there are IO */
1410 /* requests on the active queue for the same file descriptor. */
1411 if ( aio_delay_fsync_request( entryp ) ) {
1412
1413 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) | DBG_FUNC_NONE,
1414 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1415 continue;
1416 }
1417 }
1418 break;
1419 }
1420
1421 if ( entryp != NULL ) {
1422 TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1423 aio_anchor.aio_async_workq_count--;
1424 TAILQ_INSERT_TAIL( &entryp->procp->aio_activeq, entryp, aio_workq_link );
1425 aio_anchor.aio_active_count++;
1426 entryp->procp->aio_active_count++;
1427 }
1428
1429 return( entryp );
1430
1431 } /* aio_get_some_work */
1432
1433
1434 /*
1435 * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed at
1436 * this time. Delay will happen when there are any active IOs for the same file
1437 * descriptor that were queued at time the aio_sync call was queued.
1438 * NOTE - AIO_LOCK must be held by caller
1439 */
1440 static boolean_t
1441 aio_delay_fsync_request( aio_workq_entry *entryp )
1442 {
1443 aio_workq_entry *my_entryp;
1444
1445 TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1446 if ( my_entryp->fsyncp != USER_ADDR_NULL &&
1447 entryp->uaiocbp == my_entryp->fsyncp &&
1448 entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1449 return( TRUE );
1450 }
1451 }
1452
1453 return( FALSE );
1454
1455 } /* aio_delay_fsync_request */
1456
1457
/*
 * aio_queue_async_request - queue up an async IO request on our work queue then
 * wake up one of our worker threads to do the actual work.  We get a reference
 * to our caller's user land map in order to keep it around while we are
 * processing the request.
 * Returns 0 on success; EAGAIN for allocation/copyin failures, duplicate
 * requests, or exceeded limits; otherwise an aio_validate error.
 */

static int
aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO )
{
	aio_workq_entry	*entryp;
	int		result;

	entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
	if ( entryp == NULL ) {
		result = EAGAIN;
		goto error_exit;
	}
	bzero( entryp, sizeof(*entryp) );

	/* fill in the rest of the aio_workq_entry */
	entryp->procp = procp;
	entryp->uaiocbp = aiocbp;
	entryp->flags |= kindOfIO;	/* AIO_READ, AIO_WRITE, or AIO_FSYNC */
	entryp->aio_map = VM_MAP_NULL;

	/* copy in the user's aiocb; 32-bit callers need their struct expanded */
	/* to the 64-bit internal form via do_munge_aiocb */
	if ( !IS_64BIT_PROCESS(procp) ) {
		struct aiocb aiocb32;

		result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
		if ( result == 0 )
			do_munge_aiocb( &aiocb32, &entryp->aiocb );
	} else
		result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );

	if ( result != 0 ) {
		result = EAGAIN;
		goto error_exit;
	}

	/* do some more validation on the aiocb and embedded file descriptor */
	result = aio_validate( entryp );
	if ( result != 0 )
		goto error_exit;

	/* get a reference to the user land map in order to keep it around */
	entryp->aio_map = get_task_map( procp->task );
	vm_map_reference( entryp->aio_map );

	AIO_LOCK;

	/* duplicate aiocb pointers for the same process are rejected */
	if ( is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
		AIO_UNLOCK;
		result = EAGAIN;
		goto error_exit;
	}

	/* check our aio limits to throttle bad or rude user land behavior */
	if ( aio_get_all_queues_count( ) >= aio_max_requests ||
	     aio_get_process_count( procp ) >= aio_max_requests_per_process ) {
		AIO_UNLOCK;
		result = EAGAIN;
		goto error_exit;
	}

	/*
	 * aio_fsync calls sync up all async IO requests queued at the time
	 * the aio_fsync call was made.  So we mark each currently queued async
	 * IO with a matching file descriptor as must complete before we do the
	 * fsync.  We set the fsyncp field of each matching async IO
	 * request with the aiocb pointer passed in on the aio_fsync call to
	 * know which IOs must complete before we process the aio_fsync call.
	 */
	if ( (kindOfIO & AIO_FSYNC) != 0 )
		aio_mark_requests( entryp );

	/* queue up on our aio asynchronous work queue */
	TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
	aio_anchor.aio_async_workq_count++;

	/* one worker thread is enough; the entry is handed to whichever wakes */
	wakeup_one( (caddr_t) &aio_anchor.aio_async_workq );
	AIO_UNLOCK;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
		     (int)procp, (int)aiocbp, 0, 0, 0 );

	return( 0 );

error_exit:
	if ( entryp != NULL ) {
		/* this entry has not been queued up so no worries about unlocked */
		/* state and aio_map */
		aio_free_request( entryp, entryp->aio_map );
	}

	return( result );

} /* aio_queue_async_request */
1556
1557
1558 /*
1559 * lio_create_async_entry - allocate an aio_workq_entry and fill it in.
1560 * If all goes well return 0 and pass the aio_workq_entry pointer back to
1561 * our caller. We get a reference to our caller's user land map in order to keep
1562 * it around while we are processing the request.
 * lio_listio calls behave differently at completion: they do completion notification
1564 * when all async IO requests have completed. We use group_tag to tag IO requests
1565 * that behave in the delay notification manner.
1566 */
1567
static int
lio_create_async_entry(proc_t procp, user_addr_t aiocbp,
		       user_addr_t sigp, long group_tag,
		       aio_workq_entry **entrypp )
{
	aio_workq_entry	*entryp;
	int		result;

	entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
	if ( entryp == NULL ) {
		result = EAGAIN;
		goto error_exit;
	}
	bzero( entryp, sizeof(*entryp) );

	/* fill in the rest of the aio_workq_entry */
	entryp->procp = procp;
	entryp->uaiocbp = aiocbp;
	entryp->flags |= AIO_LIO;	/* actual direction (read/write) resolved by aio_validate */
	entryp->group_tag = group_tag;
	entryp->aio_map = VM_MAP_NULL;

	/* copy in the user's aiocb; 32-bit callers need their struct expanded */
	if ( !IS_64BIT_PROCESS(procp) ) {
		struct aiocb aiocb32;

		result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
		if ( result == 0 )
			do_munge_aiocb( &aiocb32, &entryp->aiocb );
	} else
		result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );

	if ( result != 0 ) {
		result = EAGAIN;
		goto error_exit;
	}

	/* look for lio_listio LIO_NOP requests and ignore them. */
	/* Not really an error, but we need to free our aio_workq_entry. */
	/* (entry is freed at error_exit; *entrypp stays NULL so the caller skips it) */
	if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
		result = 0;
		goto error_exit;
	}

	/* use sigevent passed in to lio_listio for each of our calls, but only */
	/* do completion notification after the last request completes. */
	if ( sigp != USER_ADDR_NULL ) {
		if ( !IS_64BIT_PROCESS(procp) ) {
			struct sigevent sigevent32;

			result = copyin( sigp, &sigevent32, sizeof(sigevent32) );
			if ( result == 0 ) {
				/* also need to munge aio_sigevent since it contains pointers */
				/* special case here.  since we do not know if sigev_value is an */
				/* int or a ptr we do NOT cast the ptr to a user_addr_t.   This  */
				/* means if we send this info back to user space we need to remember */
				/* sigev_value was not expanded for the 32-bit case.  */
				/* NOTE - this does NOT affect us since we don't support sigev_value */
				/* yet in the aio context.  */
				//LP64
				entryp->aiocb.aio_sigevent.sigev_notify = sigevent32.sigev_notify;
				entryp->aiocb.aio_sigevent.sigev_signo = sigevent32.sigev_signo;
				entryp->aiocb.aio_sigevent.sigev_value.size_equivalent.sival_int =
					sigevent32.sigev_value.sival_int;
				entryp->aiocb.aio_sigevent.sigev_notify_function =
					CAST_USER_ADDR_T(sigevent32.sigev_notify_function);
				entryp->aiocb.aio_sigevent.sigev_notify_attributes =
					CAST_USER_ADDR_T(sigevent32.sigev_notify_attributes);
			}
		} else
			result = copyin( sigp, &entryp->aiocb.aio_sigevent, sizeof(entryp->aiocb.aio_sigevent) );

		if ( result != 0 ) {
			result = EAGAIN;
			goto error_exit;
		}
	}

	/* do some more validation on the aiocb and embedded file descriptor */
	result = aio_validate( entryp );
	if ( result != 0 )
		goto error_exit;

	/* get a reference to the user land map in order to keep it around */
	entryp->aio_map = get_task_map( procp->task );
	vm_map_reference( entryp->aio_map );

	*entrypp = entryp;
	return( 0 );

error_exit:
	if ( entryp != NULL )
		zfree( aio_workq_zonep, entryp );

	return( result );

} /* lio_create_async_entry */
1664
1665
1666 /*
1667 * aio_mark_requests - aio_fsync calls synchronize file data for all queued async IO
1668 * requests at the moment the aio_fsync call is queued. We use aio_workq_entry.fsyncp
1669 * to mark each async IO that must complete before the fsync is done. We use the uaiocbp
1670 * field from the aio_fsync call as the aio_workq_entry.fsyncp in marked requests.
1671 * NOTE - AIO_LOCK must be held by caller
1672 */
1673
1674 static void
1675 aio_mark_requests( aio_workq_entry *entryp )
1676 {
1677 aio_workq_entry *my_entryp;
1678
1679 TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1680 if ( entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1681 my_entryp->fsyncp = entryp->uaiocbp;
1682 }
1683 }
1684
1685 TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
1686 if ( entryp->procp == my_entryp->procp &&
1687 entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1688 my_entryp->fsyncp = entryp->uaiocbp;
1689 }
1690 }
1691
1692 } /* aio_mark_requests */
1693
1694
1695 /*
1696 * lio_create_sync_entry - allocate an aio_workq_entry and fill it in.
1697 * If all goes well return 0 and pass the aio_workq_entry pointer back to
1698 * our caller.
 * lio_listio calls behave differently at completion: they do completion notification
1700 * when all async IO requests have completed. We use group_tag to tag IO requests
1701 * that behave in the delay notification manner.
1702 */
1703
1704 static int
1705 lio_create_sync_entry(proc_t procp, user_addr_t aiocbp,
1706 long group_tag, aio_workq_entry **entrypp )
1707 {
1708 aio_workq_entry *entryp;
1709 int result;
1710
1711 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1712 if ( entryp == NULL ) {
1713 result = EAGAIN;
1714 goto error_exit;
1715 }
1716 bzero( entryp, sizeof(*entryp) );
1717
1718 /* fill in the rest of the aio_workq_entry */
1719 entryp->procp = procp;
1720 entryp->uaiocbp = aiocbp;
1721 entryp->flags |= AIO_LIO;
1722 entryp->group_tag = group_tag;
1723 entryp->aio_map = VM_MAP_NULL;
1724
1725 if ( !IS_64BIT_PROCESS(procp) ) {
1726 struct aiocb aiocb32;
1727
1728 result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
1729 if ( result == 0 )
1730 do_munge_aiocb( &aiocb32, &entryp->aiocb );
1731 } else
1732 result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1733
1734 if ( result != 0 ) {
1735 result = EAGAIN;
1736 goto error_exit;
1737 }
1738
1739 /* look for lio_listio LIO_NOP requests and ignore them. */
1740 /* Not really an error, but we need to free our aio_workq_entry. */
1741 if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
1742 result = 0;
1743 goto error_exit;
1744 }
1745
1746 result = aio_validate( entryp );
1747 if ( result != 0 ) {
1748 goto error_exit;
1749 }
1750
1751 *entrypp = entryp;
1752 return( 0 );
1753
1754 error_exit:
1755 if ( entryp != NULL )
1756 zfree( aio_workq_zonep, entryp );
1757
1758 return( result );
1759
1760 } /* lio_create_sync_entry */
1761
1762
1763 /*
1764 * aio_free_request - remove our reference on the user land map and
1765 * free the work queue entry resources.
1766 * We are not holding the lock here thus aio_map is passed in and
1767 * zeroed while we did have the lock.
1768 */
1769
1770 static int
1771 aio_free_request( aio_workq_entry *entryp, vm_map_t the_map )
1772 {
1773 /* remove our reference to the user land map. */
1774 if ( VM_MAP_NULL != the_map ) {
1775 vm_map_deallocate( the_map );
1776 }
1777
1778 zfree( aio_workq_zonep, entryp );
1779
1780 return( 0 );
1781
1782 } /* aio_free_request */
1783
1784
/* aio_validate - validate the aiocb passed in by one of the aio syscalls.
 * For AIO_LIO entries, also translates aio_lio_opcode into the AIO_READ /
 * AIO_WRITE flag.  Checks buffer/offset/nbytes sanity, the sigevent
 * notification mode, and that the file descriptor is a vnode opened with
 * the required read or write access.  Returns 0 or an errno.
 */

static int
aio_validate( aio_workq_entry *entryp )
{
	struct fileproc 				*fp;
	int						flag;
	int						result;

	result = 0;

	/* for lio_listio entries, derive the IO direction from the opcode */
	if ( (entryp->flags & AIO_LIO) != 0 ) {
		if ( entryp->aiocb.aio_lio_opcode == LIO_READ )
			entryp->flags |= AIO_READ;
		else if ( entryp->aiocb.aio_lio_opcode == LIO_WRITE )
			entryp->flags |= AIO_WRITE;
		else if ( entryp->aiocb.aio_lio_opcode == LIO_NOP )
			return( 0 );
		else
			return( EINVAL );
	}

	/* writes and fsyncs need the fd opened for writing; reads need FREAD */
	flag = FREAD;
	if ( (entryp->flags & (AIO_WRITE | AIO_FSYNC)) != 0 ) {
		flag = FWRITE;
	}

	if ( (entryp->flags & (AIO_READ | AIO_WRITE)) != 0 ) {
		// LP64todo - does max value for aio_nbytes need to grow?
		if ( entryp->aiocb.aio_nbytes > INT_MAX ||
		     entryp->aiocb.aio_buf == USER_ADDR_NULL ||
		     entryp->aiocb.aio_offset < 0 )
			return( EINVAL );
	}

	/* validate aiocb.aio_sigevent.  at this point we only support sigev_notify
	 * equal to SIGEV_SIGNAL or SIGEV_NONE.  this means sigev_value,
	 * sigev_notify_function, and sigev_notify_attributes are ignored.
	 */
	if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ) {
		int		signum;
		/* make sure we have a valid signal number */
		signum = entryp->aiocb.aio_sigevent.sigev_signo;
		if ( signum <= 0 || signum >= NSIG ||
		     signum == SIGKILL || signum == SIGSTOP )
			return (EINVAL);
	}
	else if ( entryp->aiocb.aio_sigevent.sigev_notify != SIGEV_NONE )
		return (EINVAL);

	/* validate the file descriptor and that the file was opened
	 * for the appropriate read / write access.
	 */
	proc_fdlock(entryp->procp);

	result = fp_lookup( entryp->procp, entryp->aiocb.aio_fildes, &fp , 1);
	if ( result == 0 ) {
		if ( (fp->f_fglob->fg_flag & flag) == 0 ) {
			/* we don't have read or write access */
			result = EBADF;
		}
		else if ( fp->f_fglob->fg_type != DTYPE_VNODE ) {
			/* this is not a file */
			result = ESPIPE;
		} else
			fp->f_flags |= FP_AIOISSUED;	/* note this fd has had aio issued on it */

		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp , 1);
	}
	else {
		result = EBADF;
	}

	proc_fdunlock(entryp->procp);

	return( result );

} /* aio_validate */
1864
1865
1866 /*
1867 * aio_get_process_count - runs through our queues that hold outstanding
1868 * async IO reqests and totals up number of requests for the given
1869 * process.
1870 * NOTE - caller must hold aio lock!
1871 */
1872
1873 static int
1874 aio_get_process_count(proc_t procp )
1875 {
1876 aio_workq_entry *entryp;
1877 int count;
1878
1879 /* begin with count of completed async IO requests for this process */
1880 count = procp->aio_done_count;
1881
1882 /* add in count of active async IO requests for this process */
1883 count += procp->aio_active_count;
1884
1885 /* look for matches on our queue of asynchronous todo work */
1886 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
1887 if ( procp == entryp->procp ) {
1888 count++;
1889 }
1890 }
1891
1892 /* look for matches on our queue of synchronous todo work */
1893 TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
1894 if ( procp == entryp->procp ) {
1895 count++;
1896 }
1897 }
1898
1899 return( count );
1900
1901 } /* aio_get_process_count */
1902
1903
1904 /*
1905 * aio_get_all_queues_count - get total number of entries on all aio work queues.
1906 * NOTE - caller must hold aio lock!
1907 */
1908
1909 static int
1910 aio_get_all_queues_count( void )
1911 {
1912 int count;
1913
1914 count = aio_anchor.aio_async_workq_count;
1915 count += aio_anchor.lio_sync_workq_count;
1916 count += aio_anchor.aio_active_count;
1917 count += aio_anchor.aio_done_count;
1918
1919 return( count );
1920
1921 } /* aio_get_all_queues_count */
1922
1923
/*
 * do_aio_completion.  Handle async IO completion.
 * Exactly one of three notifications happens per completed request:
 * deliver the requested signal, wake a thread waiting in exit/exec/close
 * cleanup, or wake any thread blocked in aio_suspend.
 */

static void
do_aio_completion( aio_workq_entry *entryp )
{
	/* signal user land process if appropriate.  AIO_DISABLE suppresses */
	/* notification (set elsewhere during teardown). */
	if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
	     (entryp->flags & AIO_DISABLE) == 0 ) {

		/*
		 * if group_tag is non zero then make sure this is the last IO request
		 * in the group before we signal.
		 */
		if ( entryp->group_tag == 0 ||
		     (entryp->group_tag != 0 && aio_last_group_io( entryp )) ) {
			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) | DBG_FUNC_NONE,
				     (int)entryp->procp, (int)entryp->uaiocbp,
				     entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 );

			psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );
			return;
		}
	}

	/*
	 * need to handle case where a process is trying to exit, exec, or close
	 * and is currently waiting for active aio requests to complete.  If
	 * AIO_WAITING is set then we need to look to see if there are any
	 * other requests in the active queue for this process.  If there are
	 * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel.  If
	 * there are some still active then do nothing - we only want to wakeup
	 * when all active aio requests for the process are complete.
	 */
	if ( (entryp->flags & AIO_WAITING) != 0 ) {
		int		active_requests;

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
			     (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

		AIO_LOCK;
		active_requests = aio_active_requests_for_process( entryp->procp );
		//AIO_UNLOCK;
		if ( active_requests < 1 ) {
			/* no active aio requests for this process, continue exiting */
			wakeup_one( (caddr_t) &entryp->procp->AIO_CLEANUP_SLEEP_CHAN );

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
				     (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
		}
		AIO_UNLOCK;
		return;
	}

	/*
	 * aio_suspend case when a signal was not requested.  In that scenario we
	 * are sleeping on the AIO_SUSPEND_SLEEP_CHAN channel.
	 * NOTE - the assumption here is that this wakeup call is inexpensive.
	 * we really only need to do this when an aio_suspend call is pending.
	 * If we find the wakeup call should be avoided we could mark the
	 * async IO requests given in the list provided by aio_suspend and only
	 * call wakeup for them.  If we do mark them we should unmark them after
	 * the aio_suspend wakes up.
	 */
	AIO_LOCK;
	wakeup_one( (caddr_t) &entryp->procp->AIO_SUSPEND_SLEEP_CHAN );
	AIO_UNLOCK;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE,
		     (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

	return;

} /* do_aio_completion */
1999
2000
2001 /*
2002 * aio_last_group_io - checks to see if this is the last unfinished IO request
2003 * for the given group_tag. Returns TRUE if there are no other active IO
2004 * requests for this group or FALSE if the are active IO requests
2005 * NOTE - AIO_LOCK must be held by caller
2006 */
2007
2008 static boolean_t
2009 aio_last_group_io( aio_workq_entry *entryp )
2010 {
2011 aio_workq_entry *my_entryp;
2012
2013 /* look for matches on our queue of active async IO requests */
2014 TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
2015 if ( my_entryp->group_tag == entryp->group_tag )
2016 return( FALSE );
2017 }
2018
2019 /* look for matches on our queue of asynchronous todo work */
2020 TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
2021 if ( my_entryp->group_tag == entryp->group_tag )
2022 return( FALSE );
2023 }
2024
2025 /* look for matches on our queue of synchronous todo work */
2026 TAILQ_FOREACH( my_entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
2027 if ( my_entryp->group_tag == entryp->group_tag )
2028 return( FALSE );
2029 }
2030
2031 return( TRUE );
2032
2033 } /* aio_last_group_io */
2034
2035
2036 /*
2037 * do_aio_read
2038 */
2039 static int
2040 do_aio_read( aio_workq_entry *entryp )
2041 {
2042 struct fileproc *fp;
2043 int error;
2044 struct vfs_context context;
2045
2046 if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp , 0)) )
2047 return(error);
2048 if ( (fp->f_fglob->fg_flag & FREAD) == 0 ) {
2049 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2050 return(EBADF);
2051 }
2052
2053 /*
2054 * <rdar://4714366>
2055 * Needs vfs_context_t from vfs_context_create() in entryp!
2056 */
2057 context.vc_thread = proc_thread(entryp->procp); /* XXX */
2058 context.vc_ucred = fp->f_fglob->fg_cred;
2059
2060 error = dofileread(&context, fp,
2061 entryp->aiocb.aio_buf,
2062 entryp->aiocb.aio_nbytes,
2063 entryp->aiocb.aio_offset, FOF_OFFSET,
2064 &entryp->returnval);
2065 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2066
2067 return( error );
2068
2069 } /* do_aio_read */
2070
2071
2072 /*
2073 * do_aio_write
2074 */
2075 static int
2076 do_aio_write( aio_workq_entry *entryp )
2077 {
2078 struct fileproc *fp;
2079 int error;
2080 struct vfs_context context;
2081
2082 if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp , 0)) )
2083 return(error);
2084 if ( (fp->f_fglob->fg_flag & FWRITE) == 0 ) {
2085 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2086 return(EBADF);
2087 }
2088
2089 /*
2090 * <rdar://4714366>
2091 * Needs vfs_context_t from vfs_context_create() in entryp!
2092 */
2093 context.vc_thread = proc_thread(entryp->procp); /* XXX */
2094 context.vc_ucred = fp->f_fglob->fg_cred;
2095
2096 /* NB: tell dofilewrite the offset, and to use the proc cred */
2097 error = dofilewrite(&context,
2098 fp,
2099 entryp->aiocb.aio_buf,
2100 entryp->aiocb.aio_nbytes,
2101 entryp->aiocb.aio_offset,
2102 FOF_OFFSET | FOF_PCRED,
2103 &entryp->returnval);
2104
2105 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2106
2107 return( error );
2108
2109 } /* do_aio_write */
2110
2111
2112 /*
2113 * aio_active_requests_for_process - return number of active async IO
2114 * requests for the given process.
2115 * NOTE - caller must hold aio lock!
2116 */
2117
2118 static int
2119 aio_active_requests_for_process(proc_t procp )
2120 {
2121
2122 return( procp->aio_active_count );
2123
2124 } /* aio_active_requests_for_process */
2125
2126
2127 /*
2128 * do_aio_fsync
2129 */
2130 static int
2131 do_aio_fsync( aio_workq_entry *entryp )
2132 {
2133 struct vfs_context context;
2134 struct vnode *vp;
2135 struct fileproc *fp;
2136 int error;
2137
2138 /*
2139 * NOTE - we will not support AIO_DSYNC until fdatasync() is supported.
2140 * AIO_DSYNC is caught before we queue up a request and flagged as an error.
2141 * The following was shamelessly extracted from fsync() implementation.
2142 */
2143
2144 error = fp_getfvp( entryp->procp, entryp->aiocb.aio_fildes, &fp, &vp);
2145 if ( error == 0 ) {
2146 if ( (error = vnode_getwithref(vp)) ) {
2147 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2148 entryp->returnval = -1;
2149 return(error);
2150 }
2151 context.vc_thread = current_thread();
2152 context.vc_ucred = fp->f_fglob->fg_cred;
2153
2154 error = VNOP_FSYNC( vp, MNT_WAIT, &context);
2155
2156 (void)vnode_put(vp);
2157
2158 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2159 }
2160 if ( error != 0 )
2161 entryp->returnval = -1;
2162
2163 return( error );
2164
2165 } /* do_aio_fsync */
2166
2167
2168 /*
2169 * is_already_queued - runs through our queues to see if the given
2170 * aiocbp / process is there. Returns TRUE if there is a match
2171 * on any of our aio queues.
2172 * NOTE - callers must hold aio lock!
2173 */
2174
2175 static boolean_t
2176 is_already_queued(proc_t procp,
2177 user_addr_t aiocbp )
2178 {
2179 aio_workq_entry *entryp;
2180 boolean_t result;
2181
2182 result = FALSE;
2183
2184 /* look for matches on our queue of async IO requests that have completed */
2185 TAILQ_FOREACH( entryp, &procp->aio_doneq, aio_workq_link ) {
2186 if ( aiocbp == entryp->uaiocbp ) {
2187 result = TRUE;
2188 goto ExitThisRoutine;
2189 }
2190 }
2191
2192 /* look for matches on our queue of active async IO requests */
2193 TAILQ_FOREACH( entryp, &procp->aio_activeq, aio_workq_link ) {
2194 if ( aiocbp == entryp->uaiocbp ) {
2195 result = TRUE;
2196 goto ExitThisRoutine;
2197 }
2198 }
2199
2200 /* look for matches on our queue of asynchronous todo work */
2201 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
2202 if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
2203 result = TRUE;
2204 goto ExitThisRoutine;
2205 }
2206 }
2207
2208 /* look for matches on our queue of synchronous todo work */
2209 TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
2210 if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
2211 result = TRUE;
2212 goto ExitThisRoutine;
2213 }
2214 }
2215
2216 ExitThisRoutine:
2217 return( result );
2218
2219 } /* is_already_queued */
2220
2221
2222 /*
2223 * aio initialization
2224 */
2225 __private_extern__ void
2226 aio_init( void )
2227 {
2228 int i;
2229
2230 aio_lock_grp_attr = lck_grp_attr_alloc_init();
2231 aio_lock_grp = lck_grp_alloc_init("aio", aio_lock_grp_attr);
2232 aio_lock_attr = lck_attr_alloc_init();
2233
2234 aio_lock = lck_mtx_alloc_init(aio_lock_grp, aio_lock_attr);
2235
2236 AIO_LOCK;
2237 TAILQ_INIT( &aio_anchor.aio_async_workq );
2238 TAILQ_INIT( &aio_anchor.lio_sync_workq );
2239 aio_anchor.aio_async_workq_count = 0;
2240 aio_anchor.lio_sync_workq_count = 0;
2241 aio_anchor.aio_active_count = 0;
2242 aio_anchor.aio_done_count = 0;
2243 AIO_UNLOCK;
2244
2245 i = sizeof( aio_workq_entry );
2246 aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );
2247
2248 _aio_create_worker_threads( aio_worker_threads );
2249
2250 return;
2251
2252 } /* aio_init */
2253
2254
2255 /*
2256 * aio worker threads created here.
2257 */
2258 __private_extern__ void
2259 _aio_create_worker_threads( int num )
2260 {
2261 int i;
2262
2263 /* create some worker threads to handle the async IO requests */
2264 for ( i = 0; i < num; i++ ) {
2265 thread_t myThread;
2266
2267 myThread = kernel_thread( kernel_task, aio_work_thread );
2268 if ( THREAD_NULL == myThread ) {
2269 printf( "%s - failed to create a work thread \n", __FUNCTION__ );
2270 }
2271 }
2272
2273 return;
2274
2275 } /* _aio_create_worker_threads */
2276
2277 /*
2278 * Return the current activation utask
2279 */
2280 task_t
2281 get_aiotask(void)
2282 {
2283 return ((struct uthread *)get_bsdthread_info(current_thread()))->uu_aio_task;
2284 }
2285
2286
2287 /*
2288 * In the case of an aiocb from a
2289 * 32-bit process we need to expand some longs and pointers to the correct
2290 * sizes in order to let downstream code always work on the same type of
2291 * aiocb (in our case that is a user_aiocb)
2292 */
2293 static void
2294 do_munge_aiocb( struct aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
2295 {
2296 the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2297 the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2298 the_user_aiocbp->aio_buf = CAST_USER_ADDR_T(my_aiocbp->aio_buf);
2299 the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2300 the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2301 the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
2302
2303 /* special case here. since we do not know if sigev_value is an */
2304 /* int or a ptr we do NOT cast the ptr to a user_addr_t. This */
2305 /* means if we send this info back to user space we need to remember */
2306 /* sigev_value was not expanded for the 32-bit case. */
2307 /* NOTE - this does NOT affect us since we don't support sigev_value */
2308 /* yet in the aio context. */
2309 //LP64
2310 the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2311 the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
2312 the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2313 my_aiocbp->aio_sigevent.sigev_value.sival_int;
2314 the_user_aiocbp->aio_sigevent.sigev_notify_function =
2315 CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function);
2316 the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2317 CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_attributes);
2318 }