[apple/xnu.git] / bsd / kern / kern_aio.c (xnu-517)
1/*
2 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
7 *
8 * This file contains Original Code and/or Modifications of Original Code
9 * as defined in and that are subject to the Apple Public Source License
10 * Version 2.0 (the 'License'). You may not use this file except in
11 * compliance with the License. Please obtain a copy of the License at
12 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * file.
14 *
15 * The Original Code and all software distributed under the License are
16 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
17 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
18 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
20 * Please see the License for the specific language governing rights and
21 * limitations under the License.
22 *
23 * @APPLE_LICENSE_HEADER_END@
24 */
25
26
27/*
28 * todo:
29 * 1) ramesh is looking into how to replace taking a reference on
30 * the user's map (vm_map_reference()) since it is believed that
31 * would not hold the process for us.
32 * 2) david is looking into a way for us to set the priority of the
33 * worker threads to match that of the user's thread when the
34 * async IO was queued.
35 */
36
37
38/*
39 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
40 */
41
42#include <sys/systm.h>
43#include <sys/buf.h>
44#include <sys/fcntl.h>
45#include <sys/file.h>
46#include <sys/filedesc.h>
47#include <sys/kernel.h>
48#include <sys/vnode.h>
49#include <sys/malloc.h>
50#include <sys/mount.h>
51#include <sys/param.h>
52#include <sys/proc.h>
53#include <sys/sysctl.h>
54#include <sys/unistd.h>
55#include <sys/user.h>
56
57#include <sys/aio_kern.h>
58
59#include <machine/limits.h>
60#include <kern/zalloc.h>
61#include <kern/task.h>
62
63#include <sys/kdebug.h>
64#define AIO_work_queued 1
65#define AIO_worker_wake 2
66#define AIO_completion_sig 3
67#define AIO_completion_cleanup_wait 4
68#define AIO_completion_cleanup_wake 5
69#define AIO_completion_suspend_wake 6
70#define AIO_fsync_delay 7
71#define AIO_cancel 10
72#define AIO_cancel_async_workq 11
73#define AIO_cancel_sync_workq 12
74#define AIO_cancel_activeq 13
75#define AIO_cancel_doneq 14
76#define AIO_fsync 20
77#define AIO_read 30
78#define AIO_write 40
79#define AIO_listio 50
80#define AIO_error 60
81#define AIO_error_val 61
82#define AIO_error_activeq 62
83#define AIO_error_workq 63
84#define AIO_return 70
85#define AIO_return_val 71
86#define AIO_return_activeq 72
87#define AIO_return_workq 73
88#define AIO_exec 80
89#define AIO_exit 90
90#define AIO_exit_sleep 91
91#define AIO_close 100
92#define AIO_close_sleep 101
93#define AIO_suspend 110
94#define AIO_suspend_sleep 111
95#define AIO_worker_thread 120
96
97#if 0
98#undef KERNEL_DEBUG
99#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
100#endif
101
102/*
103 * aio requests queue up on the aio_async_workq or lio_sync_workq (for
104 * lio_listio LIO_WAIT). Requests then move to the per process aio_activeq
105 * (proc.aio_activeq) when one of our worker threads starts the IO.
106 * And finally, requests move to the per process aio_doneq (proc.aio_doneq)
107 * when the IO request completes. The request remains on aio_doneq until
108 * the user process calls aio_return or the process exits; either way, that
109 * is our trigger to release aio resources.
110 */
111struct aio_anchor_cb
112{
113 int aio_async_workq_count; /* entries on aio_async_workq */
114 int lio_sync_workq_count; /* entries on lio_sync_workq */
115 int aio_active_count; /* entries on all active queues (proc.aio_activeq) */
116 int aio_done_count; /* entries on all done queues (proc.aio_doneq) */
117 TAILQ_HEAD( , aio_workq_entry ) aio_async_workq;
118 TAILQ_HEAD( , aio_workq_entry ) lio_sync_workq;
119};
120typedef struct aio_anchor_cb aio_anchor_cb;
121
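/*
 * A minimal user land sketch of the lifecycle described above, using the same
 * <sys/queue.h> TAILQ macros this file relies on. The queue and entry names
 * below are made up for illustration; only the TAILQ usage mirrors the real code.
 */
#if 0 /* illustrative sketch, not part of this file */
#include <sys/queue.h>
#include <stdio.h>

struct fake_entry {
	int				id;
	TAILQ_ENTRY(fake_entry)		link;	/* analogous to aio_workq_link */
};
TAILQ_HEAD(fake_queue, fake_entry);

static void
move_entry( struct fake_queue *from, struct fake_queue *to, struct fake_entry *ep )
{
	TAILQ_REMOVE( from, ep, link );
	TAILQ_INSERT_TAIL( to, ep, link );
}

int
main( void )
{
	struct fake_queue	workq, activeq, doneq;
	struct fake_entry	req = { 1 };

	TAILQ_INIT( &workq );
	TAILQ_INIT( &activeq );
	TAILQ_INIT( &doneq );

	TAILQ_INSERT_TAIL( &workq, &req, link );	/* queued by aio_read / aio_write */
	move_entry( &workq, &activeq, &req );		/* a worker thread picks it up */
	move_entry( &activeq, &doneq, &req );		/* the IO completes */
	printf( "request %d is on the done queue\n", TAILQ_FIRST( &doneq )->id );
	return 0;
}
#endif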
122
123/*
124 * Notes on aio sleep / wake channels.
125 * We currently pick a couple of fields within the proc structure to use as
126 * sleep channels that do not collide with any other kernel routines.
127 * At this time, for binary compatibility reasons, we cannot create new proc fields.
128 */
129#define AIO_SUSPEND_SLEEP_CHAN p_estcpu
130#define AIO_CLEANUP_SLEEP_CHAN p_pctcpu
131
132
133/*
134 * async IO locking macros used to protect critical sections.
135 */
136#define AIO_LOCK usimple_lock( &aio_lock )
137#define AIO_UNLOCK usimple_unlock( &aio_lock )
138
139
140/*
141 * LOCAL PROTOTYPES
142 */
143static int aio_active_requests_for_process( struct proc *procp );
144static boolean_t aio_delay_fsync_request( aio_workq_entry *entryp );
145static int aio_free_request( aio_workq_entry *entryp, vm_map_t the_map );
146static int aio_get_all_queues_count( void );
147static int aio_get_process_count( struct proc *procp );
148static aio_workq_entry * aio_get_some_work( void );
149static boolean_t aio_last_group_io( aio_workq_entry *entryp );
150static void aio_mark_requests( aio_workq_entry *entryp );
151static int aio_queue_async_request( struct proc *procp,
152 struct aiocb *aiocbp,
153 int kindOfIO );
154static int aio_validate( aio_workq_entry *entryp );
155static void aio_work_thread( void );
156static int do_aio_cancel( struct proc *p,
157 int fd,
158 struct aiocb *aiocbp,
159 boolean_t wait_for_completion,
160 boolean_t disable_notification );
161static void do_aio_completion( aio_workq_entry *entryp );
162static int do_aio_fsync( aio_workq_entry *entryp );
163static int do_aio_read( aio_workq_entry *entryp );
164static int do_aio_write( aio_workq_entry *entryp );
165static boolean_t is_already_queued( struct proc *procp,
166 struct aiocb *aiocbp );
167static int lio_create_async_entry( struct proc *procp,
168 struct aiocb *aiocbp,
169 struct sigevent *sigp,
170 long group_tag,
171 aio_workq_entry **entrypp );
172static int lio_create_sync_entry( struct proc *procp,
173 struct aiocb *aiocbp,
174 long group_tag,
175 aio_workq_entry **entrypp );
176
177/*
178 * EXTERNAL PROTOTYPES
179 */
180
181/* in ...bsd/kern/sys_generic.c */
182extern struct file* holdfp( struct filedesc* fdp, int fd, int flag );
183extern int dofileread( struct proc *p, struct file *fp, int fd,
184 void *buf, size_t nbyte, off_t offset,
185 int flags, int *retval );
186extern int dofilewrite( struct proc *p, struct file *fp, int fd,
187 const void *buf, size_t nbyte, off_t offset,
188 int flags, int *retval );
189extern vm_map_t vm_map_switch( vm_map_t map );
190
191
192/*
193 * aio external global variables.
194 */
195extern int aio_max_requests; /* AIO_MAX - configurable */
196extern int aio_max_requests_per_process; /* AIO_PROCESS_MAX - configurable */
197extern int aio_worker_threads; /* AIO_THREAD_COUNT - configurable */
198
199
200/*
201 * aio static variables.
202 */
203static aio_anchor_cb aio_anchor;
204static simple_lock_data_t aio_lock;
205static struct zone *aio_workq_zonep;
206
207
208/*
209 * syscall input parameters
210 */
211#ifndef _SYS_SYSPROTO_H_
212
213struct aio_cancel_args {
214 int fd;
215 struct aiocb *aiocbp;
216};
217
218struct aio_error_args {
219 struct aiocb *aiocbp;
220};
221
222struct aio_fsync_args {
223 int op;
224 struct aiocb *aiocbp;
225};
226
227struct aio_read_args {
228 struct aiocb *aiocbp;
229};
230
231struct aio_return_args {
232 struct aiocb *aiocbp;
233};
234
235struct aio_suspend_args {
236 struct aiocb *const *aiocblist;
237 int nent;
238 const struct timespec *timeoutp;
239};
240
241struct aio_write_args {
242 struct aiocb *aiocbp;
243};
244
245struct lio_listio_args {
246 int mode;
247 struct aiocb *const *aiocblist;
248 int nent;
249 struct sigevent *sigp;
250};
251
252#endif /* _SYS_SYSPROTO_H_ */
253
254
255/*
256 * aio_cancel - attempt to cancel one or more async IO requests currently
257 * outstanding against file descriptor uap->fd. If uap->aiocbp is not
258 * NULL then only one specific IO is cancelled (if possible). If uap->aiocbp
259 * is NULL then all outstanding async IO requests for the given file
260 * descriptor are cancelled (if possible).
261 */
262
263int
264aio_cancel( struct proc *p, struct aio_cancel_args *uap, int *retval )
265{
266 struct aiocb my_aiocb;
267 int result;
268 boolean_t funnel_state;
269
270 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START,
271 (int)p, (int)uap->aiocbp, 0, 0, 0 );
272
273 /* quick check to see if there are any async IO requests queued up */
274 AIO_LOCK;
275 result = aio_get_all_queues_count( );
276 AIO_UNLOCK;
277 if ( result < 1 ) {
278 result = EBADF;
279 goto ExitRoutine;
280 }
281
282 *retval = -1;
283 if ( uap->aiocbp != NULL ) {
284 result = copyin( uap->aiocbp, &my_aiocb, sizeof(my_aiocb) );
285 if ( result != 0 ) {
286 result = EAGAIN;
287 goto ExitRoutine;
288 }
289
290 /* NOTE - POSIX standard says a mismatch between the file */
291 /* descriptor passed in and the file descriptor embedded in */
292 /* the aiocb causes unspecified results. We return EBADF in */
293 /* that situation. */
294 if ( uap->fd != my_aiocb.aio_fildes ) {
295 result = EBADF;
296 goto ExitRoutine;
297 }
298 }
299
300 /* current BSD code assumes funnel lock is held */
301 funnel_state = thread_funnel_set( kernel_flock, TRUE );
302 result = do_aio_cancel( p, uap->fd, uap->aiocbp, FALSE, FALSE );
303 (void) thread_funnel_set( kernel_flock, funnel_state );
304
305 if ( result != -1 ) {
306 *retval = result;
307 result = 0;
308 goto ExitRoutine;
309 }
310
311 result = EBADF;
312
313ExitRoutine:
314 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END,
315 (int)p, (int)uap->aiocbp, result, 0, 0 );
316
317 return( result );
318
319} /* aio_cancel */
320
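/*
 * Illustrative user land usage of the aio_cancel() syscall implemented above.
 * A sketch only; it assumes a POSIX <aio.h> environment and a previously
 * submitted request "cb" on file descriptor "fd".
 */
#if 0
#include <aio.h>
#include <stdio.h>

static void
cancel_example( int fd, struct aiocb *cb )
{
	switch ( aio_cancel( fd, cb ) ) {
	case AIO_CANCELED:
		printf( "request was cancelled\n" );		/* errorval set to ECANCELED */
		break;
	case AIO_NOTCANCELED:
		printf( "request is active and cannot be cancelled\n" );
		break;
	case AIO_ALLDONE:
		printf( "request had already completed\n" );
		break;
	default:
		perror( "aio_cancel" );				/* e.g. EBADF on fd mismatch */
		break;
	}
}
#endif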
321
322/*
323 * _aio_close - internal function used to clean up async IO requests for
324 * a file descriptor that is closing.
325 * NOTE - kernel funnel lock is held when we get called.
326 * THIS MAY BLOCK.
327 */
328
329__private_extern__ void
330_aio_close( struct proc *p, int fd )
331{
332 int error, count;
333
334 /* quick check to see if there are any async IO requests queued up */
335 AIO_LOCK;
336 count = aio_get_all_queues_count( );
337 AIO_UNLOCK;
338 if ( count < 1 )
339 return;
340
341 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_START,
342 (int)p, fd, 0, 0, 0 );
343
344 /* cancel all async IO requests on our todo queues for this file descriptor */
345 error = do_aio_cancel( p, fd, NULL, TRUE, FALSE );
346 if ( error == AIO_NOTCANCELED ) {
347 /*
348 * AIO_NOTCANCELED is returned when we find an aio request for this process
349 * and file descriptor on the active async IO queue. Active requests cannot
350 * be cancelled so we must wait for them to complete. We will get a special
351 * wake up call on our channel used to sleep for ALL active requests to
352 * complete. This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
353 * when we must wait for all active aio requests.
354 */
355
356 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) | DBG_FUNC_NONE,
357 (int)p, fd, 0, 0, 0 );
358
359 tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_close", 0 );
360 }
361
362 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_END,
363 (int)p, fd, 0, 0, 0 );
364
365 return;
366
367} /* _aio_close */
368
369
370/*
371 * aio_error - return the error status associated with the async IO
372 * request referred to by uap->aiocbp. The error status is the errno
373 * value that would be set by the corresponding IO request (read, write,
374 * fdatasync, or fsync).
375 */
376
377int
378aio_error( struct proc *p, struct aio_error_args *uap, int *retval )
379{
380 aio_workq_entry *entryp;
381 int error;
382
383 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_START,
384 (int)p, (int)uap->aiocbp, 0, 0, 0 );
385
386 AIO_LOCK;
387
388 /* quick check to see if there are any async IO requests queued up */
389 if ( aio_get_all_queues_count( ) < 1 ) {
390 error = EINVAL;
391 goto ExitRoutine;
392 }
393
394 /* look for a match on our queue of async IO requests that have completed */
395 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
396 if ( entryp->uaiocbp == uap->aiocbp ) {
397 *retval = entryp->errorval;
398 error = 0;
399 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) | DBG_FUNC_NONE,
400 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
401 goto ExitRoutine;
402 }
403 }
404
405 /* look for a match on our queue of active async IO requests */
406 TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
407 if ( entryp->uaiocbp == uap->aiocbp ) {
408 *retval = EINPROGRESS;
409 error = 0;
410 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) | DBG_FUNC_NONE,
411 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
412 goto ExitRoutine;
413 }
414 }
415
416 /* look for a match on our queue of todo work */
417 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
418 if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
419 *retval = EINPROGRESS;
420 error = 0;
421 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_workq)) | DBG_FUNC_NONE,
422 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
423 goto ExitRoutine;
424 }
425 }
426 error = EINVAL;
427
428ExitRoutine:
429 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_END,
430 (int)p, (int)uap->aiocbp, error, 0, 0 );
431 AIO_UNLOCK;
432
433 return( error );
434
435} /* aio_error */
436
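/*
 * Illustrative user land polling loop built on aio_error() above. A sketch
 * only, assuming a POSIX <aio.h> environment and an already submitted "cb".
 */
#if 0
#include <aio.h>
#include <errno.h>
#include <unistd.h>

static int
wait_by_polling( struct aiocb *cb )
{
	int	err;

	/* EINPROGRESS is returned while the request sits on the work or active queue */
	while ( (err = aio_error( cb )) == EINPROGRESS )
		usleep( 1000 );

	return err;	/* 0 on success, otherwise the errno of the failed IO */
}
#endif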
437
438/*
439 * aio_fsync - asynchronously force all IO operations associated
440 * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
441 * queued at the time of the call to the synchronized completion state.
442 * NOTE - we do not support op O_DSYNC at this point since we do not support the
443 * fdatasync() call.
444 */
445
446int
447aio_fsync( struct proc *p, struct aio_fsync_args *uap, int *retval )
448{
449 int error;
450 int fsync_kind;
451
452 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_START,
453 (int)p, (int)uap->aiocbp, uap->op, 0, 0 );
454
455 *retval = 0;
456 if ( uap->op == O_SYNC )
457 fsync_kind = AIO_FSYNC;
458#if 0 // we don't support fdatasync() call yet
459 else if ( uap->op == O_DSYNC )
460 fsync_kind = AIO_DSYNC;
461#endif
462 else {
463 *retval = -1;
464 error = EINVAL;
465 goto ExitRoutine;
466 }
467
468 error = aio_queue_async_request( p, uap->aiocbp, fsync_kind );
469 if ( error != 0 )
470 *retval = -1;
471
472ExitRoutine:
473 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_END,
474 (int)p, (int)uap->aiocbp, error, 0, 0 );
475
476 return( error );
477
478} /* aio_fsync */
479
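/*
 * Illustrative user land call of aio_fsync() above. Only O_SYNC is accepted
 * by this implementation; O_DSYNC is rejected with EINVAL because fdatasync()
 * is not supported. A sketch only, assuming a POSIX <aio.h> environment.
 */
#if 0
#include <aio.h>
#include <fcntl.h>
#include <string.h>

static int
queue_fsync( int fd, struct aiocb *cb )
{
	memset( cb, 0, sizeof(*cb) );
	cb->aio_fildes = fd;
	return aio_fsync( O_SYNC, cb );	/* 0 if queued, -1 with errno set otherwise */
}
#endif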
480
481/* aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
482 * file descriptor (uap->aiocbp->aio_fildes) into the buffer
483 * (uap->aiocbp->aio_buf).
484 */
485
486int
487aio_read( struct proc *p, struct aio_read_args *uap, int *retval )
488{
489 int error;
490
491 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_START,
492 (int)p, (int)uap->aiocbp, 0, 0, 0 );
493
494 *retval = 0;
495
496 error = aio_queue_async_request( p, uap->aiocbp, AIO_READ );
497 if ( error != 0 )
498 *retval = -1;
499
500 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_END,
501 (int)p, (int)uap->aiocbp, error, 0, 0 );
502
503 return( error );
504
505} /* aio_read */
506
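/*
 * Illustrative user land submission of an async read through aio_read() above.
 * A sketch only, assuming a POSIX <aio.h> environment; the buffer must stay
 * valid until the request is reaped with aio_return().
 */
#if 0
#include <aio.h>
#include <sys/types.h>
#include <string.h>

static int
submit_read( int fd, void *buf, size_t len, off_t offset, struct aiocb *cb )
{
	memset( cb, 0, sizeof(*cb) );
	cb->aio_fildes = fd;
	cb->aio_buf    = buf;
	cb->aio_nbytes = len;
	cb->aio_offset = offset;
	return aio_read( cb );	/* 0 if queued, -1 with errno set otherwise */
}
#endif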
507
508/*
509 * aio_return - return the return status associated with the async IO
510 * request referred to by uap->aiocbp. The return status is the value
511 * that would be returned by the corresponding IO request (read, write,
512 * fdatasync, or fsync). This is where we release kernel resources
513 * held for async IO call associated with the given aiocb pointer.
514 */
515
516int
517aio_return( struct proc *p, struct aio_return_args *uap, register_t *retval )
518{
519 aio_workq_entry *entryp;
520 int error;
521 boolean_t lock_held;
522
523 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_START,
524 (int)p, (int)uap->aiocbp, 0, 0, 0 );
525
526 AIO_LOCK;
527 lock_held = TRUE;
528 *retval = 0;
529
530 /* quick check to see if there are any async IO requests queued up */
531 if ( aio_get_all_queues_count( ) < 1 ) {
532 error = EINVAL;
533 goto ExitRoutine;
534 }
535
536 /* look for a match on our queue of async IO requests that have completed */
537 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
538 if ( entryp->uaiocbp == uap->aiocbp ) {
539 TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
540 aio_anchor.aio_done_count--;
541 p->aio_done_count--;
542
543 *retval = entryp->returnval;
544
545 /* we cannot free requests that are still completing */
546 if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
547 vm_map_t my_map;
548
549 my_map = entryp->aio_map;
550 entryp->aio_map = VM_MAP_NULL;
551 AIO_UNLOCK;
552 lock_held = FALSE;
553 aio_free_request( entryp, my_map );
554 }
555 else
556 /* tell completion code to free this request */
557 entryp->flags |= AIO_DO_FREE;
558 error = 0;
559 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) | DBG_FUNC_NONE,
560 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
561 goto ExitRoutine;
562 }
563 }
564
565 /* look for a match on our queue of active async IO requests */
566 TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
567 if ( entryp->uaiocbp == uap->aiocbp ) {
568 error = EINPROGRESS;
569 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) | DBG_FUNC_NONE,
570 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
571 goto ExitRoutine;
572 }
573 }
574
575 /* look for a match on our queue of todo work */
576 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
577 if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
578 error = EINPROGRESS;
579 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_workq)) | DBG_FUNC_NONE,
580 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
581 goto ExitRoutine;
582 }
583 }
584 error = EINVAL;
585
586ExitRoutine:
587 if ( lock_held )
588 AIO_UNLOCK;
589 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_END,
590 (int)p, (int)uap->aiocbp, error, 0, 0 );
591
592 return( error );
593
594} /* aio_return */
595
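/*
 * Illustrative user land reap sequence: poll with aio_error(), then release
 * the kernel resources with aio_return() exactly once. A sketch only,
 * assuming a POSIX <aio.h> environment.
 */
#if 0
#include <aio.h>
#include <errno.h>
#include <unistd.h>

static ssize_t
reap_request( struct aiocb *cb )
{
	while ( aio_error( cb ) == EINPROGRESS )
		usleep( 1000 );

	/* removes the entry from aio_doneq and frees the aio_workq_entry */
	return aio_return( cb );
}
#endif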
596
597/*
598 * _aio_exec - internal function used to clean up async IO requests for
599 * a process that is going away due to exec(). We cancel any async IOs
600 * we can and wait for those already active. We also disable signaling
601 * for cancelled or active aio requests that complete.
602 * NOTE - kernel funnel lock is held when we get called.
603 * This routine MAY block!
604 */
605
606__private_extern__ void
607_aio_exec( struct proc *p )
608{
609
610 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_START,
611 (int)p, 0, 0, 0, 0 );
612
613 _aio_exit( p );
614
615 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_END,
616 (int)p, 0, 0, 0, 0 );
617
618 return;
619
620} /* _aio_exec */
621
622
623/*
624 * _aio_exit - internal function used to clean up async IO requests for
625 * a process that is terminating (via exit() or exec() ). We cancel any async IOs
626 * we can and wait for those already active. We also disable signaling
627 * for cancelled or active aio requests that complete. This routine MAY block!
628 * NOTE - kernel funnel lock is held when we get called.
629 */
630
631__private_extern__ void
632_aio_exit( struct proc *p )
633{
634 int error, count;
635 aio_workq_entry *entryp;
636
637 /* quick check to see if there are any async IO requests queued up */
638 AIO_LOCK;
639 count = aio_get_all_queues_count( );
640 AIO_UNLOCK;
641 if ( count < 1 ) {
642 return;
643 }
644
645 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_START,
646 (int)p, 0, 0, 0, 0 );
647
648 /*
649 * cancel async IO requests on the todo work queue and wait for those
650 * already active to complete.
651 */
652 error = do_aio_cancel( p, 0, NULL, TRUE, TRUE );
653 if ( error == AIO_NOTCANCELED ) {
654 /*
655 * AIO_NOTCANCELED is returned when we find an aio request for this process
656 * on the active async IO queue. Active requests cannot be cancelled so we
657 * must wait for them to complete. We will get a special wake up call on
658 * our channel used to sleep for ALL active requests to complete. This sleep
659 * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
660 * active aio requests.
661 */
662
663 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) | DBG_FUNC_NONE,
664 (int)p, 0, 0, 0, 0 );
665
666 tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_exit", 0 );
667 }
668
669 /* release all aio resources used by this process */
670 AIO_LOCK;
671 entryp = TAILQ_FIRST( &p->aio_doneq );
672 while ( entryp != NULL ) {
673 aio_workq_entry *next_entryp;
674
675 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
676 TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
677 aio_anchor.aio_done_count--;
678 p->aio_done_count--;
679
680 /* we cannot free requests that are still completing */
681 if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
682 vm_map_t my_map;
683
684 my_map = entryp->aio_map;
685 entryp->aio_map = VM_MAP_NULL;
686 AIO_UNLOCK;
687 aio_free_request( entryp, my_map );
688
689 /* need to start over since aio_doneq may have been */
690 /* changed while we were away. */
691 AIO_LOCK;
692 entryp = TAILQ_FIRST( &p->aio_doneq );
693 continue;
694 }
695 else
696 /* tell completion code to free this request */
697 entryp->flags |= AIO_DO_FREE;
698 entryp = next_entryp;
699 }
700 AIO_UNLOCK;
701
702ExitRoutine:
703 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END,
704 (int)p, 0, 0, 0, 0 );
705
706 return;
707
708} /* _aio_exit */
709
710
711/*
712 * do_aio_cancel - cancel async IO requests (if possible). We get called by
713 * aio_cancel, close, and at exit.
714 * There are three modes of operation: 1) cancel all async IOs for a process
715 * (fd is 0 and aiocbp is NULL); 2) cancel all async IOs for a file descriptor
716 * (fd is > 0 and aiocbp is NULL); 3) cancel the one async IO associated with
717 * the given aiocbp.
718 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
719 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
720 * target async IO requests, and AIO_ALLDONE if all target async IO requests
721 * were already complete.
722 * WARNING - do not dereference aiocbp in this routine; it may point to user
723 * land data that has not been copied in (when called from aio_cancel()).
724 * NOTE - kernel funnel lock is held when we get called.
725 */
726
727static int
728do_aio_cancel( struct proc *p, int fd, struct aiocb *aiocbp,
729 boolean_t wait_for_completion, boolean_t disable_notification )
730{
731 aio_workq_entry *entryp;
732 int result;
733
734 result = -1;
735
736 /* look for a match on our queue of async todo work. */
737 AIO_LOCK;
738 entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
739 while ( entryp != NULL ) {
740 aio_workq_entry *next_entryp;
741
742 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
743 if ( p == entryp->procp ) {
744 if ( (aiocbp == NULL && fd == 0) ||
745 (aiocbp != NULL && entryp->uaiocbp == aiocbp) ||
746 (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) {
747 /* we found a match so we remove the entry from the */
748 /* todo work queue and place it on the done queue */
749 TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
750 aio_anchor.aio_async_workq_count--;
751 entryp->errorval = ECANCELED;
752 entryp->returnval = -1;
753 if ( disable_notification )
754 entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
755 result = AIO_CANCELED;
756
757 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) | DBG_FUNC_NONE,
758 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
759
760 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
761 aio_anchor.aio_done_count++;
762 p->aio_done_count++;
763 entryp->flags |= AIO_COMPLETION;
764 AIO_UNLOCK;
765
766 /* do completion processing for this request */
767 do_aio_completion( entryp );
768
769 AIO_LOCK;
770 entryp->flags &= ~AIO_COMPLETION;
771 if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
772 vm_map_t my_map;
773
774 my_map = entryp->aio_map;
775 entryp->aio_map = VM_MAP_NULL;
776 AIO_UNLOCK;
777 aio_free_request( entryp, my_map );
778 }
779 else
780 AIO_UNLOCK;
781
782 if ( aiocbp != NULL ) {
783 return( result );
784 }
785
786 /* need to start over since aio_async_workq may have been */
787 /* changed while we were away doing completion processing. */
788 AIO_LOCK;
789 entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
790 continue;
791 }
792 }
793 entryp = next_entryp;
794 } /* while... */
795
796 /*
797 * look for a match on our queue of synchronous todo work. This will
798 * be a rare occurrence but could happen if a process is terminated while
799 * processing a lio_listio call.
800 */
801 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
802 while ( entryp != NULL ) {
803 aio_workq_entry *next_entryp;
804
805 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
806 if ( p == entryp->procp ) {
807 if ( (aiocbp == NULL && fd == 0) ||
808 (aiocbp != NULL && entryp->uaiocbp == aiocbp) ||
809 (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) {
810 /* we found a match so we remove the entry from the */
811 /* todo work queue and place it on the done queue */
812 TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
813 aio_anchor.lio_sync_workq_count--;
814 entryp->errorval = ECANCELED;
815 entryp->returnval = -1;
816 if ( disable_notification )
817 entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
818 result = AIO_CANCELED;
819
820 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_sync_workq)) | DBG_FUNC_NONE,
821 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
822
823 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
824 aio_anchor.aio_done_count++;
825 p->aio_done_count++;
826 if ( aiocbp != NULL ) {
827 AIO_UNLOCK;
828 return( result );
829 }
830 }
831 }
832 entryp = next_entryp;
833 } /* while... */
834
835 /*
836 * look for a match on our queue of active async IO requests and
837 * return AIO_NOTCANCELED result.
838 */
839 TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
840 if ( (aiocbp == NULL && fd == 0) ||
841 (aiocbp != NULL && entryp->uaiocbp == aiocbp) ||
842 (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) {
843 result = AIO_NOTCANCELED;
844
845 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE,
846 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
847
848 if ( wait_for_completion )
849 entryp->flags |= AIO_WAITING; /* flag for special completion processing */
850 if ( disable_notification )
851 entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
852 if ( aiocbp != NULL ) {
853 AIO_UNLOCK;
854 return( result );
855 }
856 }
857 }
858
859 /*
860 * if we didn't find any matches on the todo or active queues then look for a
861 * match on our queue of async IO requests that have completed and if found
862 * return AIO_ALLDONE result.
863 */
864 if ( result == -1 ) {
865 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
866 if ( (aiocbp == NULL && fd == 0) ||
867 (aiocbp != NULL && entryp->uaiocbp == aiocbp) ||
868 (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) {
869 result = AIO_ALLDONE;
870
871 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE,
872 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
873
874 if ( aiocbp != NULL ) {
875 AIO_UNLOCK;
876 return( result );
877 }
878 }
879 }
880 }
881 AIO_UNLOCK;
882
883 return( result );
884
885} /* do_aio_cancel */
886
887
888/*
889 * aio_suspend - suspend the calling thread until at least one of the async
890 * IO operations referenced by uap->aiocblist has completed, until a signal
891 * interrupts the function, or uap->timeoutp time interval (optional) has
892 * passed.
893 * Returns 0 if one or more async IOs have completed else -1 and errno is
894 * set appropriately - EAGAIN if timeout elapses or EINTR if an interrupt
895 * woke us up.
896 */
897
898int
899aio_suspend( struct proc *p, struct aio_suspend_args *uap, int *retval )
900{
901 int error;
902 int i, count;
903 uint64_t abstime;
904 struct timespec ts;
905 struct timeval tv;
906 aio_workq_entry *entryp;
907 struct aiocb * *aiocbpp;
908
909 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START,
910 (int)p, uap->nent, 0, 0, 0 );
911
912 *retval = -1;
913 abstime = 0;
914 aiocbpp = NULL;
915
916 /* quick check to see if there are any async IO requests queued up */
917 AIO_LOCK;
918 count = aio_get_all_queues_count( );
919 AIO_UNLOCK;
920 if ( count < 1 ) {
921 error = EINVAL;
922 goto ExitThisRoutine;
923 }
924
925 if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
926 error = EINVAL;
927 goto ExitThisRoutine;
928 }
929
930 if ( uap->timeoutp != NULL ) {
931 error = copyin( (void *)uap->timeoutp, &ts, sizeof(ts) );
932 if ( error != 0 ) {
933 error = EAGAIN;
934 goto ExitThisRoutine;
935 }
936
937 if ( ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000 ) {
938 error = EINVAL;
939 goto ExitThisRoutine;
940 }
941
942 nanoseconds_to_absolutetime( (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
943 &abstime );
944 clock_absolutetime_interval_to_deadline( abstime, &abstime );
945 }
946
947 MALLOC( aiocbpp, void *, (uap->nent * sizeof(struct aiocb *)), M_TEMP, M_WAITOK );
948 if ( aiocbpp == NULL ) {
949 error = EAGAIN;
950 goto ExitThisRoutine;
951 }
952
953 /* check list of aio requests to see if any have completed */
954 for ( i = 0; i < uap->nent; i++ ) {
955 struct aiocb *aiocbp;
956
957 /* copy in aiocb pointer from list */
958 error = copyin( (void *)(uap->aiocblist + i), (aiocbpp + i), sizeof(aiocbp) );
959 if ( error != 0 ) {
960 error = EAGAIN;
961 goto ExitThisRoutine;
962 }
963
964 /* NULL elements are legal so check for 'em */
965 aiocbp = *(aiocbpp + i);
966 if ( aiocbp == NULL )
967 continue;
968
969 /* return immediately if any aio request in the list is done */
970 AIO_LOCK;
971 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
972 if ( entryp->uaiocbp == aiocbp ) {
973 *retval = 0;
974 error = 0;
975 AIO_UNLOCK;
976 goto ExitThisRoutine;
977 }
978 }
979 AIO_UNLOCK;
980 } /* for ( ; i < uap->nent; ) */
981
982 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE,
983 (int)p, uap->nent, 0, 0, 0 );
984
985 /*
986 * wait for an async IO to complete or a signal fires or timeout expires.
987 * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
988 * interrupts us. If an async IO completes before a signal fires or our
989 * timeout expires, we get a wakeup call from aio_work_thread(). We do not
990 * use tsleep() here in order to avoid getting kernel funnel lock.
991 */
992 assert_wait( (event_t) &p->AIO_SUSPEND_SLEEP_CHAN, THREAD_ABORTSAFE );
993 if ( abstime > 0 ) {
994 thread_set_timer_deadline( abstime );
995 }
996 error = thread_block( THREAD_CONTINUE_NULL );
997 if ( error == THREAD_AWAKENED ) {
998 /* got our wakeup call from aio_work_thread() */
999 if ( abstime > 0 ) {
1000 thread_cancel_timer();
1001 }
1002 *retval = 0;
1003 error = 0;
1004 }
1005 else if ( error == THREAD_TIMED_OUT ) {
1006 /* our timeout expired */
1007 error = EAGAIN;
1008 }
1009 else {
1010 /* we were interrupted */
1011 if ( abstime > 0 ) {
1012 thread_cancel_timer();
1013 }
1014 error = EINTR;
1015 }
1016
1017ExitThisRoutine:
1018 if ( aiocbpp != NULL )
1019 FREE( aiocbpp, M_TEMP );
1020
1021 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_END,
1022 (int)p, uap->nent, error, 0, 0 );
1023
1024 return( error );
1025
1026} /* aio_suspend */
1027
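/*
 * Illustrative user land wait using aio_suspend() above with a timeout. A
 * sketch only, assuming a POSIX <aio.h> environment; NULL list entries are
 * legal and simply skipped by the kernel.
 */
#if 0
#include <aio.h>
#include <errno.h>
#include <stdio.h>
#include <time.h>

static int
wait_up_to_one_second( struct aiocb *cb )
{
	const struct aiocb	*list[1] = { cb };
	struct timespec		ts = { 1, 0 };	/* one second */

	if ( aio_suspend( list, 1, &ts ) == 0 )
		return 0;			/* at least one request completed */
	if ( errno == EAGAIN )
		printf( "timed out\n" );
	else if ( errno == EINTR )
		printf( "interrupted by a signal\n" );
	return -1;
}
#endif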
1028
1029/* aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
1030 * file descriptor (uap->aiocbp->aio_fildes) from the buffer
1031 * (uap->aiocbp->aio_buf).
1032 */
1033
1034int
1035aio_write( struct proc *p, struct aio_write_args *uap, int *retval )
1036{
1037 int error;
1038
1039 *retval = 0;
1040
1041 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_START,
1042 (int)p, (int)uap->aiocbp, 0, 0, 0 );
1043
1044 error = aio_queue_async_request( p, uap->aiocbp, AIO_WRITE );
1045 if ( error != 0 )
1046 *retval = -1;
1047
1048 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_END,
1049 (int)p, (int)uap->aiocbp, error, 0, 0 );
1050
1051 return( error );
1052
1053} /* aio_write */
1054
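/*
 * Illustrative user land submission of an async write through aio_write()
 * above, with SIGEV_SIGNAL completion notification. A sketch only, assuming
 * a POSIX <aio.h> environment and that SIGUSR1 has a handler installed.
 */
#if 0
#include <aio.h>
#include <sys/types.h>
#include <signal.h>
#include <string.h>

static int
submit_write( int fd, const void *buf, size_t len, off_t offset, struct aiocb *cb )
{
	memset( cb, 0, sizeof(*cb) );
	cb->aio_fildes = fd;
	cb->aio_buf    = (void *)buf;
	cb->aio_nbytes = len;
	cb->aio_offset = offset;
	cb->aio_sigevent.sigev_notify = SIGEV_SIGNAL;	/* see aio_validate() */
	cb->aio_sigevent.sigev_signo  = SIGUSR1;
	return aio_write( cb );	/* 0 if queued, -1 with errno set otherwise */
}
#endif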
1055
1056/*
1057 * lio_listio - initiate a list of IO requests. We process the list of aiocbs
1058 * either synchronously (mode == LIO_WAIT) or asynchronously (mode == LIO_NOWAIT).
1059 * The caller gets error and return status for each aiocb in the list via aio_error
1060 * and aio_return. We must keep completed requests until released by the
1061 * aio_return call.
1062 */
1063
1064int
1065lio_listio( struct proc *p, struct lio_listio_args *uap, int *retval )
1066{
1067 int i;
1068 int call_result;
1069 int result;
1070 long group_tag;
1071 aio_workq_entry * *entryp_listp;
1072
1073 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START,
1074 (int)p, uap->nent, uap->mode, 0, 0 );
1075
1076 entryp_listp = NULL;
1077 call_result = -1;
1078 *retval = -1;
1079 if ( !(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT) ) {
1080 call_result = EINVAL;
1081 goto ExitRoutine;
1082 }
1083
1084 if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
1085 call_result = EINVAL;
1086 goto ExitRoutine;
1087 }
1088
1089 /*
1090 * we use group_tag to mark IO requests for delayed completion processing
1091 * which means we wait until all IO requests in the group have completed
1092 * before we either return to the caller when mode is LIO_WAIT or signal
1093 * user when mode is LIO_NOWAIT.
1094 */
1095 group_tag = random();
1096
1097 /*
1098 * allocate a list of aio_workq_entry pointers that we will use to queue
1099 * up all our requests at once while holding our lock.
1100 */
1101 MALLOC( entryp_listp, void *, (uap->nent * sizeof(struct aiocb *)), M_TEMP, M_WAITOK );
1102 if ( entryp_listp == NULL ) {
1103 call_result = EAGAIN;
1104 goto ExitRoutine;
1105 }
1106
1107 /* process list of aio requests */
1108 for ( i = 0; i < uap->nent; i++ ) {
1109 struct aiocb *my_aiocbp;
1110
1111 *(entryp_listp + i) = NULL;
1112
1113 /* copy in aiocb pointer from list */
1114 result = copyin( (void *)(uap->aiocblist + i), &my_aiocbp, sizeof(my_aiocbp) );
1115 if ( result != 0 ) {
1116 call_result = EAGAIN;
1117 continue;
1118 }
1119
1120 /* NULL elements are legal so check for 'em */
1121 if ( my_aiocbp == NULL )
1122 continue;
1123
1124 if ( uap->mode == LIO_NOWAIT )
1125 result = lio_create_async_entry( p, my_aiocbp, uap->sigp,
1126 group_tag, (entryp_listp + i) );
1127 else
1128 result = lio_create_sync_entry( p, my_aiocbp, group_tag,
1129 (entryp_listp + i) );
1130
1131 if ( result != 0 && call_result == -1 )
1132 call_result = result;
1133 }
1134
1135 /*
1136 * we need to protect this section since we do not want any of these grouped
1137 * IO requests to begin until we have them all on the queue.
1138 */
1139 AIO_LOCK;
1140 for ( i = 0; i < uap->nent; i++ ) {
1141 aio_workq_entry *entryp;
1142
1143 /* NULL elements are legal so check for 'em */
1144 entryp = *(entryp_listp + i);
1145 if ( entryp == NULL )
1146 continue;
1147
1148 /* check our aio limits to throttle bad or rude user land behavior */
1149 if ( aio_get_all_queues_count( ) >= aio_max_requests ||
1150 aio_get_process_count( entryp->procp ) >= aio_max_requests_per_process ||
1151 is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
1152 vm_map_t my_map;
1153
1154 my_map = entryp->aio_map;
1155 entryp->aio_map = VM_MAP_NULL;
1156 result = EAGAIN;
1157 AIO_UNLOCK;
1158 aio_free_request( entryp, my_map );
1159 AIO_LOCK;
1160 continue;
1161 }
1162
1163 /* place the request on the appropriate queue */
1164 if ( uap->mode == LIO_NOWAIT ) {
1165 TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1166 aio_anchor.aio_async_workq_count++;
1167
1168 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
1169 (int)p, (int)entryp->uaiocbp, 0, 0, 0 );
1170 }
1171 else {
1172 TAILQ_INSERT_TAIL( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
1173 aio_anchor.lio_sync_workq_count++;
1174 }
1175 }
1176 AIO_UNLOCK;
1177
1178 if ( uap->mode == LIO_NOWAIT )
1179 /* caller does not want to wait so we'll fire off a worker thread and return */
1180 wakeup_one( &aio_anchor.aio_async_workq );
1181 else {
1182 aio_workq_entry *entryp;
1183 int error;
1184
1185 /*
1186 * mode is LIO_WAIT - handle the IO requests now.
1187 */
1188 AIO_LOCK;
1189 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
1190 while ( entryp != NULL ) {
1191 if ( p == entryp->procp && group_tag == entryp->group_tag ) {
1192 boolean_t funnel_state;
1193
1194 TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
1195 aio_anchor.lio_sync_workq_count--;
1196 AIO_UNLOCK;
1197
1198 // file system IO code path requires kernel funnel lock
1199 funnel_state = thread_funnel_set( kernel_flock, TRUE );
1200 if ( (entryp->flags & AIO_READ) != 0 ) {
1201 error = do_aio_read( entryp );
1202 }
1203 else if ( (entryp->flags & AIO_WRITE) != 0 ) {
1204 error = do_aio_write( entryp );
1205 }
1206 else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1207 error = do_aio_fsync( entryp );
1208 }
1209 else {
1210 printf( "%s - unknown aio request - flags 0x%02X \n",
1211 __FUNCTION__, entryp->flags );
1212 error = EINVAL;
1213 }
1214 entryp->errorval = error;
1215 if ( error != 0 && call_result == -1 )
1216 call_result = EIO;
1217 (void) thread_funnel_set( kernel_flock, funnel_state );
1218
1219 AIO_LOCK;
1220 /* we're done with the IO request so move it on the done queue */
1221 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
1222 aio_anchor.aio_done_count++;
1223 p->aio_done_count++;
1224
1225 /* need to start over since lio_sync_workq may have been changed while we */
1226 /* were away doing the IO. */
1227 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
1228 continue;
1229 } /* p == entryp->procp */
1230
1231 entryp = TAILQ_NEXT( entryp, aio_workq_link );
1232 } /* while ( entryp != NULL ) */
1233 AIO_UNLOCK;
1234 } /* uap->mode == LIO_WAIT */
1235
1236 /* call_result == -1 means we had no trouble queueing up requests */
1237 if ( call_result == -1 ) {
1238 call_result = 0;
1239 *retval = 0;
1240 }
1241
1242ExitRoutine:
1243 if ( entryp_listp != NULL )
1244 FREE( entryp_listp, M_TEMP );
1245
1246 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END,
1247 (int)p, call_result, 0, 0, 0 );
1248
1249 return( call_result );
1250
1251} /* lio_listio */
1252
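/*
 * Illustrative user land lio_listio() call in LIO_WAIT mode, matching the
 * synchronous path above. A sketch only, assuming a POSIX <aio.h> environment
 * with two aiocbs already filled in (one LIO_READ, one LIO_WRITE).
 */
#if 0
#include <aio.h>
#include <stdio.h>

static int
submit_pair_and_wait( struct aiocb *read_cb, struct aiocb *write_cb )
{
	struct aiocb	*list[2];

	read_cb->aio_lio_opcode  = LIO_READ;
	write_cb->aio_lio_opcode = LIO_WRITE;
	list[0] = read_cb;
	list[1] = write_cb;

	/* returns when both requests are on the done queue; each result is */
	/* still reaped individually via aio_error() / aio_return(). */
	if ( lio_listio( LIO_WAIT, list, 2, NULL ) != 0 ) {
		perror( "lio_listio" );
		return -1;
	}
	return 0;
}
#endif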
1253
1254/*
1255 * aio worker thread. this is where all the real work gets done.
1256 * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
1257 * after new work is queued up.
1258 */
1259
1260static void
1261aio_work_thread( void )
1262{
1263 aio_workq_entry *entryp;
1264 struct uthread *uthread = (struct uthread *)get_bsdthread_info(current_act());
1265
1266 for( ;; ) {
1267 entryp = aio_get_some_work();
1268 if ( entryp == NULL ) {
1269 /*
1270 * aio worker threads wait for some work to get queued up
1271 * by aio_queue_async_request. Once some work gets queued
1272 * it will wake up one of these worker threads just before
1273 * returning to our caller in user land. We do not use
1274 * tsleep() here in order to avoid getting kernel funnel lock.
1275 */
1276 assert_wait( (event_t) &aio_anchor.aio_async_workq, THREAD_UNINT );
1277 thread_block( THREAD_CONTINUE_NULL );
1278
1279 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_wake)) | DBG_FUNC_NONE,
1280 0, 0, 0, 0, 0 );
1281 }
1282 else {
1283 int error;
1284 boolean_t funnel_state;
1285 vm_map_t currentmap;
1286 vm_map_t oldmap = VM_MAP_NULL;
1287 task_t oldaiotask = TASK_NULL;
1288
1289 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START,
1290 (int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0 );
1291
1292 /*
1293 * Assume the target's address space identity for the duration
1294 * of the IO.
1295 */
1296 funnel_state = thread_funnel_set( kernel_flock, TRUE );
1297
1298 currentmap = get_task_map( (current_proc())->task );
1299 if ( currentmap != entryp->aio_map ) {
1300 oldaiotask = uthread->uu_aio_task;
1301 uthread->uu_aio_task = entryp->procp->task;
1302 oldmap = vm_map_switch( entryp->aio_map );
1303 }
1304
1305 if ( (entryp->flags & AIO_READ) != 0 ) {
1306 error = do_aio_read( entryp );
1307 }
1308 else if ( (entryp->flags & AIO_WRITE) != 0 ) {
1309 error = do_aio_write( entryp );
1310 }
1311 else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1312 error = do_aio_fsync( entryp );
1313 }
1314 else {
1315 printf( "%s - unknown aio request - flags 0x%02X \n",
1316 __FUNCTION__, entryp->flags );
1317 error = EINVAL;
1318 }
1319 entryp->errorval = error;
1320 if ( currentmap != entryp->aio_map ) {
1321 (void) vm_map_switch( oldmap );
1322 uthread->uu_aio_task = oldaiotask;
1323 }
1324
1325 /* we're done with the IO request so pop it off the active queue and */
1326 /* push it on the done queue */
1327 AIO_LOCK;
1328 TAILQ_REMOVE( &entryp->procp->aio_activeq, entryp, aio_workq_link );
1329 aio_anchor.aio_active_count--;
1330 entryp->procp->aio_active_count--;
1331 TAILQ_INSERT_TAIL( &entryp->procp->aio_doneq, entryp, aio_workq_link );
1332 aio_anchor.aio_done_count++;
1333 entryp->procp->aio_done_count++;
1334 entryp->flags |= AIO_COMPLETION;
1335
1336 /* remove our reference to the user land map. */
1337 if ( VM_MAP_NULL != entryp->aio_map ) {
1338 vm_map_t my_map;
1339
1340 my_map = entryp->aio_map;
1341 entryp->aio_map = VM_MAP_NULL;
1342 AIO_UNLOCK; /* must unlock before calling vm_map_deallocate() */
1343 vm_map_deallocate( my_map );
1344 }
1345 else {
1346 AIO_UNLOCK;
1347 }
1348
1349 do_aio_completion( entryp );
1350 (void) thread_funnel_set( kernel_flock, funnel_state );
1351
1352 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END,
1353 (int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval,
1354 entryp->returnval, 0 );
1355
1356 AIO_LOCK;
1357 entryp->flags &= ~AIO_COMPLETION;
1358 if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
1359 vm_map_t my_map;
1360
1361 my_map = entryp->aio_map;
1362 entryp->aio_map = VM_MAP_NULL;
1363 AIO_UNLOCK;
1364 aio_free_request( entryp, my_map );
1365 }
1366 else
1367 AIO_UNLOCK;
1368 }
1369 } /* for ( ;; ) */
1370
1371 /* NOT REACHED */
1372
1373} /* aio_work_thread */
1374
1375
1376/*
1377 * aio_get_some_work - get the next async IO request that is ready to be executed.
1378 * aio_fsync complicates matters a bit since we cannot do the fsync until all async
1379 * IO requests at the time the aio_fsync call came in have completed.
1380 */
1381
1382static aio_workq_entry *
1383aio_get_some_work( void )
1384{
1385 aio_workq_entry *entryp;
1386 int skip_count = 0;
1387
1388 /* pop some work off the work queue and add to our active queue */
1389 AIO_LOCK;
1390 for ( entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
1391 entryp != NULL;
1392 entryp = TAILQ_NEXT( entryp, aio_workq_link ) ) {
1393
1394 if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1395 /* leave aio_fsync calls on the work queue if there are IO */
1396 /* requests on the active queue for the same file descriptor. */
1397 if ( aio_delay_fsync_request( entryp ) ) {
1398
1399 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) | DBG_FUNC_NONE,
1400 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1401 continue;
1402 }
1403 }
1404 break;
1405 }
1406
1407 if ( entryp != NULL ) {
1408 TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1409 aio_anchor.aio_async_workq_count--;
1410 TAILQ_INSERT_TAIL( &entryp->procp->aio_activeq, entryp, aio_workq_link );
1411 aio_anchor.aio_active_count++;
1412 entryp->procp->aio_active_count++;
1413 }
1414 AIO_UNLOCK;
1415
1416 return( entryp );
1417
1418} /* aio_get_some_work */
1419
1420
1421/*
1422 * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed at
1423 * this time. Delay will happen when there are any active IOs for the same file
1424 * descriptor that were queued at the time the aio_fsync call was queued.
1425 * NOTE - AIO_LOCK must be held by caller
1426 */
1427static boolean_t
1428aio_delay_fsync_request( aio_workq_entry *entryp )
1429{
1430 aio_workq_entry *my_entryp;
1431
1432 TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1433 if ( my_entryp->fsyncp != NULL &&
1434 entryp->uaiocbp == my_entryp->fsyncp &&
1435 entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1436 return( TRUE );
1437 }
1438 }
1439
1440 return( FALSE );
1441
1442} /* aio_delay_fsync_request */
1443
1444
1445/*
1446 * aio_queue_async_request - queue up an async IO request on our work queue then
1447 * wake up one of our worker threads to do the actual work. We get a reference
1448 * to our caller's user land map in order to keep it around while we are
1449 * processing the request.
1450 */
1451
1452static int
1453aio_queue_async_request( struct proc *procp, struct aiocb *aiocbp, int kindOfIO )
1454{
1455 aio_workq_entry *entryp;
1456 int result;
1457
1458 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1459 if ( entryp == NULL ) {
1460 result = EAGAIN;
1461 goto error_exit;
1462 }
1463 bzero( entryp, sizeof(*entryp) );
1464
1465 /* fill in the rest of the aio_workq_entry */
1466 entryp->procp = procp;
1467 entryp->uaiocbp = aiocbp;
1468 entryp->flags |= kindOfIO;
1469 entryp->aio_map = VM_MAP_NULL;
1470 result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1471 if ( result != 0 ) {
1472 result = EAGAIN;
1473 goto error_exit;
1474 }
1475
1476 /* do some more validation on the aiocb and embedded file descriptor */
1477 result = aio_validate( entryp );
1478 if ( result != 0 )
1479 goto error_exit;
1480
1481 /* get a reference to the user land map in order to keep it around */
1482 entryp->aio_map = get_task_map( procp->task );
1483 vm_map_reference( entryp->aio_map );
1484
1485 AIO_LOCK;
1486
1487 if ( is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
1488 AIO_UNLOCK;
1489 result = EAGAIN;
1490 goto error_exit;
1491 }
1492
1493 /* check our aio limits to throttle bad or rude user land behavior */
1494 if ( aio_get_all_queues_count( ) >= aio_max_requests ||
1495 aio_get_process_count( procp ) >= aio_max_requests_per_process ) {
1496 AIO_UNLOCK;
1497 result = EAGAIN;
1498 goto error_exit;
1499 }
1500
1501 /*
1502 * aio_fsync calls sync up all async IO requests queued at the time
1503 * the aio_fsync call was made. So we mark each currently queued async
1504 * IO with a matching file descriptor as one that must complete before we do
1505 * the fsync. We set the fsyncp field of each matching async IO
1506 * request with the aiocb pointer passed in on the aio_fsync call to
1507 * know which IOs must complete before we process the aio_fsync call.
1508 */
1509 if ( (kindOfIO & AIO_FSYNC) != 0 )
1510 aio_mark_requests( entryp );
1511
1512 /* queue up on our aio asynchronous work queue */
1513 TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1514 aio_anchor.aio_async_workq_count++;
1515
1516 AIO_UNLOCK;
1517
1518 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
1519 (int)procp, (int)aiocbp, 0, 0, 0 );
1520
1521 wakeup_one( &aio_anchor.aio_async_workq );
1522
1523 return( 0 );
1524
1525error_exit:
1526 if ( entryp != NULL ) {
1527 /* this entry has not been queued up so no worries about unlocked */
1528 /* state and aio_map */
1529 aio_free_request( entryp, entryp->aio_map );
1530 }
1531
1532 return( result );
1533
1534} /* aio_queue_async_request */
1535
1536
1537/*
1538 * lio_create_async_entry - allocate an aio_workq_entry and fill it in.
1539 * If all goes well return 0 and pass the aio_workq_entry pointer back to
1540 * our caller. We get a reference to our caller's user land map in order to keep
1541 * it around while we are processing the request.
1542 * lio_listio calls behave differently at completion: they do completion
1543 * notification only when all async IO requests in the group have completed.
1544 * We use group_tag to tag IO requests that behave in this delayed notification manner.
1545 */
1546
1547static int
1548lio_create_async_entry( struct proc *procp, struct aiocb *aiocbp,
1549 struct sigevent *sigp, long group_tag,
1550 aio_workq_entry **entrypp )
1551{
1552 aio_workq_entry *entryp;
1553 int result;
1554
1555 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1556 if ( entryp == NULL ) {
1557 result = EAGAIN;
1558 goto error_exit;
1559 }
1560 bzero( entryp, sizeof(*entryp) );
1561
1562 /* fill in the rest of the aio_workq_entry */
1563 entryp->procp = procp;
1564 entryp->uaiocbp = aiocbp;
1565 entryp->flags |= AIO_LIO;
1566 entryp->group_tag = group_tag;
1567 entryp->aio_map = VM_MAP_NULL;
1568 result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1569 if ( result != 0 ) {
1570 result = EAGAIN;
1571 goto error_exit;
1572 }
1573
1574 /* look for lio_listio LIO_NOP requests and ignore them. */
1575 /* Not really an error, but we need to free our aio_workq_entry. */
1576 if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
1577 result = 0;
1578 goto error_exit;
1579 }
1580
1581 /* use sigevent passed in to lio_listio for each of our calls, but only */
1582 /* do completion notification after the last request completes. */
1583 if ( sigp != NULL ) {
1584 result = copyin( sigp, &entryp->aiocb.aio_sigevent, sizeof(entryp->aiocb.aio_sigevent) );
1585 if ( result != 0 ) {
1586 result = EAGAIN;
1587 goto error_exit;
1588 }
1589 }
1590
1591 /* do some more validation on the aiocb and embedded file descriptor */
1592 result = aio_validate( entryp );
1593 if ( result != 0 )
1594 goto error_exit;
1595
1596 /* get a reference to the user land map in order to keep it around */
1597 entryp->aio_map = get_task_map( procp->task );
1598 vm_map_reference( entryp->aio_map );
1599
1600 *entrypp = entryp;
1601 return( 0 );
1602
1603error_exit:
1604 if ( entryp != NULL )
1605 zfree( aio_workq_zonep, (vm_offset_t) entryp );
1606
1607 return( result );
1608
1609} /* lio_create_async_entry */
1610
1611
1612/*
1613 * aio_mark_requests - aio_fsync calls synchronize file data for all queued async IO
1614 * requests at the moment the aio_fsync call is queued. We use aio_workq_entry.fsyncp
1615 * to mark each async IO that must complete before the fsync is done. We use the uaiocbp
1616 * field from the aio_fsync call as the aio_workq_entry.fsyncp in marked requests.
1617 * NOTE - AIO_LOCK must be held by caller
1618 */
1619
1620static void
1621aio_mark_requests( aio_workq_entry *entryp )
1622{
1623 aio_workq_entry *my_entryp;
1624
1625 TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1626 if ( entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1627 my_entryp->fsyncp = entryp->uaiocbp;
1628 }
1629 }
1630
1631 TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
1632 if ( entryp->procp == my_entryp->procp &&
1633 entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1634 my_entryp->fsyncp = entryp->uaiocbp;
1635 }
1636 }
1637
1638} /* aio_mark_requests */
1639
1640
1641/*
1642 * lio_create_sync_entry - allocate an aio_workq_entry and fill it in.
1643 * If all goes well return 0 and pass the aio_workq_entry pointer back to
1644 * our caller.
1645 * lio_listio calls behave differently at completion: they do completion
1646 * notification only when all async IO requests in the group have completed.
1647 * We use group_tag to tag IO requests that behave in this delayed notification manner.
1648 */
1649
1650static int
1651lio_create_sync_entry( struct proc *procp, struct aiocb *aiocbp,
1652 long group_tag, aio_workq_entry **entrypp )
1653{
1654 aio_workq_entry *entryp;
1655 int result;
1656
1657 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1658 if ( entryp == NULL ) {
1659 result = EAGAIN;
1660 goto error_exit;
1661 }
1662 bzero( entryp, sizeof(*entryp) );
1663
1664 /* fill in the rest of the aio_workq_entry */
1665 entryp->procp = procp;
1666 entryp->uaiocbp = aiocbp;
1667 entryp->flags |= AIO_LIO;
1668 entryp->group_tag = group_tag;
1669 entryp->aio_map = VM_MAP_NULL;
1670 result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1671 if ( result != 0 ) {
1672 result = EAGAIN;
1673 goto error_exit;
1674 }
1675
1676 /* look for lio_listio LIO_NOP requests and ignore them. */
1677 /* Not really an error, but we need to free our aio_workq_entry. */
1678 if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
1679 result = 0;
1680 goto error_exit;
1681 }
1682
1683 result = aio_validate( entryp );
1684 if ( result != 0 ) {
1685 goto error_exit;
1686 }
1687
1688 *entrypp = entryp;
1689 return( 0 );
1690
1691error_exit:
1692 if ( entryp != NULL )
1693 zfree( aio_workq_zonep, (vm_offset_t) entryp );
1694
1695 return( result );
1696
1697} /* lio_create_sync_entry */
1698
1699
1700/*
1701 * aio_free_request - remove our reference on the user land map and
1702 * free the work queue entry resources.
1703 * We are not holding the lock here thus aio_map is passed in and
1704 * zeroed while we did have the lock.
1705 */
1706
1707static int
1708aio_free_request( aio_workq_entry *entryp, vm_map_t the_map )
1709{
1710 /* remove our reference to the user land map. */
1711 if ( VM_MAP_NULL != the_map ) {
1712 vm_map_deallocate( the_map );
1713 }
1714
1715 zfree( aio_workq_zonep, (vm_offset_t) entryp );
1716
1717 return( 0 );
1718
1719} /* aio_free_request */
1720
1721
1722/* aio_validate - validate the aiocb passed in by one of the aio syscalls.
1723 */
1724
1725static int
1726aio_validate( aio_workq_entry *entryp )
1727{
1728 boolean_t funnel_state;
1729 struct file *fp;
1730 int flag;
1731 int result;
1732
1733 result = 0;
1734
1735 if ( (entryp->flags & AIO_LIO) != 0 ) {
1736 if ( entryp->aiocb.aio_lio_opcode == LIO_READ )
1737 entryp->flags |= AIO_READ;
1738 else if ( entryp->aiocb.aio_lio_opcode == LIO_WRITE )
1739 entryp->flags |= AIO_WRITE;
1740 else if ( entryp->aiocb.aio_lio_opcode == LIO_NOP )
1741 return( 0 );
1742 else
1743 return( EINVAL );
1744 }
1745
1746 flag = FREAD;
1747 if ( (entryp->flags & (AIO_WRITE | AIO_FSYNC)) != 0 ) {
1748 flag = FWRITE;
1749 }
1750
1751 if ( (entryp->flags & (AIO_READ | AIO_WRITE)) != 0 ) {
1752 if ( entryp->aiocb.aio_offset < 0 ||
1753 entryp->aiocb.aio_nbytes < 0 ||
1754 entryp->aiocb.aio_nbytes > INT_MAX ||
1755 entryp->aiocb.aio_buf == NULL )
1756 return( EINVAL );
1757 }
1758
1759 /* validate aiocb.aio_sigevent. at this point we only support sigev_notify
1760 * equal to SIGEV_SIGNAL or SIGEV_NONE. this means sigev_value,
1761 * sigev_notify_function, and sigev_notify_attributes are ignored.
1762 */
1763 if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ) {
1764 int signum;
1765 /* make sure we have a valid signal number */
1766 signum = entryp->aiocb.aio_sigevent.sigev_signo;
1767 if ( signum <= 0 || signum >= NSIG ||
1768 signum == SIGKILL || signum == SIGSTOP )
1769 return (EINVAL);
1770 }
1771 else if ( entryp->aiocb.aio_sigevent.sigev_notify != SIGEV_NONE )
1772 return (EINVAL);
1773
1774 /* validate the file descriptor and that the file was opened
1775 * for the appropriate read / write access. This section requires
1776 * kernel funnel lock.
1777 */
1778 funnel_state = thread_funnel_set( kernel_flock, TRUE );
1779
1780 result = fdgetf( entryp->procp, entryp->aiocb.aio_fildes, &fp );
1781 if ( result == 0 ) {
1782 if ( (fp->f_flag & flag) == 0 ) {
1783 /* we don't have read or write access */
1784 result = EBADF;
1785 }
1786 else if ( fp->f_type != DTYPE_VNODE ) {
1787 /* this is not a file */
1788 result = ESPIPE;
1789 }
1790 }
1791 else {
1792 result = EBADF;
1793 }
1794
1795 (void) thread_funnel_set( kernel_flock, funnel_state );
1796
1797 return( result );
1798
1799} /* aio_validate */
1800
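/*
 * Illustrative user land aiocb setup that passes the validation above: a
 * non-negative offset, a sane byte count, and sigev_notify of either
 * SIGEV_NONE or SIGEV_SIGNAL with a catchable signal. A sketch only,
 * assuming a POSIX <aio.h> environment.
 */
#if 0
#include <aio.h>
#include <signal.h>
#include <string.h>

static void
fill_valid_aiocb( struct aiocb *cb, int fd, void *buf, size_t len )
{
	memset( cb, 0, sizeof(*cb) );
	cb->aio_fildes = fd;		/* must be open with matching read/write access */
	cb->aio_buf    = buf;		/* NULL would be rejected with EINVAL */
	cb->aio_nbytes = len;		/* must not exceed INT_MAX */
	cb->aio_offset = 0;		/* negative offsets are rejected */
	cb->aio_sigevent.sigev_notify = SIGEV_NONE;	/* or SIGEV_SIGNAL, not SIGKILL/SIGSTOP */
}
#endif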
1801
1802/*
1803 * aio_get_process_count - runs through our queues that hold outstanding
1804 * async IO requests and totals up the number of requests for the given
1805 * process.
1806 * NOTE - caller must hold aio lock!
1807 */
1808
1809static int
1810aio_get_process_count( struct proc *procp )
1811{
1812 aio_workq_entry *entryp;
1813 int error;
1814 int count;
1815
1816 /* begin with count of completed async IO requests for this process */
1817 count = procp->aio_done_count;
1818
1819 /* add in count of active async IO requests for this process */
1820 count += procp->aio_active_count;
1821
1822 /* look for matches on our queue of asynchronous todo work */
1823 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
1824 if ( procp == entryp->procp ) {
1825 count++;
1826 }
1827 }
1828
1829 /* look for matches on our queue of synchronous todo work */
1830 TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
1831 if ( procp == entryp->procp ) {
1832 count++;
1833 }
1834 }
1835
1836 return( count );
1837
1838} /* aio_get_process_count */
1839
1840
1841/*
1842 * aio_get_all_queues_count - get total number of entries on all aio work queues.
1843 * NOTE - caller must hold aio lock!
1844 */
1845
1846static int
1847aio_get_all_queues_count( void )
1848{
1849 int count;
1850
1851 count = aio_anchor.aio_async_workq_count;
1852 count += aio_anchor.lio_sync_workq_count;
1853 count += aio_anchor.aio_active_count;
1854 count += aio_anchor.aio_done_count;
1855
1856 return( count );
1857
1858} /* aio_get_all_queues_count */
1859
1860
1861/*
1862 * do_aio_completion. Handle async IO completion.
1863 */
1864
1865static void
1866do_aio_completion( aio_workq_entry *entryp )
1867{
1868 /* signal user land process if appropriate */
1869 if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
1870 (entryp->flags & AIO_DISABLE) == 0 ) {
1871
1872 /*
1873 * if group_tag is non-zero then make sure this is the last IO request
1874 * in the group before we signal.
1875 */
1876 if ( entryp->group_tag == 0 ||
1877 (entryp->group_tag != 0 && aio_last_group_io( entryp )) ) {
1878 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) | DBG_FUNC_NONE,
1879 (int)entryp->procp, (int)entryp->uaiocbp,
1880 entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 );
1881
1882 psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );
1883 return;
1884 }
1885 }
1886
1887 /*
1888 * need to handle the case where a process is trying to exit, exec, or close
1889 * and is currently waiting for active aio requests to complete. If
1890 * AIO_WAITING is set then we need to look to see if there are any
1891 * other requests in the active queue for this process. If there are
1892 * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel. If
1893 * there are some still active then do nothing - we only want to wakeup
1894 * when all active aio requests for the process are complete.
1895 */
1896 if ( (entryp->flags & AIO_WAITING) != 0 ) {
1897 int active_requests;
1898
1899 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
1900 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1901
1902 AIO_LOCK;
1903 active_requests = aio_active_requests_for_process( entryp->procp );
1904 AIO_UNLOCK;
1905 if ( active_requests < 1 ) {
1906 /* no active aio requests for this process, continue exiting */
1907
1908 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
1909 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1910
1911 wakeup_one( &entryp->procp->AIO_CLEANUP_SLEEP_CHAN );
1912 }
1913 return;
1914 }
1915
1916 /*
1917 * aio_suspend case when a signal was not requested. In that scenario we
1918 * are sleeping on the AIO_SUSPEND_SLEEP_CHAN channel.
1919 * NOTE - the assumption here is that this wakeup call is inexpensive.
1920 * we really only need to do this when an aio_suspend call is pending.
1921 * If we find the wakeup call should be avoided we could mark the
1922 * async IO requests given in the list provided by aio_suspend and only
1923 * call wakeup for them. If we do mark them we should unmark them after
1924 * the aio_suspend wakes up.
1925 */
1926 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE,
1927 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1928
1929 wakeup_one( &entryp->procp->AIO_SUSPEND_SLEEP_CHAN );
1930
1931 return;
1932
1933} /* do_aio_completion */
1934
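/*
 * The wakeup on AIO_SUSPEND_SLEEP_CHAN above pairs with a user-space thread
 * blocked in aio_suspend().  A minimal sketch of that consumer side is shown
 * below; the helper name is hypothetical and error handling is trimmed.
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *	#include <unistd.h>
 *
 *	ssize_t
 *	collect_result( struct aiocb *cbp )
 *	{
 *		const struct aiocb	*list[1];
 *		int			err;
 *
 *		list[0] = cbp;
 *		if ( aio_suspend( list, 1, NULL ) != 0 )	// blocks until the request completes
 *			return( -1 );
 *
 *		err = aio_error( cbp );				// 0 on success, otherwise an errno value
 *		if ( err != 0 ) {
 *			errno = err;
 *			return( -1 );
 *		}
 *		return( aio_return( cbp ) );			// byte count; also retires the request
 *	}
 */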
1935
1936/*
1937 * aio_last_group_io - checks to see if this is the last unfinished IO request
1938 * for the given group_tag. Returns TRUE if there are no other active IO
1939 * requests for this group or FALSE if there are other active IO requests.
1940 * NOTE - AIO_LOCK must be held by caller
1941 */
1942
1943static boolean_t
1944aio_last_group_io( aio_workq_entry *entryp )
1945{
1946 aio_workq_entry *my_entryp;
1947
1948 /* look for matches on our queue of active async IO requests */
1949 TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1950 if ( my_entryp->group_tag == entryp->group_tag )
1951 return( FALSE );
1952 }
1953
1954 /* look for matches on our queue of asynchronous todo work */
1955 TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
1956 if ( my_entryp->group_tag == entryp->group_tag )
1957 return( FALSE );
1958 }
1959
1960 /* look for matches on our queue of synchronous todo work */
1961 TAILQ_FOREACH( my_entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
1962 if ( my_entryp->group_tag == entryp->group_tag )
1963 return( FALSE );
1964 }
1965
1966 return( TRUE );
1967
1968} /* aio_last_group_io */
1969
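/*
 * The group_tag checked above corresponds to a batch submitted with
 * lio_listio().  A hedged user-space sketch: two reads queued as one group
 * with a single completion signal, which is delivered only when the last
 * request of the group finishes.  The helper name and the choice of SIGUSR1
 * are hypothetical.
 *
 *	#include <aio.h>
 *	#include <signal.h>
 *	#include <string.h>
 *
 *	int
 *	submit_group( struct aiocb *a, struct aiocb *b )
 *	{
 *		struct aiocb		*list[2];
 *		struct sigevent		ev;
 *
 *		a->aio_lio_opcode = LIO_READ;			// per-entry opcodes
 *		b->aio_lio_opcode = LIO_READ;
 *		list[0] = a;
 *		list[1] = b;
 *
 *		memset( &ev, 0, sizeof( ev ) );
 *		ev.sigev_notify = SIGEV_SIGNAL;			// one signal for the whole group
 *		ev.sigev_signo = SIGUSR1;			// arbitrary catchable signal
 *
 *		return( lio_listio( LIO_NOWAIT, list, 2, &ev ) );
 *	}
 */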
1970
1971/*
1972 * do_aio_read
1973 */
1974static int
1975do_aio_read( aio_workq_entry *entryp )
1976{
1977 struct file *fp;
1978 int error;
1979
1980 fp = holdfp( entryp->procp->p_fd, entryp->aiocb.aio_fildes, FREAD );
1981 if ( fp != NULL ) {
1982 error = dofileread( entryp->procp, fp, entryp->aiocb.aio_fildes,
1983 (void *)entryp->aiocb.aio_buf,
1984 entryp->aiocb.aio_nbytes,
1985 entryp->aiocb.aio_offset, FOF_OFFSET,
1986 &entryp->returnval );
1987 frele( fp );
1988 }
1989 else
1990 error = EBADF;
1991
1992 return( error );
1993
1994} /* do_aio_read */
1995
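/*
 * Because the request is issued with FOF_OFFSET, the read happens at the
 * position given in aiocb.aio_offset rather than at the descriptor's current
 * file offset, much like pread(2).  A hedged user-space sketch (hypothetical
 * helper name):
 *
 *	#include <aio.h>
 *	#include <signal.h>
 *	#include <string.h>
 *	#include <sys/types.h>
 *
 *	int
 *	read_at( int fd, void *dst, size_t len, off_t off, struct aiocb *cbp )
 *	{
 *		memset( cbp, 0, sizeof( struct aiocb ) );
 *		cbp->aio_fildes = fd;
 *		cbp->aio_buf = dst;
 *		cbp->aio_nbytes = len;				// checked against INT_MAX at validation
 *		cbp->aio_offset = off;				// honored on every request
 *		cbp->aio_sigevent.sigev_notify = SIGEV_NONE;
 *
 *		return( aio_read( cbp ) );			// 0 if queued; bytes come back via aio_return
 *	}
 */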
1996
1997/*
1998 * do_aio_write
1999 */
2000static int
2001do_aio_write( aio_workq_entry *entryp )
2002{
2003 struct file *fp;
2004 int error;
2005
2006 fp = holdfp( entryp->procp->p_fd, entryp->aiocb.aio_fildes, FWRITE );
2007 if ( fp != NULL ) {
2008 error = dofilewrite( entryp->procp, fp, entryp->aiocb.aio_fildes,
2009 (const void *)entryp->aiocb.aio_buf,
2010 entryp->aiocb.aio_nbytes,
2011 entryp->aiocb.aio_offset, FOF_OFFSET,
2012 &entryp->returnval );
2013 frele( fp );
2014 }
2015 else
2016 error = EBADF;
2017
2018 return( error );
2019
2020} /* do_aio_write */
2021
2022
2023/*
2024 * aio_active_requests_for_process - return number of active async IO
2025 * requests for the given process.
2026 * NOTE - caller must hold aio lock!
2027 */
2028
2029static int
2030aio_active_requests_for_process( struct proc *procp )
2031{
2032
2033 return( procp->aio_active_count );
2034
2035} /* aio_active_requests_for_process */
2036
2037
2038/*
2039 * do_aio_fsync
2040 */
2041static int
2042do_aio_fsync( aio_workq_entry *entryp )
2043{
2044 register struct vnode *vp;
2045 struct file *fp;
2046 int error;
2047
2048 /*
2049 * NOTE - we will not support AIO_DSYNC until fdatasync() is supported.
2050 * AIO_DSYNC is caught before we queue up a request and flagged as an error.
2051 * The following was shamelessly extracted from the fsync() implementation.
2052 */
2053 error = getvnode( entryp->procp, entryp->aiocb.aio_fildes, &fp );
2054 if ( error == 0 ) {
2055 vp = (struct vnode *)fp->f_data;
2056 vn_lock( vp, LK_EXCLUSIVE | LK_RETRY, entryp->procp );
2057 error = VOP_FSYNC( vp, fp->f_cred, MNT_WAIT, entryp->procp );
2058 VOP_UNLOCK( vp, 0, entryp->procp );
2059 }
2060 if ( error != 0 )
2061 entryp->returnval = -1;
2062
2063 return( error );
2064
2065} /* do_aio_fsync */
2066
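/*
 * As the note above says, only a full sync is supported; an O_DSYNC request
 * is rejected before it is ever queued.  A hedged user-space sketch of the
 * supported case (hypothetical helper name):
 *
 *	#include <aio.h>
 *	#include <fcntl.h>
 *	#include <signal.h>
 *	#include <string.h>
 *
 *	int
 *	flush_async( int fd, struct aiocb *cbp )
 *	{
 *		memset( cbp, 0, sizeof( struct aiocb ) );
 *		cbp->aio_fildes = fd;				// must be open for writing
 *		cbp->aio_sigevent.sigev_notify = SIGEV_NONE;
 *
 *		return( aio_fsync( O_SYNC, cbp ) );		// result observed via aio_error/aio_return
 *	}
 */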
2067
2068/*
2069 * is_already_queued - runs through our queues to see if the given
2070 * aiocbp / process is there. Returns TRUE if there is a match
2071 * on any of our aio queues.
2072 * NOTE - callers must hold aio lock!
2073 */
2074
2075static boolean_t
2076is_already_queued( struct proc *procp,
2077 struct aiocb *aiocbp )
2078{
2079 aio_workq_entry *entryp;
2080 boolean_t result;
2081
2082 result = FALSE;
2083
2084 /* look for matches on our queue of async IO requests that have completed */
2085 TAILQ_FOREACH( entryp, &procp->aio_doneq, aio_workq_link ) {
2086 if ( aiocbp == entryp->uaiocbp ) {
2087 result = TRUE;
2088 goto ExitThisRoutine;
2089 }
2090 }
2091
2092 /* look for matches on our queue of active async IO requests */
2093 TAILQ_FOREACH( entryp, &procp->aio_activeq, aio_workq_link ) {
2094 if ( aiocbp == entryp->uaiocbp ) {
2095 result = TRUE;
2096 goto ExitThisRoutine;
2097 }
2098 }
2099
2100 /* look for matches on our queue of asynchronous todo work */
2101 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
2102 if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
2103 result = TRUE;
2104 goto ExitThisRoutine;
2105 }
2106 }
2107
2108 /* look for matches on our queue of synchronous todo work */
2109 TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
2110 if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
2111 result = TRUE;
2112 goto ExitThisRoutine;
2113 }
2114 }
2115
2116ExitThisRoutine:
2117 return( result );
2118
2119} /* is_already_queued */
2120
2121
2122/*
2123 * aio initialization
2124 */
2125__private_extern__ void
2126aio_init( void )
2127{
2128 int i;
2129
2130 simple_lock_init( &aio_lock );
2131
2132 AIO_LOCK;
2133 TAILQ_INIT( &aio_anchor.aio_async_workq );
2134 TAILQ_INIT( &aio_anchor.lio_sync_workq );
2135 aio_anchor.aio_async_workq_count = 0;
2136 aio_anchor.lio_sync_workq_count = 0;
2137 aio_anchor.aio_active_count = 0;
2138 aio_anchor.aio_done_count = 0;
2139 AIO_UNLOCK;
2140
2141 i = sizeof( aio_workq_entry );
2142 aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );
2143
2144 _aio_create_worker_threads( aio_worker_threads );
2145
2146 return;
2147
2148} /* aio_init */
2149
2150
2151/*
2152 * aio worker threads created here.
2153 */
2154__private_extern__ void
2155_aio_create_worker_threads( int num )
2156{
2157 int i;
2158
2159 /* create some worker threads to handle the async IO requests */
2160 for ( i = 0; i < num; i++ ) {
2161 thread_t myThread;
2162
2163 myThread = kernel_thread( kernel_task, aio_work_thread );
2164 if ( THREAD_NULL == myThread ) {
2165			printf( "%s - failed to create a work thread\n", __FUNCTION__ );
2166 }
2167 }
2168
2169 return;
2170
2171} /* _aio_create_worker_threads */
2172
2173/*
2174 * Return the current activation utask
2175 */
2176task_t
2177get_aiotask(void)
2178{
2179 return ((struct uthread *)get_bsdthread_info(current_act()))->uu_aio_task;
2180}