[apple/xnu.git] / bsd / kern / kern_aio.c
1 /*
2 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
7 *
8 * This file contains Original Code and/or Modifications of Original Code
9 * as defined in and that are subject to the Apple Public Source License
10 * Version 2.0 (the 'License'). You may not use this file except in
11 * compliance with the License. Please obtain a copy of the License at
12 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * file.
14 *
15 * The Original Code and all software distributed under the License are
16 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
17 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
18 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
20 * Please see the License for the specific language governing rights and
21 * limitations under the License.
22 *
23 * @APPLE_LICENSE_HEADER_END@
24 */
25
26
27 /*
28 * todo:
29 * 1) ramesh is looking into how to replace taking a reference on
30 * the user's map (vm_map_reference()) since it is believed that it
31 * would not hold the process for us.
32 * 2) david is looking into a way for us to set the priority of the
33 * worker threads to match that of the user's thread when the
34 * async IO was queued.
35 */
36
37
38 /*
39 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
40 */
41
42 #include <sys/systm.h>
43 #include <sys/buf.h>
44 #include <sys/fcntl.h>
45 #include <sys/file.h>
46 #include <sys/filedesc.h>
47 #include <sys/kernel.h>
48 #include <sys/vnode.h>
49 #include <sys/malloc.h>
50 #include <sys/mount.h>
51 #include <sys/param.h>
52 #include <sys/proc.h>
53 #include <sys/sysctl.h>
54 #include <sys/unistd.h>
55 #include <sys/user.h>
56
57 #include <sys/aio_kern.h>
58
59 #include <machine/limits.h>
60 #include <kern/zalloc.h>
61 #include <kern/task.h>
62
63 #include <sys/kdebug.h>
64 #define AIO_work_queued 1
65 #define AIO_worker_wake 2
66 #define AIO_completion_sig 3
67 #define AIO_completion_cleanup_wait 4
68 #define AIO_completion_cleanup_wake 5
69 #define AIO_completion_suspend_wake 6
70 #define AIO_fsync_delay 7
71 #define AIO_cancel 10
72 #define AIO_cancel_async_workq 11
73 #define AIO_cancel_sync_workq 12
74 #define AIO_cancel_activeq 13
75 #define AIO_cancel_doneq 14
76 #define AIO_fsync 20
77 #define AIO_read 30
78 #define AIO_write 40
79 #define AIO_listio 50
80 #define AIO_error 60
81 #define AIO_error_val 61
82 #define AIO_error_activeq 62
83 #define AIO_error_workq 63
84 #define AIO_return 70
85 #define AIO_return_val 71
86 #define AIO_return_activeq 72
87 #define AIO_return_workq 73
88 #define AIO_exec 80
89 #define AIO_exit 90
90 #define AIO_exit_sleep 91
91 #define AIO_close 100
92 #define AIO_close_sleep 101
93 #define AIO_suspend 110
94 #define AIO_suspend_sleep 111
95 #define AIO_worker_thread 120
96
97 #if 0
98 #undef KERNEL_DEBUG
99 #define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
100 #endif
101
102 /*
103 * aio requests queue up on the aio_async_workq or lio_sync_workq (for
104 * lio_listio LIO_WAIT). Requests then move to the per process aio_activeq
105 * (proc.aio_activeq) when one of our worker threads starts the IO.
106 * And finally, requests move to the per process aio_doneq (proc.aio_doneq)
107 * when the IO request completes. The request remains on aio_doneq until
108 * the user process calls aio_return or the process exits; either way, that is our
109 * trigger to release aio resources.
110 */
111 struct aio_anchor_cb
112 {
113 int aio_async_workq_count; /* entries on aio_async_workq */
114 int lio_sync_workq_count; /* entries on lio_sync_workq */
115 int aio_active_count; /* entries on all active queues (proc.aio_activeq) */
116 int aio_done_count; /* entries on all done queues (proc.aio_doneq) */
117 TAILQ_HEAD( , aio_workq_entry ) aio_async_workq;
118 TAILQ_HEAD( , aio_workq_entry ) lio_sync_workq;
119 };
120 typedef struct aio_anchor_cb aio_anchor_cb;
121
122
123 /*
124 * Notes on aio sleep / wake channels.
125 * We currently pick a couple of fields within the proc structure to use as
126 * sleep channels that currently do not collide with any other kernel routines.
127 * At this time, for binary compatibility reasons, we cannot create new proc fields.
128 */
129 #define AIO_SUSPEND_SLEEP_CHAN p_estcpu
130 #define AIO_CLEANUP_SLEEP_CHAN p_pctcpu
131
132
133 /*
134 * async IO locking macros used to protect critical sections.
135 */
136 #define AIO_LOCK usimple_lock( &aio_lock )
137 #define AIO_UNLOCK usimple_unlock( &aio_lock )
138
139
140 /*
141 * LOCAL PROTOTYPES
142 */
143 static int aio_active_requests_for_process( struct proc *procp );
144 static boolean_t aio_delay_fsync_request( aio_workq_entry *entryp );
145 static int aio_free_request( aio_workq_entry *entryp, vm_map_t the_map );
146 static int aio_get_all_queues_count( void );
147 static int aio_get_process_count( struct proc *procp );
148 static aio_workq_entry * aio_get_some_work( void );
149 static boolean_t aio_last_group_io( aio_workq_entry *entryp );
150 static void aio_mark_requests( aio_workq_entry *entryp );
151 static int aio_queue_async_request( struct proc *procp,
152 struct aiocb *aiocbp,
153 int kindOfIO );
154 static int aio_validate( aio_workq_entry *entryp );
155 static void aio_work_thread( void );
156 static int do_aio_cancel( struct proc *p,
157 int fd,
158 struct aiocb *aiocbp,
159 boolean_t wait_for_completion,
160 boolean_t disable_notification );
161 static void do_aio_completion( aio_workq_entry *entryp );
162 static int do_aio_fsync( aio_workq_entry *entryp );
163 static int do_aio_read( aio_workq_entry *entryp );
164 static int do_aio_write( aio_workq_entry *entryp );
165 static boolean_t is_already_queued( struct proc *procp,
166 struct aiocb *aiocbp );
167 static int lio_create_async_entry( struct proc *procp,
168 struct aiocb *aiocbp,
169 struct sigevent *sigp,
170 long group_tag,
171 aio_workq_entry **entrypp );
172 static int lio_create_sync_entry( struct proc *procp,
173 struct aiocb *aiocbp,
174 long group_tag,
175 aio_workq_entry **entrypp );
176
177 /*
178 * EXTERNAL PROTOTYPES
179 */
180
181 /* in ...bsd/kern/sys_generic.c */
182 extern struct file* holdfp( struct filedesc* fdp, int fd, int flag );
183 extern int dofileread( struct proc *p, struct file *fp, int fd,
184 void *buf, size_t nbyte, off_t offset,
185 int flags, int *retval );
186 extern int dofilewrite( struct proc *p, struct file *fp, int fd,
187 const void *buf, size_t nbyte, off_t offset,
188 int flags, int *retval );
189 extern vm_map_t vm_map_switch( vm_map_t map );
190
191
192 /*
193 * aio external global variables.
194 */
195 extern int aio_max_requests; /* AIO_MAX - configurable */
196 extern int aio_max_requests_per_process; /* AIO_PROCESS_MAX - configurable */
197 extern int aio_worker_threads; /* AIO_THREAD_COUNT - configurable */
198
199
200 /*
201 * aio static variables.
202 */
203 static aio_anchor_cb aio_anchor;
204 static simple_lock_data_t aio_lock;
205 static struct zone *aio_workq_zonep;
206
207
208 /*
209 * syscall input parameters
210 */
211 #ifndef _SYS_SYSPROTO_H_
212
213 struct aio_cancel_args {
214 int fd;
215 struct aiocb *aiocbp;
216 };
217
218 struct aio_error_args {
219 struct aiocb *aiocbp;
220 };
221
222 struct aio_fsync_args {
223 int op;
224 struct aiocb *aiocbp;
225 };
226
227 struct aio_read_args {
228 struct aiocb *aiocbp;
229 };
230
231 struct aio_return_args {
232 struct aiocb *aiocbp;
233 };
234
235 struct aio_suspend_args {
236 struct aiocb *const *aiocblist;
237 int nent;
238 const struct timespec *timeoutp;
239 };
240
241 struct aio_write_args {
242 struct aiocb *aiocbp;
243 };
244
245 struct lio_listio_args {
246 int mode;
247 struct aiocb *const *aiocblist;
248 int nent;
249 struct sigevent *sigp;
250 };
251
252 #endif /* _SYS_SYSPROTO_H_ */
253
254
255 /*
256 * aio_cancel - attempt to cancel one or more async IO requests currently
257 * outstanding against file descriptor uap->fd. If uap->aiocbp is not
258 * NULL then only one specific IO is cancelled (if possible). If uap->aiocbp
259 * is NULL then all outstanding async IO requests for the given file
260 * descriptor are cancelled (if possible).
261 */
262
263 int
264 aio_cancel( struct proc *p, struct aio_cancel_args *uap, int *retval )
265 {
266 struct aiocb my_aiocb;
267 int result;
268 boolean_t funnel_state;
269
270 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START,
271 (int)p, (int)uap->aiocbp, 0, 0, 0 );
272
273 /* quick check to see if there are any async IO requests queued up */
274 AIO_LOCK;
275 result = aio_get_all_queues_count( );
276 AIO_UNLOCK;
277 if ( result < 1 ) {
278 result = EBADF;
279 goto ExitRoutine;
280 }
281
282 *retval = -1;
283 if ( uap->aiocbp != NULL ) {
284 result = copyin( uap->aiocbp, &my_aiocb, sizeof(my_aiocb) );
285 if ( result != 0 ) {
286 result = EAGAIN;
287 goto ExitRoutine;
288 }
289
290 /* NOTE - POSIX standard says a mismatch between the file */
291 /* descriptor passed in and the file descriptor embedded in */
292 /* the aiocb causes unspecified results. We return EBADF in */
293 /* that situation. */
294 if ( uap->fd != my_aiocb.aio_fildes ) {
295 result = EBADF;
296 goto ExitRoutine;
297 }
298 }
299
300 /* current BSD code assumes funnel lock is held */
301 funnel_state = thread_funnel_set( kernel_flock, TRUE );
302 result = do_aio_cancel( p, uap->fd, uap->aiocbp, FALSE, FALSE );
303 (void) thread_funnel_set( kernel_flock, funnel_state );
304
305 if ( result != -1 ) {
306 *retval = result;
307 result = 0;
308 goto ExitRoutine;
309 }
310
311 result = EBADF;
312
313 ExitRoutine:
314 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END,
315 (int)p, (int)uap->aiocbp, result, 0, 0 );
316
317 return( result );
318
319 } /* aio_cancel */
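
/*
 * Illustrative userland sketch (not part of this kernel file; the #if 0 keeps
 * it out of the build): how a process might use aio_cancel() to cancel every
 * async IO still outstanding on a descriptor before closing it.  The helper
 * name drain_and_close() is hypothetical and error handling is abbreviated.
 */
#if 0
#include <aio.h>
#include <unistd.h>

static int
drain_and_close( int fd )
{
	int result;

	/* a NULL aiocbp asks the kernel to cancel everything queued on fd */
	result = aio_cancel( fd, NULL );
	if ( result == AIO_CANCELED || result == AIO_ALLDONE ) {
		/* nothing left in flight for this fd (completed entries are */
		/* still reaped with aio_return), so it is safe to close it  */
		return close( fd );
	}

	/* AIO_NOTCANCELED (active IO still running) or -1 (error, see errno); */
	/* a real caller would wait - e.g. poll aio_error() - and retry        */
	return -1;
}
#endif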
320
321
322 /*
323 * _aio_close - internal function used to clean up async IO requests for
324 * a file descriptor that is closing.
325 * NOTE - kernel funnel lock is held when we get called.
326 * THIS MAY BLOCK.
327 */
328
329 __private_extern__ void
330 _aio_close( struct proc *p, int fd )
331 {
332 int error, count;
333
334 /* quick check to see if there are any async IO requests queued up */
335 AIO_LOCK;
336 count = aio_get_all_queues_count( );
337 AIO_UNLOCK;
338 if ( count < 1 )
339 return;
340
341 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_START,
342 (int)p, fd, 0, 0, 0 );
343
344 /* cancel all async IO requests on our todo queues for this file descriptor */
345 error = do_aio_cancel( p, fd, NULL, TRUE, FALSE );
346 if ( error == AIO_NOTCANCELED ) {
347 /*
348 * AIO_NOTCANCELED is returned when we find an aio request for this process
349 * and file descriptor on the active async IO queue. Active requests cannot
350 * be cancelled so we must wait for them to complete. We will get a special
351 * wake up call on our channel used to sleep for ALL active requests to
352 * complete. This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
353 * when we must wait for all active aio requests.
354 */
355
356 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) | DBG_FUNC_NONE,
357 (int)p, fd, 0, 0, 0 );
358
359 tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_close", 0 );
360 }
361
362 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_END,
363 (int)p, fd, 0, 0, 0 );
364
365 return;
366
367 } /* _aio_close */
368
369
370 /*
371 * aio_error - return the error status associated with the async IO
372 * request referred to by uap->aiocbp. The error status is the errno
373 * value that would be set by the corresponding IO request (read, write,
374 * fdatasync, or sync).
375 */
376
377 int
378 aio_error( struct proc *p, struct aio_error_args *uap, int *retval )
379 {
380 aio_workq_entry *entryp;
381 int error;
382
383 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_START,
384 (int)p, (int)uap->aiocbp, 0, 0, 0 );
385
386 AIO_LOCK;
387
388 /* quick check to see if there are any async IO requests queued up */
389 if ( aio_get_all_queues_count( ) < 1 ) {
390 error = EINVAL;
391 goto ExitRoutine;
392 }
393
394 /* look for a match on our queue of async IO requests that have completed */
395 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
396 if ( entryp->uaiocbp == uap->aiocbp ) {
397 *retval = entryp->errorval;
398 error = 0;
399 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) | DBG_FUNC_NONE,
400 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
401 goto ExitRoutine;
402 }
403 }
404
405 /* look for a match on our queue of active async IO requests */
406 TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
407 if ( entryp->uaiocbp == uap->aiocbp ) {
408 *retval = EINPROGRESS;
409 error = 0;
410 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) | DBG_FUNC_NONE,
411 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
412 goto ExitRoutine;
413 }
414 }
415
416 /* look for a match on our queue of todo work */
417 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
418 if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
419 *retval = EINPROGRESS;
420 error = 0;
421 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_workq)) | DBG_FUNC_NONE,
422 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
423 goto ExitRoutine;
424 }
425 }
426 error = EINVAL;
427
428 ExitRoutine:
429 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_END,
430 (int)p, (int)uap->aiocbp, error, 0, 0 );
431 AIO_UNLOCK;
432
433 return( error );
434
435 } /* aio_error */
436
437
438 /*
439 * aio_fsync - asynchronously force all IO operations associated
440 * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
441 * queued at the time of the call to the synchronized IO completion state.
442 * NOTE - we do not support op O_DSYNC at this point since we do not support the
443 * fdatasync() call.
444 */
445
446 int
447 aio_fsync( struct proc *p, struct aio_fsync_args *uap, int *retval )
448 {
449 int error;
450 int fsync_kind;
451
452 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_START,
453 (int)p, (int)uap->aiocbp, uap->op, 0, 0 );
454
455 *retval = 0;
456 if ( uap->op == O_SYNC )
457 fsync_kind = AIO_FSYNC;
458 #if 0 // we don't support fdatasync() call yet
459 else if ( uap->op == O_DSYNC )
460 fsync_kind = AIO_DSYNC;
461 #endif
462 else {
463 *retval = -1;
464 error = EINVAL;
465 goto ExitRoutine;
466 }
467
468 error = aio_queue_async_request( p, uap->aiocbp, fsync_kind );
469 if ( error != 0 )
470 *retval = -1;
471
472 ExitRoutine:
473 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_END,
474 (int)p, (int)uap->aiocbp, error, 0, 0 );
475
476 return( error );
477
478 } /* aio_fsync */
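
/*
 * Illustrative userland sketch (not part of this kernel file; the #if 0 keeps
 * it out of the build): queueing an aio_fsync().  As noted above, only O_SYNC
 * is accepted here since fdatasync()/O_DSYNC is not supported; completion is
 * observed with aio_error()/aio_return() exactly like a read or write.  The
 * helper name queue_fsync() is hypothetical.
 */
#if 0
#include <aio.h>
#include <fcntl.h>
#include <string.h>

static int
queue_fsync( int fd, struct aiocb *cb )
{
	memset( cb, 0, sizeof(*cb) );
	cb->aio_fildes = fd;			/* sync everything queued so far on fd */

	return aio_fsync( O_SYNC, cb );		/* O_DSYNC would fail with EINVAL here */
}
#endif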
479
480
481 /* aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
482 * file descriptor (uap->aiocbp->aio_fildes) into the buffer
483 * (uap->aiocbp->aio_buf).
484 */
485
486 int
487 aio_read( struct proc *p, struct aio_read_args *uap, int *retval )
488 {
489 int error;
490
491 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_START,
492 (int)p, (int)uap->aiocbp, 0, 0, 0 );
493
494 *retval = 0;
495
496 error = aio_queue_async_request( p, uap->aiocbp, AIO_READ );
497 if ( error != 0 )
498 *retval = -1;
499
500 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_END,
501 (int)p, (int)uap->aiocbp, error, 0, 0 );
502
503 return( error );
504
505 } /* aio_read */
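
/*
 * Illustrative userland sketch (not part of this kernel file; the #if 0 keeps
 * it out of the build): queueing one async read.  The file descriptor, buffer,
 * length, and offset all travel in the aiocb that this syscall copies in, and
 * both the buffer and the aiocb must stay valid until the request is reaped
 * with aio_return().  The helper name queue_read() is hypothetical.
 */
#if 0
#include <aio.h>
#include <string.h>
#include <sys/types.h>

static int
queue_read( int fd, char *buf, size_t len, off_t offset, struct aiocb *cb )
{
	memset( cb, 0, sizeof(*cb) );
	cb->aio_fildes = fd;		/* descriptor embedded in the aiocb */
	cb->aio_buf = buf;		/* destination buffer in our address space */
	cb->aio_nbytes = len;		/* how many bytes to read */
	cb->aio_offset = offset;	/* absolute file offset - no implicit seek */

	/* returns 0 if the request was queued, -1 with errno set otherwise */
	return aio_read( cb );
}
#endif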
506
507
508 /*
509 * aio_return - return the return status associated with the async IO
510 * request referred to by uap->aiocbp. The return status is the value
511 * that would be returned by the corresponding IO request (read, write,
512 * fdatasync, or sync). This is where we release kernel resources
513 * held for the async IO call associated with the given aiocb pointer.
514 */
515
516 int
517 aio_return( struct proc *p, struct aio_return_args *uap, register_t *retval )
518 {
519 aio_workq_entry *entryp;
520 int error;
521 boolean_t lock_held;
522
523 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_START,
524 (int)p, (int)uap->aiocbp, 0, 0, 0 );
525
526 AIO_LOCK;
527 lock_held = TRUE;
528 *retval = 0;
529
530 /* quick check to see if there are any async IO requests queued up */
531 if ( aio_get_all_queues_count( ) < 1 ) {
532 error = EINVAL;
533 goto ExitRoutine;
534 }
535
536 /* look for a match on our queue of async IO requests that have completed */
537 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
538 if ( entryp->uaiocbp == uap->aiocbp ) {
539 TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
540 aio_anchor.aio_done_count--;
541 p->aio_done_count--;
542
543 *retval = entryp->returnval;
544
545 /* we cannot free requests that are still completing */
546 if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
547 vm_map_t my_map;
548
549 my_map = entryp->aio_map;
550 entryp->aio_map = VM_MAP_NULL;
551 AIO_UNLOCK;
552 lock_held = FALSE;
553 aio_free_request( entryp, my_map );
554 }
555 else
556 /* tell completion code to free this request */
557 entryp->flags |= AIO_DO_FREE;
558 error = 0;
559 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) | DBG_FUNC_NONE,
560 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
561 goto ExitRoutine;
562 }
563 }
564
565 /* look for a match on our queue of active async IO requests */
566 TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
567 if ( entryp->uaiocbp == uap->aiocbp ) {
568 error = EINPROGRESS;
569 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) | DBG_FUNC_NONE,
570 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
571 goto ExitRoutine;
572 }
573 }
574
575 /* look for a match on our queue of todo work */
576 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
577 if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
578 error = EINPROGRESS;
579 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_workq)) | DBG_FUNC_NONE,
580 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
581 goto ExitRoutine;
582 }
583 }
584 error = EINVAL;
585
586 ExitRoutine:
587 if ( lock_held )
588 AIO_UNLOCK;
589 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_END,
590 (int)p, (int)uap->aiocbp, error, 0, 0 );
591
592 return( error );
593
594 } /* aio_return */
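
/*
 * Illustrative userland sketch (not part of this kernel file; the #if 0 keeps
 * it out of the build): reaping a previously queued request.  aio_error()
 * reports EINPROGRESS until the IO finishes; a single aio_return() then yields
 * the read/write result and, as described above, releases the kernel resources
 * held for that aiocb.  Busy polling is for brevity only; the helper name
 * reap_request() is hypothetical.
 */
#if 0
#include <aio.h>
#include <errno.h>
#include <sys/types.h>

static ssize_t
reap_request( struct aiocb *cb )
{
	int err;
	ssize_t result;

	while ( (err = aio_error( cb )) == EINPROGRESS )
		;			/* poll; real code would sleep or do other work */

	/* aio_return must be called exactly once after completion; it also */
	/* lets the kernel free the aio_workq_entry it held for this aiocb  */
	result = aio_return( cb );
	if ( err != 0 )
		errno = err;		/* the IO failed with this errno value */

	return result;
}
#endif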
595
596
597 /*
598 * _aio_exec - internal function used to clean up async IO requests for
599 * a process that is going away due to exec(). We cancel any async IOs
600 * we can and wait for those already active. We also disable signaling
601 * for cancelled or active aio requests that complete.
602 * NOTE - kernel funnel lock is held when we get called.
603 * This routine MAY block!
604 */
605
606 __private_extern__ void
607 _aio_exec( struct proc *p )
608 {
609
610 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_START,
611 (int)p, 0, 0, 0, 0 );
612
613 _aio_exit( p );
614
615 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_END,
616 (int)p, 0, 0, 0, 0 );
617
618 return;
619
620 } /* _aio_exec */
621
622
623 /*
624 * _aio_exit - internal function used to clean up async IO requests for
625 * a process that is terminating (via exit() or exec() ). We cancel any async IOs
626 * we can and wait for those already active. We also disable signaling
627 * for cancelled or active aio requests that complete. This routine MAY block!
628 * NOTE - kernel funnel lock is held when we get called.
629 */
630
631 __private_extern__ void
632 _aio_exit( struct proc *p )
633 {
634 int error, count;
635 aio_workq_entry *entryp;
636
637 /* quick check to see if there are any async IO requests queued up */
638 AIO_LOCK;
639 count = aio_get_all_queues_count( );
640 AIO_UNLOCK;
641 if ( count < 1 ) {
642 return;
643 }
644
645 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_START,
646 (int)p, 0, 0, 0, 0 );
647
648 /*
649 * cancel async IO requests on the todo work queue and wait for those
650 * already active to complete.
651 */
652 error = do_aio_cancel( p, 0, NULL, TRUE, TRUE );
653 if ( error == AIO_NOTCANCELED ) {
654 /*
655 * AIO_NOTCANCELED is returned when we find an aio request for this process
656 * on the active async IO queue. Active requests cannot be cancelled so we
657 * must wait for them to complete. We will get a special wake up call on
658 * our channel used to sleep for ALL active requests to complete. This sleep
659 * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
660 * active aio requests.
661 */
662
663 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) | DBG_FUNC_NONE,
664 (int)p, 0, 0, 0, 0 );
665
666 tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_exit", 0 );
667 }
668
669 /* release all aio resources used by this process */
670 AIO_LOCK;
671 entryp = TAILQ_FIRST( &p->aio_doneq );
672 while ( entryp != NULL ) {
673 aio_workq_entry *next_entryp;
674
675 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
676 TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
677 aio_anchor.aio_done_count--;
678 p->aio_done_count--;
679
680 /* we cannot free requests that are still completing */
681 if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
682 vm_map_t my_map;
683
684 my_map = entryp->aio_map;
685 entryp->aio_map = VM_MAP_NULL;
686 AIO_UNLOCK;
687 aio_free_request( entryp, my_map );
688
689 /* need to start over since aio_doneq may have been */
690 /* changed while we were away. */
691 AIO_LOCK;
692 entryp = TAILQ_FIRST( &p->aio_doneq );
693 continue;
694 }
695 else
696 /* tell completion code to free this request */
697 entryp->flags |= AIO_DO_FREE;
698 entryp = next_entryp;
699 }
700 AIO_UNLOCK;
701
702 ExitRoutine:
703 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END,
704 (int)p, 0, 0, 0, 0 );
705
706 return;
707
708 } /* _aio_exit */
709
710
711 /*
712 * do_aio_cancel - cancel async IO requests (if possible). We get called by
713 * aio_cancel, close, and at exit.
714 * There are three modes of operation: 1) cancel all async IOs for a process
715 * (fd is 0 and aiocbp is NULL); 2) cancel all async IOs for a file descriptor
716 * (fd is > 0 and aiocbp is NULL); 3) cancel the one async IO associated with
717 * the given aiocbp.
718 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
719 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
720 * target async IO requests, and AIO_ALLDONE if all target async IO requests
721 * were already complete.
722 * WARNING - do not dereference aiocbp in this routine; it may point to user
723 * land data that has not been copied in (when called from aio_cancel()).
724 * NOTE - kernel funnel lock is held when we get called.
725 */
726
727 static int
728 do_aio_cancel( struct proc *p, int fd, struct aiocb *aiocbp,
729 boolean_t wait_for_completion, boolean_t disable_notification )
730 {
731 aio_workq_entry *entryp;
732 int result;
733
734 result = -1;
735
736 /* look for a match on our queue of async todo work. */
737 AIO_LOCK;
738 entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
739 while ( entryp != NULL ) {
740 aio_workq_entry *next_entryp;
741
742 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
743 if ( p == entryp->procp ) {
744 if ( (aiocbp == NULL && fd == 0) ||
745 (aiocbp != NULL && entryp->uaiocbp == aiocbp) ||
746 (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) {
747 /* we found a match so we remove the entry from the */
748 /* todo work queue and place it on the done queue */
749 TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
750 aio_anchor.aio_async_workq_count--;
751 entryp->errorval = ECANCELED;
752 entryp->returnval = -1;
753 if ( disable_notification )
754 entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
755 result = AIO_CANCELED;
756
757 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) | DBG_FUNC_NONE,
758 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
759
760 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
761 aio_anchor.aio_done_count++;
762 p->aio_done_count++;
763 entryp->flags |= AIO_COMPLETION;
764 AIO_UNLOCK;
765
766 /* do completion processing for this request */
767 do_aio_completion( entryp );
768
769 AIO_LOCK;
770 entryp->flags &= ~AIO_COMPLETION;
771 if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
772 vm_map_t my_map;
773
774 my_map = entryp->aio_map;
775 entryp->aio_map = VM_MAP_NULL;
776 AIO_UNLOCK;
777 aio_free_request( entryp, my_map );
778 }
779 else
780 AIO_UNLOCK;
781
782 if ( aiocbp != NULL ) {
783 return( result );
784 }
785
786 /* need to start over since aio_async_workq may have been */
787 /* changed while we were away doing completion processing. */
788 AIO_LOCK;
789 entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
790 continue;
791 }
792 }
793 entryp = next_entryp;
794 } /* while... */
795
796 /*
797 * look for a match on our queue of synchronous todo work. This will
798 * be a rare occurrence but could happen if a process is terminated while
799 * processing a lio_listio call.
800 */
801 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
802 while ( entryp != NULL ) {
803 aio_workq_entry *next_entryp;
804
805 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
806 if ( p == entryp->procp ) {
807 if ( (aiocbp == NULL && fd == 0) ||
808 (aiocbp != NULL && entryp->uaiocbp == aiocbp) ||
809 (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) {
810 /* we found a match so we remove the entry from the */
811 /* todo work queue and place it on the done queue */
812 TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
813 aio_anchor.lio_sync_workq_count--;
814 entryp->errorval = ECANCELED;
815 entryp->returnval = -1;
816 if ( disable_notification )
817 entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
818 result = AIO_CANCELED;
819
820 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_sync_workq)) | DBG_FUNC_NONE,
821 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
822
823 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
824 aio_anchor.aio_done_count++;
825 p->aio_done_count++;
826 if ( aiocbp != NULL ) {
827 AIO_UNLOCK;
828 return( result );
829 }
830 }
831 }
832 entryp = next_entryp;
833 } /* while... */
834
835 /*
836 * look for a match on our queue of active async IO requests and
837 * return AIO_NOTCANCELED result.
838 */
839 TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
840 if ( (aiocbp == NULL && fd == 0) ||
841 (aiocbp != NULL && entryp->uaiocbp == aiocbp) ||
842 (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) {
843 result = AIO_NOTCANCELED;
844
845 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE,
846 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
847
848 if ( wait_for_completion )
849 entryp->flags |= AIO_WAITING; /* flag for special completion processing */
850 if ( disable_notification )
851 entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
852 if ( aiocbp != NULL ) {
853 AIO_UNLOCK;
854 return( result );
855 }
856 }
857 }
858
859 /*
860 * if we didn't find any matches on the todo or active queues then look for a
861 * match on our queue of async IO requests that have completed and if found
862 * return AIO_ALLDONE result.
863 */
864 if ( result == -1 ) {
865 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
866 if ( (aiocbp == NULL && fd == 0) ||
867 (aiocbp != NULL && entryp->uaiocbp == aiocbp) ||
868 (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) {
869 result = AIO_ALLDONE;
870
871 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE,
872 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
873
874 if ( aiocbp != NULL ) {
875 AIO_UNLOCK;
876 return( result );
877 }
878 }
879 }
880 }
881 AIO_UNLOCK;
882
883 return( result );
884
885 } /* do_aio_cancel */
886
887
888 /*
889 * aio_suspend - suspend the calling thread until at least one of the async
890 * IO operations referenced by uap->aiocblist has completed, until a signal
891 * interrupts the function, or uap->timeoutp time interval (optional) has
892 * passed.
893 * Returns 0 if one or more async IOs have completed, else -1 with errno
894 * set appropriately - EAGAIN if the timeout elapses or EINTR if an interrupt
895 * woke us up.
896 */
897
898 int
899 aio_suspend( struct proc *p, struct aio_suspend_args *uap, int *retval )
900 {
901 int error;
902 int i, count;
903 uint64_t abstime;
904 struct timespec ts;
905 struct timeval tv;
906 aio_workq_entry *entryp;
907 struct aiocb * *aiocbpp;
908
909 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START,
910 (int)p, uap->nent, 0, 0, 0 );
911
912 *retval = -1;
913 abstime = 0;
914 aiocbpp = NULL;
915
916 /* quick check to see if there are any async IO requests queued up */
917 AIO_LOCK;
918 count = aio_get_all_queues_count( );
919 AIO_UNLOCK;
920 if ( count < 1 ) {
921 error = EINVAL;
922 goto ExitThisRoutine;
923 }
924
925 if ( uap->nent < 1 || uap->nent > aio_max_requests_per_process ) {
926 error = EINVAL;
927 goto ExitThisRoutine;
928 }
929
930 if ( uap->timeoutp != NULL ) {
931 error = copyin( (void *)uap->timeoutp, &ts, sizeof(ts) );
932 if ( error != 0 ) {
933 error = EAGAIN;
934 goto ExitThisRoutine;
935 }
936
937 if ( ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000 ) {
938 error = EINVAL;
939 goto ExitThisRoutine;
940 }
941
942 nanoseconds_to_absolutetime( (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
943 &abstime );
944 clock_absolutetime_interval_to_deadline( abstime, &abstime );
945 }
946
947 MALLOC( aiocbpp, void *, (uap->nent * sizeof(struct aiocb *)), M_TEMP, M_WAITOK );
948 if ( aiocbpp == NULL ) {
949 error = EAGAIN;
950 goto ExitThisRoutine;
951 }
952
953 /* copyin our aiocb pointers from list */
954 for ( i = 0; i < uap->nent; i++ ) {
955 struct aiocb *aiocbp;
956
957 /* copy in the aiocb pointer from the list */
958 error = copyin( (void *)(uap->aiocblist + i), (aiocbpp + i), sizeof(*aiocbpp) );
959 if ( error != 0 ) {
960 error = EAGAIN;
961 goto ExitThisRoutine;
962 }
963 } /* for ( ; i < uap->nent; ) */
964
965 /* check list of aio requests to see if any have completed */
966 AIO_LOCK;
967 for ( i = 0; i < uap->nent; i++ ) {
968 struct aiocb *aiocbp;
969
970 /* NULL elements are legal so check for 'em */
971 aiocbp = *(aiocbpp + i);
972 if ( aiocbp == NULL )
973 continue;
974
975 /* return immediately if any aio request in the list is done */
976 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
977 if ( entryp->uaiocbp == aiocbp ) {
978 *retval = 0;
979 error = 0;
980 AIO_UNLOCK;
981 goto ExitThisRoutine;
982 }
983 }
984 } /* for ( ; i < uap->nent; ) */
985
986 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE,
987 (int)p, uap->nent, 0, 0, 0 );
988
989 /*
990 * wait for an async IO to complete, a signal to fire, or the timeout to expire.
991 * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
992 * interrupts us. If an async IO completes before a signal fires or our
993 * timeout expires, we get a wakeup call from aio_work_thread(). We do not
994 * use tsleep() here in order to avoid getting kernel funnel lock.
995 */
996 assert_wait( (event_t) &p->AIO_SUSPEND_SLEEP_CHAN, THREAD_ABORTSAFE );
997 AIO_UNLOCK;
998
999 if ( abstime > 0 ) {
1000 thread_set_timer_deadline( abstime );
1001 }
1002 error = thread_block( THREAD_CONTINUE_NULL );
1003
1004 if ( error == THREAD_AWAKENED ) {
1005 /* got our wakeup call from aio_work_thread() */
1006 if ( abstime > 0 ) {
1007 thread_cancel_timer();
1008 }
1009 *retval = 0;
1010 error = 0;
1011 }
1012 else if ( error == THREAD_TIMED_OUT ) {
1013 /* our timeout expired */
1014 error = EAGAIN;
1015 }
1016 else {
1017 /* we were interrupted */
1018 if ( abstime > 0 ) {
1019 thread_cancel_timer();
1020 }
1021 error = EINTR;
1022 }
1023
1024 ExitThisRoutine:
1025 if ( aiocbpp != NULL )
1026 FREE( aiocbpp, M_TEMP );
1027
1028 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_END,
1029 (int)p, uap->nent, error, 0, 0 );
1030
1031 return( error );
1032
1033 } /* aio_suspend */
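
/*
 * Illustrative userland sketch (not part of this kernel file; the #if 0 keeps
 * it out of the build): blocking until one of several queued requests finishes,
 * with a one second timeout.  NULL slots in the list are permitted, matching
 * the checks above.  The helper name wait_for_one() is hypothetical.
 */
#if 0
#include <aio.h>
#include <errno.h>
#include <time.h>

static int
wait_for_one( const struct aiocb *const list[], int nent )
{
	struct timespec ts;

	ts.tv_sec = 1;			/* give up after one second */
	ts.tv_nsec = 0;

	if ( aio_suspend( list, nent, &ts ) != 0 ) {
		/* EAGAIN if the timeout expired, EINTR if a signal woke us up */
		return errno;
	}

	return 0;			/* at least one request has completed */
}
#endif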
1034
1035
1036 /* aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
1037 * file descriptor (uap->aiocbp->aio_fildes) from the buffer
1038 * (uap->aiocbp->aio_buf).
1039 */
1040
1041 int
1042 aio_write( struct proc *p, struct aio_write_args *uap, int *retval )
1043 {
1044 int error;
1045
1046 *retval = 0;
1047
1048 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_START,
1049 (int)p, (int)uap->aiocbp, 0, 0, 0 );
1050
1051 error = aio_queue_async_request( p, uap->aiocbp, AIO_WRITE );
1052 if ( error != 0 )
1053 *retval = -1;
1054
1055 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_END,
1056 (int)p, (int)uap->aiocbp, error, 0, 0 );
1057
1058 return( error );
1059
1060 } /* aio_write */
1061
1062
1063 /*
1064 * lio_listio - initiate a list of IO requests. We process the list of aiocbs
1065 * either synchronously (mode == LIO_WAIT) or asynchronously (mode == LIO_NOWAIT).
1066 * The caller gets error and return status for each aiocb in the list via aio_error
1067 * and aio_return. We must keep completed requests until released by the
1068 * aio_return call.
1069 */
1070
1071 int
1072 lio_listio( struct proc *p, struct lio_listio_args *uap, int *retval )
1073 {
1074 int i;
1075 int call_result;
1076 int result;
1077 long group_tag;
1078 aio_workq_entry * *entryp_listp;
1079
1080 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START,
1081 (int)p, uap->nent, uap->mode, 0, 0 );
1082
1083 entryp_listp = NULL;
1084 call_result = -1;
1085 *retval = -1;
1086 if ( !(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT) ) {
1087 call_result = EINVAL;
1088 goto ExitRoutine;
1089 }
1090
1091 if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
1092 call_result = EINVAL;
1093 goto ExitRoutine;
1094 }
1095
1096 /*
1097 * we use group_tag to mark IO requests for delayed completion processing
1098 * which means we wait until all IO requests in the group have completed
1099 * before we either return to the caller when mode is LIO_WAIT or signal
1100 * user when mode is LIO_NOWAIT.
1101 */
1102 group_tag = random();
1103
1104 /*
1105 * allocate a list of aio_workq_entry pointers that we will use to queue
1106 * up all our requests at once while holding our lock.
1107 */
1108 MALLOC( entryp_listp, void *, (uap->nent * sizeof(struct aiocb *)), M_TEMP, M_WAITOK );
1109 if ( entryp_listp == NULL ) {
1110 call_result = EAGAIN;
1111 goto ExitRoutine;
1112 }
1113
1114 /* process list of aio requests */
1115 for ( i = 0; i < uap->nent; i++ ) {
1116 struct aiocb *my_aiocbp;
1117
1118 *(entryp_listp + i) = NULL;
1119
1120 /* copy in the aiocb pointer from the list */
1121 result = copyin( (void *)(uap->aiocblist + i), &my_aiocbp, sizeof(my_aiocbp) );
1122 if ( result != 0 ) {
1123 call_result = EAGAIN;
1124 continue;
1125 }
1126
1127 /* NULL elements are legal so check for 'em */
1128 if ( my_aiocbp == NULL )
1129 continue;
1130
1131 if ( uap->mode == LIO_NOWAIT )
1132 result = lio_create_async_entry( p, my_aiocbp, uap->sigp,
1133 group_tag, (entryp_listp + i) );
1134 else
1135 result = lio_create_sync_entry( p, my_aiocbp, group_tag,
1136 (entryp_listp + i) );
1137
1138 if ( result != 0 && call_result == -1 )
1139 call_result = result;
1140 }
1141
1142 /*
1143 * we need to protect this section since we do not want any of these grouped
1144 * IO requests to begin until we have them all on the queue.
1145 */
1146 AIO_LOCK;
1147 for ( i = 0; i < uap->nent; i++ ) {
1148 aio_workq_entry *entryp;
1149
1150 /* NULL elements are legal so check for 'em */
1151 entryp = *(entryp_listp + i);
1152 if ( entryp == NULL )
1153 continue;
1154
1155 /* check our aio limits to throttle bad or rude user land behavior */
1156 if ( aio_get_all_queues_count( ) >= aio_max_requests ||
1157 aio_get_process_count( entryp->procp ) >= aio_max_requests_per_process ||
1158 is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
1159 vm_map_t my_map;
1160
1161 my_map = entryp->aio_map;
1162 entryp->aio_map = VM_MAP_NULL;
1163 result = EAGAIN;
1164 AIO_UNLOCK;
1165 aio_free_request( entryp, my_map );
1166 AIO_LOCK;
1167 continue;
1168 }
1169
1170 /* place the request on the appropriate queue */
1171 if ( uap->mode == LIO_NOWAIT ) {
1172 TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1173 aio_anchor.aio_async_workq_count++;
1174
1175 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
1176 (int)p, (int)entryp->uaiocbp, 0, 0, 0 );
1177 }
1178 else {
1179 TAILQ_INSERT_TAIL( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
1180 aio_anchor.lio_sync_workq_count++;
1181 }
1182 }
1183
1184 if ( uap->mode == LIO_NOWAIT ) {
1185 /* caller does not want to wait so we'll fire off a worker thread and return */
1186 wakeup_one( &aio_anchor.aio_async_workq );
1187 }
1188 else {
1189 aio_workq_entry *entryp;
1190 int error;
1191
1192 /*
1193 * mode is LIO_WAIT - handle the IO requests now.
1194 */
1195 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
1196 while ( entryp != NULL ) {
1197 if ( p == entryp->procp && group_tag == entryp->group_tag ) {
1198 boolean_t funnel_state;
1199
1200 TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
1201 aio_anchor.lio_sync_workq_count--;
1202 AIO_UNLOCK;
1203
1204 // file system IO code path requires kernel funnel lock
1205 funnel_state = thread_funnel_set( kernel_flock, TRUE );
1206 if ( (entryp->flags & AIO_READ) != 0 ) {
1207 error = do_aio_read( entryp );
1208 }
1209 else if ( (entryp->flags & AIO_WRITE) != 0 ) {
1210 error = do_aio_write( entryp );
1211 }
1212 else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1213 error = do_aio_fsync( entryp );
1214 }
1215 else {
1216 printf( "%s - unknown aio request - flags 0x%02X \n",
1217 __FUNCTION__, entryp->flags );
1218 error = EINVAL;
1219 }
1220 entryp->errorval = error;
1221 if ( error != 0 && call_result == -1 )
1222 call_result = EIO;
1223 (void) thread_funnel_set( kernel_flock, funnel_state );
1224
1225 AIO_LOCK;
1226 /* we're done with the IO request so move it on the done queue */
1227 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
1228 aio_anchor.aio_done_count++;
1229 p->aio_done_count++;
1230
1231 /* need to start over since lio_sync_workq may have been changed while we */
1232 /* were away doing the IO. */
1233 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
1234 continue;
1235 } /* p == entryp->procp */
1236
1237 entryp = TAILQ_NEXT( entryp, aio_workq_link );
1238 } /* while ( entryp != NULL ) */
1239 } /* uap->mode == LIO_WAIT */
1240 AIO_UNLOCK;
1241
1242 /* call_result == -1 means we had no trouble queueing up requests */
1243 if ( call_result == -1 ) {
1244 call_result = 0;
1245 *retval = 0;
1246 }
1247
1248 ExitRoutine:
1249 if ( entryp_listp != NULL )
1250 FREE( entryp_listp, M_TEMP );
1251
1252 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END,
1253 (int)p, call_result, 0, 0, 0 );
1254
1255 return( call_result );
1256
1257 } /* lio_listio */
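
/*
 * Illustrative userland sketch (not part of this kernel file; the #if 0 keeps
 * it out of the build): submitting a small batch with lio_listio().  LIO_WAIT
 * returns only after every request in the list is done (the lio_sync_workq
 * path above); LIO_NOWAIT would hand the batch to the worker threads instead.
 * Per-request results are still collected with aio_error()/aio_return() on
 * each aiocb.  The helper name batch_read_write() and the buffer layout are
 * hypothetical; error handling is abbreviated.
 */
#if 0
#include <aio.h>
#include <string.h>
#include <sys/types.h>

static int
batch_read_write( int fd, char *inbuf, char *outbuf, size_t len )
{
	struct aiocb rd, wr;
	struct aiocb *list[2];

	memset( &rd, 0, sizeof(rd) );
	rd.aio_fildes = fd;
	rd.aio_buf = inbuf;
	rd.aio_nbytes = len;
	rd.aio_offset = 0;
	rd.aio_lio_opcode = LIO_READ;	/* per-entry opcode, unlike aio_read() */

	memset( &wr, 0, sizeof(wr) );
	wr.aio_fildes = fd;
	wr.aio_buf = outbuf;
	wr.aio_nbytes = len;
	wr.aio_offset = (off_t)len;
	wr.aio_lio_opcode = LIO_WRITE;

	list[0] = &rd;
	list[1] = &wr;

	/* 0 means every request completed; -1 with errno EIO means at least */
	/* one of them failed - check each aiocb with aio_error()            */
	return lio_listio( LIO_WAIT, list, 2, NULL );
}
#endif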
1258
1259
1260 /*
1261 * aio worker thread. this is where all the real work gets done.
1262 * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
1263 * after new work is queued up.
1264 */
1265
1266 static void
1267 aio_work_thread( void )
1268 {
1269 aio_workq_entry *entryp;
1270 struct uthread *uthread = (struct uthread *)get_bsdthread_info(current_act());
1271
1272 for( ;; ) {
1273 AIO_LOCK;
1274 entryp = aio_get_some_work();
1275 if ( entryp == NULL ) {
1276 /*
1277 * aio worker threads wait for some work to get queued up
1278 * by aio_queue_async_request. Once some work gets queued
1279 * it will wake up one of these worker threads just before
1280 * returning to our caller in user land. We do not use
1281 * tsleep() here in order to avoid getting kernel funnel lock.
1282 */
1283 assert_wait( (event_t) &aio_anchor.aio_async_workq, THREAD_UNINT );
1284 AIO_UNLOCK;
1285 thread_block( THREAD_CONTINUE_NULL );
1286
1287 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_wake)) | DBG_FUNC_NONE,
1288 0, 0, 0, 0, 0 );
1289 }
1290 else {
1291 int error;
1292 boolean_t funnel_state;
1293 vm_map_t currentmap;
1294 vm_map_t oldmap = VM_MAP_NULL;
1295 task_t oldaiotask = TASK_NULL;
1296
1297 AIO_UNLOCK;
1298
1299 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START,
1300 (int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0 );
1301
1302 /*
1303 * Assume the target's address space identity for the duration
1304 * of the IO.
1305 */
1306 funnel_state = thread_funnel_set( kernel_flock, TRUE );
1307
1308 currentmap = get_task_map( (current_proc())->task );
1309 if ( currentmap != entryp->aio_map ) {
1310 oldaiotask = uthread->uu_aio_task;
1311 uthread->uu_aio_task = entryp->procp->task;
1312 oldmap = vm_map_switch( entryp->aio_map );
1313 }
1314
1315 if ( (entryp->flags & AIO_READ) != 0 ) {
1316 error = do_aio_read( entryp );
1317 }
1318 else if ( (entryp->flags & AIO_WRITE) != 0 ) {
1319 error = do_aio_write( entryp );
1320 }
1321 else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1322 error = do_aio_fsync( entryp );
1323 }
1324 else {
1325 printf( "%s - unknown aio request - flags 0x%02X \n",
1326 __FUNCTION__, entryp->flags );
1327 error = EINVAL;
1328 }
1329 entryp->errorval = error;
1330 if ( currentmap != entryp->aio_map ) {
1331 (void) vm_map_switch( oldmap );
1332 uthread->uu_aio_task = oldaiotask;
1333 }
1334
1335 /* we're done with the IO request so pop it off the active queue and */
1336 /* push it on the done queue */
1337 AIO_LOCK;
1338 TAILQ_REMOVE( &entryp->procp->aio_activeq, entryp, aio_workq_link );
1339 aio_anchor.aio_active_count--;
1340 entryp->procp->aio_active_count--;
1341 TAILQ_INSERT_TAIL( &entryp->procp->aio_doneq, entryp, aio_workq_link );
1342 aio_anchor.aio_done_count++;
1343 entryp->procp->aio_done_count++;
1344 entryp->flags |= AIO_COMPLETION;
1345
1346 /* remove our reference to the user land map. */
1347 if ( VM_MAP_NULL != entryp->aio_map ) {
1348 vm_map_t my_map;
1349
1350 my_map = entryp->aio_map;
1351 entryp->aio_map = VM_MAP_NULL;
1352 AIO_UNLOCK; /* must unlock before calling vm_map_deallocate() */
1353 vm_map_deallocate( my_map );
1354 }
1355 else {
1356 AIO_UNLOCK;
1357 }
1358
1359 do_aio_completion( entryp );
1360 (void) thread_funnel_set( kernel_flock, funnel_state );
1361
1362 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END,
1363 (int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval,
1364 entryp->returnval, 0 );
1365
1366 AIO_LOCK;
1367 entryp->flags &= ~AIO_COMPLETION;
1368 if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
1369 vm_map_t my_map;
1370
1371 my_map = entryp->aio_map;
1372 entryp->aio_map = VM_MAP_NULL;
1373 AIO_UNLOCK;
1374 aio_free_request( entryp, my_map );
1375 }
1376 else
1377 AIO_UNLOCK;
1378 }
1379 } /* for ( ;; ) */
1380
1381 /* NOT REACHED */
1382
1383 } /* aio_work_thread */
1384
1385
1386 /*
1387 * aio_get_some_work - get the next async IO request that is ready to be executed.
1388 * aio_fsync complicates matters a bit since we cannot do the fsync until all async
1389 * IO requests that were queued at the time the aio_fsync call came in have completed.
1390 * NOTE - AIO_LOCK must be held by caller
1391 */
1392
1393 static aio_workq_entry *
1394 aio_get_some_work( void )
1395 {
1396 aio_workq_entry *entryp;
1397 int skip_count = 0;
1398
1399 /* pop some work off the work queue and add to our active queue */
1400 for ( entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
1401 entryp != NULL;
1402 entryp = TAILQ_NEXT( entryp, aio_workq_link ) ) {
1403
1404 if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1405 /* leave aio_fsync calls on the work queue if there are IO */
1406 /* requests on the active queue for the same file descriptor. */
1407 if ( aio_delay_fsync_request( entryp ) ) {
1408
1409 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) | DBG_FUNC_NONE,
1410 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1411 continue;
1412 }
1413 }
1414 break;
1415 }
1416
1417 if ( entryp != NULL ) {
1418 TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1419 aio_anchor.aio_async_workq_count--;
1420 TAILQ_INSERT_TAIL( &entryp->procp->aio_activeq, entryp, aio_workq_link );
1421 aio_anchor.aio_active_count++;
1422 entryp->procp->aio_active_count++;
1423 }
1424
1425 return( entryp );
1426
1427 } /* aio_get_some_work */
1428
1429
1430 /*
1431 * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed at
1432 * this time. Delay will happen when there are any active IOs for the same file
1433 * descriptor that were queued at the time the aio_fsync call was queued.
1434 * NOTE - AIO_LOCK must be held by caller
1435 */
1436 static boolean_t
1437 aio_delay_fsync_request( aio_workq_entry *entryp )
1438 {
1439 aio_workq_entry *my_entryp;
1440
1441 TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1442 if ( my_entryp->fsyncp != NULL &&
1443 entryp->uaiocbp == my_entryp->fsyncp &&
1444 entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1445 return( TRUE );
1446 }
1447 }
1448
1449 return( FALSE );
1450
1451 } /* aio_delay_fsync_request */
1452
1453
1454 /*
1455 * aio_queue_async_request - queue up an async IO request on our work queue then
1456 * wake up one of our worker threads to do the actual work. We get a reference
1457 * to our caller's user land map in order to keep it around while we are
1458 * processing the request.
1459 */
1460
1461 static int
1462 aio_queue_async_request( struct proc *procp, struct aiocb *aiocbp, int kindOfIO )
1463 {
1464 aio_workq_entry *entryp;
1465 int result;
1466
1467 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1468 if ( entryp == NULL ) {
1469 result = EAGAIN;
1470 goto error_exit;
1471 }
1472 bzero( entryp, sizeof(*entryp) );
1473
1474 /* fill in the rest of the aio_workq_entry */
1475 entryp->procp = procp;
1476 entryp->uaiocbp = aiocbp;
1477 entryp->flags |= kindOfIO;
1478 entryp->aio_map = VM_MAP_NULL;
1479 result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1480 if ( result != 0 ) {
1481 result = EAGAIN;
1482 goto error_exit;
1483 }
1484
1485 /* do some more validation on the aiocb and embedded file descriptor */
1486 result = aio_validate( entryp );
1487 if ( result != 0 )
1488 goto error_exit;
1489
1490 /* get a reference to the user land map in order to keep it around */
1491 entryp->aio_map = get_task_map( procp->task );
1492 vm_map_reference( entryp->aio_map );
1493
1494 AIO_LOCK;
1495
1496 if ( is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
1497 AIO_UNLOCK;
1498 result = EAGAIN;
1499 goto error_exit;
1500 }
1501
1502 /* check our aio limits to throttle bad or rude user land behavior */
1503 if ( aio_get_all_queues_count( ) >= aio_max_requests ||
1504 aio_get_process_count( procp ) >= aio_max_requests_per_process ) {
1505 AIO_UNLOCK;
1506 result = EAGAIN;
1507 goto error_exit;
1508 }
1509
1510 /*
1511 * aio_fsync calls sync up all async IO requests queued at the time
1512 * the aio_fsync call was made. So we mark each currently queued async
1513 * IO with a matching file descriptor as one that must complete before we do the
1514 * fsync. We set the fsyncp field of each matching async IO
1515 * request with the aiocb pointer passed in on the aio_fsync call to
1516 * know which IOs must complete before we process the aio_fsync call.
1517 */
1518 if ( (kindOfIO & AIO_FSYNC) != 0 )
1519 aio_mark_requests( entryp );
1520
1521 /* queue up on our aio asynchronous work queue */
1522 TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1523 aio_anchor.aio_async_workq_count++;
1524
1525 wakeup_one( &aio_anchor.aio_async_workq );
1526 AIO_UNLOCK;
1527
1528 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
1529 (int)procp, (int)aiocbp, 0, 0, 0 );
1530
1531 return( 0 );
1532
1533 error_exit:
1534 if ( entryp != NULL ) {
1535 /* this entry has not been queued up so no worries about unlocked */
1536 /* state and aio_map */
1537 aio_free_request( entryp, entryp->aio_map );
1538 }
1539
1540 return( result );
1541
1542 } /* aio_queue_async_request */
1543
1544
1545 /*
1546 * lio_create_async_entry - allocate an aio_workq_entry and fill it in.
1547 * If all goes well return 0 and pass the aio_workq_entry pointer back to
1548 * our caller. We get a reference to our caller's user land map in order to keep
1549 * it around while we are processing the request.
1550 * lio_listio calls behave differently at completion: they do completion notification
1551 * when all async IO requests in the group have completed. We use group_tag to tag IO requests
1552 * that behave in this delayed notification manner.
1553 */
1554
1555 static int
1556 lio_create_async_entry( struct proc *procp, struct aiocb *aiocbp,
1557 struct sigevent *sigp, long group_tag,
1558 aio_workq_entry **entrypp )
1559 {
1560 aio_workq_entry *entryp;
1561 int result;
1562
1563 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1564 if ( entryp == NULL ) {
1565 result = EAGAIN;
1566 goto error_exit;
1567 }
1568 bzero( entryp, sizeof(*entryp) );
1569
1570 /* fill in the rest of the aio_workq_entry */
1571 entryp->procp = procp;
1572 entryp->uaiocbp = aiocbp;
1573 entryp->flags |= AIO_LIO;
1574 entryp->group_tag = group_tag;
1575 entryp->aio_map = VM_MAP_NULL;
1576 result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1577 if ( result != 0 ) {
1578 result = EAGAIN;
1579 goto error_exit;
1580 }
1581
1582 /* look for lio_listio LIO_NOP requests and ignore them. */
1583 /* Not really an error, but we need to free our aio_workq_entry. */
1584 if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
1585 result = 0;
1586 goto error_exit;
1587 }
1588
1589 /* use sigevent passed in to lio_listio for each of our calls, but only */
1590 /* do completion notification after the last request completes. */
1591 if ( sigp != NULL ) {
1592 result = copyin( sigp, &entryp->aiocb.aio_sigevent, sizeof(entryp->aiocb.aio_sigevent) );
1593 if ( result != 0 ) {
1594 result = EAGAIN;
1595 goto error_exit;
1596 }
1597 }
1598
1599 /* do some more validation on the aiocb and embedded file descriptor */
1600 result = aio_validate( entryp );
1601 if ( result != 0 )
1602 goto error_exit;
1603
1604 /* get a reference to the user land map in order to keep it around */
1605 entryp->aio_map = get_task_map( procp->task );
1606 vm_map_reference( entryp->aio_map );
1607
1608 *entrypp = entryp;
1609 return( 0 );
1610
1611 error_exit:
1612 if ( entryp != NULL )
1613 zfree( aio_workq_zonep, (vm_offset_t) entryp );
1614
1615 return( result );
1616
1617 } /* lio_create_async_entry */
1618
1619
1620 /*
1621 * aio_mark_requests - aio_fsync calls synchronize file data for all queued async IO
1622 * requests at the moment the aio_fsync call is queued. We use aio_workq_entry.fsyncp
1623 * to mark each async IO that must complete before the fsync is done. We use the uaiocbp
1624 * field from the aio_fsync call as the aio_workq_entry.fsyncp in marked requests.
1625 * NOTE - AIO_LOCK must be held by caller
1626 */
1627
1628 static void
1629 aio_mark_requests( aio_workq_entry *entryp )
1630 {
1631 aio_workq_entry *my_entryp;
1632
1633 TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1634 if ( entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1635 my_entryp->fsyncp = entryp->uaiocbp;
1636 }
1637 }
1638
1639 TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
1640 if ( entryp->procp == my_entryp->procp &&
1641 entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1642 my_entryp->fsyncp = entryp->uaiocbp;
1643 }
1644 }
1645
1646 } /* aio_mark_requests */
1647
1648
1649 /*
1650 * lio_create_sync_entry - allocate an aio_workq_entry and fill it in.
1651 * If all goes well return 0 and pass the aio_workq_entry pointer back to
1652 * our caller.
1653 * lio_listio calls behave differently at completion: they do completion notification
1654 * when all async IO requests in the group have completed. We use group_tag to tag IO requests
1655 * that behave in this delayed notification manner.
1656 */
1657
1658 static int
1659 lio_create_sync_entry( struct proc *procp, struct aiocb *aiocbp,
1660 long group_tag, aio_workq_entry **entrypp )
1661 {
1662 aio_workq_entry *entryp;
1663 int result;
1664
1665 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1666 if ( entryp == NULL ) {
1667 result = EAGAIN;
1668 goto error_exit;
1669 }
1670 bzero( entryp, sizeof(*entryp) );
1671
1672 /* fill in the rest of the aio_workq_entry */
1673 entryp->procp = procp;
1674 entryp->uaiocbp = aiocbp;
1675 entryp->flags |= AIO_LIO;
1676 entryp->group_tag = group_tag;
1677 entryp->aio_map = VM_MAP_NULL;
1678 result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1679 if ( result != 0 ) {
1680 result = EAGAIN;
1681 goto error_exit;
1682 }
1683
1684 /* look for lio_listio LIO_NOP requests and ignore them. */
1685 /* Not really an error, but we need to free our aio_workq_entry. */
1686 if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
1687 result = 0;
1688 goto error_exit;
1689 }
1690
1691 result = aio_validate( entryp );
1692 if ( result != 0 ) {
1693 goto error_exit;
1694 }
1695
1696 *entrypp = entryp;
1697 return( 0 );
1698
1699 error_exit:
1700 if ( entryp != NULL )
1701 zfree( aio_workq_zonep, (vm_offset_t) entryp );
1702
1703 return( result );
1704
1705 } /* lio_create_sync_entry */
1706
1707
1708 /*
1709 * aio_free_request - remove our reference on the user land map and
1710 * free the work queue entry resources.
1711 * We are not holding the lock here; thus aio_map is passed in, having been
1712 * zeroed while we did hold the lock.
1713 */
1714
1715 static int
1716 aio_free_request( aio_workq_entry *entryp, vm_map_t the_map )
1717 {
1718 /* remove our reference to the user land map. */
1719 if ( VM_MAP_NULL != the_map ) {
1720 vm_map_deallocate( the_map );
1721 }
1722
1723 zfree( aio_workq_zonep, (vm_offset_t) entryp );
1724
1725 return( 0 );
1726
1727 } /* aio_free_request */
1728
1729
1730 /* aio_validate - validate the aiocb passed in by one of the aio syscalls.
1731 */
1732
1733 static int
1734 aio_validate( aio_workq_entry *entryp )
1735 {
1736 boolean_t funnel_state;
1737 struct file *fp;
1738 int flag;
1739 int result;
1740
1741 result = 0;
1742
1743 if ( (entryp->flags & AIO_LIO) != 0 ) {
1744 if ( entryp->aiocb.aio_lio_opcode == LIO_READ )
1745 entryp->flags |= AIO_READ;
1746 else if ( entryp->aiocb.aio_lio_opcode == LIO_WRITE )
1747 entryp->flags |= AIO_WRITE;
1748 else if ( entryp->aiocb.aio_lio_opcode == LIO_NOP )
1749 return( 0 );
1750 else
1751 return( EINVAL );
1752 }
1753
1754 flag = FREAD;
1755 if ( (entryp->flags & (AIO_WRITE | AIO_FSYNC)) != 0 ) {
1756 flag = FWRITE;
1757 }
1758
1759 if ( (entryp->flags & (AIO_READ | AIO_WRITE)) != 0 ) {
1760 if ( entryp->aiocb.aio_offset < 0 ||
1761 entryp->aiocb.aio_nbytes < 0 ||
1762 entryp->aiocb.aio_nbytes > INT_MAX ||
1763 entryp->aiocb.aio_buf == NULL )
1764 return( EINVAL );
1765 }
1766
1767 /* validate aiocb.aio_sigevent. at this point we only support sigev_notify
1768 * equal to SIGEV_SIGNAL or SIGEV_NONE. this means sigev_value,
1769 * sigev_notify_function, and sigev_notify_attributes are ignored.
1770 */
1771 if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ) {
1772 int signum;
1773 /* make sure we have a valid signal number */
1774 signum = entryp->aiocb.aio_sigevent.sigev_signo;
1775 if ( signum <= 0 || signum >= NSIG ||
1776 signum == SIGKILL || signum == SIGSTOP )
1777 return (EINVAL);
1778 }
1779 else if ( entryp->aiocb.aio_sigevent.sigev_notify != SIGEV_NONE )
1780 return (EINVAL);
1781
1782 /* validate the file descriptor and that the file was opened
1783 * for the appropriate read / write access. This section requires
1784 * kernel funnel lock.
1785 */
1786 funnel_state = thread_funnel_set( kernel_flock, TRUE );
1787
1788 result = fdgetf( entryp->procp, entryp->aiocb.aio_fildes, &fp );
1789 if ( result == 0 ) {
1790 if ( (fp->f_flag & flag) == 0 ) {
1791 /* we don't have read or write access */
1792 result = EBADF;
1793 }
1794 else if ( fp->f_type != DTYPE_VNODE ) {
1795 /* this is not a file */
1796 result = ESPIPE;
1797 }
1798 }
1799 else {
1800 result = EBADF;
1801 }
1802
1803 (void) thread_funnel_set( kernel_flock, funnel_state );
1804
1805 return( result );
1806
1807 } /* aio_validate */
1808
1809
1810 /*
1811 * aio_get_process_count - runs through our queues that hold outstanding
1812 * async IO requests and totals up the number of requests for the given
1813 * process.
1814 * NOTE - caller must hold aio lock!
1815 */
1816
1817 static int
1818 aio_get_process_count( struct proc *procp )
1819 {
1820 aio_workq_entry *entryp;
1821 int error;
1822 int count;
1823
1824 /* begin with count of completed async IO requests for this process */
1825 count = procp->aio_done_count;
1826
1827 /* add in count of active async IO requests for this process */
1828 count += procp->aio_active_count;
1829
1830 /* look for matches on our queue of asynchronous todo work */
1831 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
1832 if ( procp == entryp->procp ) {
1833 count++;
1834 }
1835 }
1836
1837 /* look for matches on our queue of synchronous todo work */
1838 TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
1839 if ( procp == entryp->procp ) {
1840 count++;
1841 }
1842 }
1843
1844 return( count );
1845
1846 } /* aio_get_process_count */
1847
1848
1849 /*
1850 * aio_get_all_queues_count - get total number of entries on all aio work queues.
1851 * NOTE - caller must hold aio lock!
1852 */
1853
1854 static int
1855 aio_get_all_queues_count( void )
1856 {
1857 int count;
1858
1859 count = aio_anchor.aio_async_workq_count;
1860 count += aio_anchor.lio_sync_workq_count;
1861 count += aio_anchor.aio_active_count;
1862 count += aio_anchor.aio_done_count;
1863
1864 return( count );
1865
1866 } /* aio_get_all_queues_count */
1867
1868
1869 /*
1870 * do_aio_completion - handle async IO completion.
1871 */
1872
1873 static void
1874 do_aio_completion( aio_workq_entry *entryp )
1875 {
1876 /* signal user land process if appropriate */
1877 if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
1878 (entryp->flags & AIO_DISABLE) == 0 ) {
1879
1880 /*
1881 * if group_tag is non-zero then make sure this is the last IO request
1882 * in the group before we signal.
1883 */
1884 if ( entryp->group_tag == 0 ||
1885 (entryp->group_tag != 0 && aio_last_group_io( entryp )) ) {
1886 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) | DBG_FUNC_NONE,
1887 (int)entryp->procp, (int)entryp->uaiocbp,
1888 entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 );
1889
1890 psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );
1891 return;
1892 }
1893 }
1894
1895 /*
1896 * need to handle case where a process is trying to exit, exec, or close
1897 * and is currently waiting for active aio requests to complete. If
1898 * AIO_WAITING is set then we need to look to see if there are any
1899 * other requests in the active queue for this process. If there are
1900 * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel. If
1901 * there are some still active then do nothing - we only want to wakeup
1902 * when all active aio requests for the process are complete.
1903 */
1904 if ( (entryp->flags & AIO_WAITING) != 0 ) {
1905 int active_requests;
1906
1907 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
1908 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1909
1910 AIO_LOCK;
1911 active_requests = aio_active_requests_for_process( entryp->procp );
1912
1913 if ( active_requests < 1 ) {
1914 /* no active aio requests for this process, continue exiting */
1915 wakeup_one( &entryp->procp->AIO_CLEANUP_SLEEP_CHAN );
1916
1917 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
1918 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1919 }
1920 AIO_UNLOCK;
1921 return;
1922 }
1923
1924 /*
1925 * aio_suspend case when a signal was not requested. In that scenario we
1926 * are sleeping on the AIO_SUSPEND_SLEEP_CHAN channel.
1927 * NOTE - the assumption here is that this wakeup call is inexpensive;
1928 * we really only need to do it when an aio_suspend call is pending.
1929 * If the wakeup ever proves too expensive, we could mark only the
1930 * async IO requests named in the list passed to aio_suspend and call
1931 * wakeup for those requests, then clear the marks again once the
1932 * aio_suspend call wakes up.
1933 */
1934 AIO_LOCK;
1935 wakeup_one( &entryp->procp->AIO_SUSPEND_SLEEP_CHAN );
1936 AIO_UNLOCK;
1937
1938 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE,
1939 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1940
1941 return;
1942
1943 } /* do_aio_completion */
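
/*
 * Illustrative only - a user-space sketch (not built here) of the
 * aio_suspend() pattern the wakeup above exists to serve: a caller that
 * requested no completion signal parks in aio_suspend() and is woken when
 * any of the listed requests completes. The aiocb "cb" is assumed to have
 * been queued already (e.g. via aio_read()).
 */
#if 0
#include <aio.h>
#include <errno.h>
#include <time.h>

static int
wait_for_one( struct aiocb *cb )
{
	const struct aiocb	*list[ 1 ];
	struct timespec		ts;

	list[ 0 ] = cb;
	ts.tv_sec = 5;			/* give up after five seconds */
	ts.tv_nsec = 0;

	if ( aio_suspend( list, 1, &ts ) != 0 )
		return( errno );	/* EAGAIN on timeout, EINTR on signal */

	return( aio_error( cb ) );	/* 0 once the request completed successfully */
}
#endif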
1944
1945
1946 /*
1947 * aio_last_group_io - checks to see if this is the last unfinished IO request
1948 * for the given group_tag. Returns TRUE if there are no other active IO
1949 * requests for this group, or FALSE if there are still active IO requests.
1950 * NOTE - AIO_LOCK must be held by caller
1951 */
1952
1953 static boolean_t
1954 aio_last_group_io( aio_workq_entry *entryp )
1955 {
1956 aio_workq_entry *my_entryp;
1957
1958 /* look for matches on our queue of active async IO requests */
1959 TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1960 if ( my_entryp->group_tag == entryp->group_tag )
1961 return( FALSE );
1962 }
1963
1964 /* look for matches on our queue of asynchronous todo work */
1965 TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
1966 if ( my_entryp->group_tag == entryp->group_tag )
1967 return( FALSE );
1968 }
1969
1970 /* look for matches on our queue of synchronous todo work */
1971 TAILQ_FOREACH( my_entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
1972 if ( my_entryp->group_tag == entryp->group_tag )
1973 return( FALSE );
1974 }
1975
1976 return( TRUE );
1977
1978 } /* aio_last_group_io */
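
/*
 * Illustrative only - a user-space sketch (not built here) of the
 * lio_listio() case the group_tag logic above handles: with LIO_NOWAIT and
 * a SIGEV_SIGNAL sigevent, one signal is delivered when the last request
 * in the submitted group completes, rather than one per request.
 */
#if 0
#include <aio.h>
#include <signal.h>
#include <string.h>

static int
submit_group( int fd, char *buf_a, char *buf_b, size_t len )
{
	static struct aiocb	cb_a, cb_b;	/* must stay valid until completion */
	struct aiocb		*list[ 2 ];
	struct sigevent		sev;

	memset( &cb_a, 0, sizeof( cb_a ) );
	cb_a.aio_fildes = fd;
	cb_a.aio_buf = buf_a;
	cb_a.aio_nbytes = len;
	cb_a.aio_offset = 0;
	cb_a.aio_lio_opcode = LIO_READ;

	cb_b = cb_a;				/* second read, next block of the file */
	cb_b.aio_buf = buf_b;
	cb_b.aio_offset = (off_t)len;

	list[ 0 ] = &cb_a;
	list[ 1 ] = &cb_b;

	memset( &sev, 0, sizeof( sev ) );
	sev.sigev_notify = SIGEV_SIGNAL;
	sev.sigev_signo = SIGUSR1;		/* fires once, after both reads are done */

	return( lio_listio( LIO_NOWAIT, list, 2, &sev ) );
}
#endif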
1979
1980
1981 /*
1982 * do_aio_read
1983 */
1984 static int
1985 do_aio_read( aio_workq_entry *entryp )
1986 {
1987 struct file *fp;
1988 int error;
1989
1990 fp = holdfp( entryp->procp->p_fd, entryp->aiocb.aio_fildes, FREAD );
1991 if ( fp != NULL ) {
1992 error = dofileread( entryp->procp, fp, entryp->aiocb.aio_fildes,
1993 (void *)entryp->aiocb.aio_buf,
1994 entryp->aiocb.aio_nbytes,
1995 entryp->aiocb.aio_offset, FOF_OFFSET,
1996 &entryp->returnval );
1997 frele( fp );
1998 }
1999 else
2000 error = EBADF;
2001
2002 return( error );
2003
2004 } /* do_aio_read */
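
/*
 * Illustrative only - a user-space sketch (not built here) of consuming
 * the result that do_aio_read() stores in entryp->returnval: poll
 * aio_error() until the request leaves EINPROGRESS, then collect the byte
 * count (or -1) with aio_return().
 */
#if 0
#include <aio.h>
#include <errno.h>
#include <unistd.h>

static ssize_t
collect_read( struct aiocb *cb )
{
	int	err;

	while ( (err = aio_error( cb )) == EINPROGRESS )
		usleep( 1000 );		/* polling only for illustration; aio_suspend is cheaper */

	if ( err != 0 ) {
		errno = err;		/* the request failed with this errno */
		return( -1 );
	}

	return( aio_return( cb ) );	/* bytes transferred, as filled in by the worker thread */
}
#endif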
2005
2006
2007 /*
2008 * do_aio_write
2009 */
2010 static int
2011 do_aio_write( aio_workq_entry *entryp )
2012 {
2013 struct file *fp;
2014 int error;
2015
2016 fp = holdfp( entryp->procp->p_fd, entryp->aiocb.aio_fildes, FWRITE );
2017 if ( fp != NULL ) {
2018 error = dofilewrite( entryp->procp, fp, entryp->aiocb.aio_fildes,
2019 (const void *)entryp->aiocb.aio_buf,
2020 entryp->aiocb.aio_nbytes,
2021 entryp->aiocb.aio_offset, FOF_OFFSET,
2022 &entryp->returnval );
2023 frele( fp );
2024 }
2025 else
2026 error = EBADF;
2027
2028 return( error );
2029
2030 } /* do_aio_write */
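
/*
 * Illustrative only - a user-space sketch (not built here) of the
 * positional semantics do_aio_write() gets from FOF_OFFSET above: the data
 * lands at aio_offset, like pwrite(2), and the descriptor's own file
 * position is left alone. The record size of 512 is a made-up example.
 */
#if 0
#include <aio.h>
#include <string.h>

static int
write_record_at( int fd, char *record, size_t reclen, off_t slot )
{
	static struct aiocb	cb;	/* must stay valid until the request completes */

	memset( &cb, 0, sizeof( cb ) );
	cb.aio_fildes = fd;		/* descriptor must have been opened with write access */
	cb.aio_buf = record;
	cb.aio_nbytes = reclen;
	cb.aio_offset = slot * 512;	/* explicit position; fd's offset is untouched */
	cb.aio_sigevent.sigev_notify = SIGEV_NONE;

	return( aio_write( &cb ) );
}
#endif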
2031
2032
2033 /*
2034 * aio_active_requests_for_process - return number of active async IO
2035 * requests for the given process.
2036 * NOTE - caller must hold aio lock!
2037 */
2038
2039 static int
2040 aio_active_requests_for_process( struct proc *procp )
2041 {
2042
2043 return( procp->aio_active_count );
2044
2045 } /* aio_active_requests_for_process */
2046
2047
2048 /*
2049 * do_aio_fsync
2050 */
2051 static int
2052 do_aio_fsync( aio_workq_entry *entryp )
2053 {
2054 register struct vnode *vp;
2055 struct file *fp;
2056 int error;
2057
2058 /*
2059 * NOTE - we will not support AIO_DSYNC until fdatasync() is supported.
2060 * AIO_DSYNC is caught before we queue up a request and flagged as an error.
2061 * The following was shamelessly extracted from fsync() implementation.
2062 */
2063 error = getvnode( entryp->procp, entryp->aiocb.aio_fildes, &fp );
2064 if ( error == 0 ) {
2065 vp = (struct vnode *)fp->f_data;
2066 vn_lock( vp, LK_EXCLUSIVE | LK_RETRY, entryp->procp );
2067 error = VOP_FSYNC( vp, fp->f_cred, MNT_WAIT, entryp->procp );
2068 VOP_UNLOCK( vp, 0, entryp->procp );
2069 }
2070 if ( error != 0 )
2071 entryp->returnval = -1;
2072
2073 return( error );
2074
2075 } /* do_aio_fsync */
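
/*
 * Illustrative only - a user-space sketch (not built here) matching the
 * note above: aio_fsync() is queued with O_SYNC, while the O_DSYNC flavor
 * is rejected before a request is ever queued because fdatasync() is not
 * supported.
 */
#if 0
#include <aio.h>
#include <fcntl.h>
#include <string.h>

static int
flush_async( int fd )
{
	static struct aiocb	cb;	/* must stay valid until the request completes */

	memset( &cb, 0, sizeof( cb ) );
	cb.aio_fildes = fd;		/* descriptor must allow writing */
	cb.aio_sigevent.sigev_notify = SIGEV_NONE;

	/* O_SYNC is the supported flavor; O_DSYNC would fail up front */
	return( aio_fsync( O_SYNC, &cb ) );
}
#endif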
2076
2077
2078 /*
2079 * is_already_queued - runs through our queues to see if the given
2080 * aiocbp / process is there. Returns TRUE if there is a match
2081 * on any of our aio queues.
2082 * NOTE - callers must hold aio lock!
2083 */
2084
2085 static boolean_t
2086 is_already_queued( struct proc *procp,
2087 struct aiocb *aiocbp )
2088 {
2089 aio_workq_entry *entryp;
2090 boolean_t result;
2091
2092 result = FALSE;
2093
2094 /* look for matches on our queue of async IO requests that have completed */
2095 TAILQ_FOREACH( entryp, &procp->aio_doneq, aio_workq_link ) {
2096 if ( aiocbp == entryp->uaiocbp ) {
2097 result = TRUE;
2098 goto ExitThisRoutine;
2099 }
2100 }
2101
2102 /* look for matches on our queue of active async IO requests */
2103 TAILQ_FOREACH( entryp, &procp->aio_activeq, aio_workq_link ) {
2104 if ( aiocbp == entryp->uaiocbp ) {
2105 result = TRUE;
2106 goto ExitThisRoutine;
2107 }
2108 }
2109
2110 /* look for matches on our queue of asynchronous todo work */
2111 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
2112 if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
2113 result = TRUE;
2114 goto ExitThisRoutine;
2115 }
2116 }
2117
2118 /* look for matches on our queue of synchronous todo work */
2119 TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
2120 if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
2121 result = TRUE;
2122 goto ExitThisRoutine;
2123 }
2124 }
2125
2126 ExitThisRoutine:
2127 return( result );
2128
2129 } /* is_already_queued */
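
/*
 * Illustrative only - a user-space sketch (not built here) of the
 * situation is_already_queued() guards against: re-submitting an aiocb
 * that is still outstanding. The second call below is refused by the
 * submission path rather than queued twice (the specific errno is set by
 * that path, not shown here).
 */
#if 0
#include <aio.h>

static void
double_submit( struct aiocb *cb )
{
	(void) aio_read( cb );		/* first submission: queued */
	(void) aio_read( cb );		/* same aiocb while still in flight: refused, returns -1 */
}
#endif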
2130
2131
2132 /*
2133 * aio initialization
2134 */
2135 __private_extern__ void
2136 aio_init( void )
2137 {
2138 int i;
2139
2140 simple_lock_init( &aio_lock );
2141
2142 AIO_LOCK;
2143 TAILQ_INIT( &aio_anchor.aio_async_workq );
2144 TAILQ_INIT( &aio_anchor.lio_sync_workq );
2145 aio_anchor.aio_async_workq_count = 0;
2146 aio_anchor.lio_sync_workq_count = 0;
2147 aio_anchor.aio_active_count = 0;
2148 aio_anchor.aio_done_count = 0;
2149 AIO_UNLOCK;
2150
2151 i = sizeof( aio_workq_entry );
2152 aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );
2153
2154 _aio_create_worker_threads( aio_worker_threads );
2155
2156 return;
2157
2158 } /* aio_init */
2159
2160
2161 /*
2162 * aio worker threads created here.
2163 */
2164 __private_extern__ void
2165 _aio_create_worker_threads( int num )
2166 {
2167 int i;
2168
2169 /* create some worker threads to handle the async IO requests */
2170 for ( i = 0; i < num; i++ ) {
2171 thread_t myThread;
2172
2173 myThread = kernel_thread( kernel_task, aio_work_thread );
2174 if ( THREAD_NULL == myThread ) {
2175 printf( "%s - failed to create a worker thread\n", __FUNCTION__ );
2176 }
2177 }
2178
2179 return;
2180
2181 } /* _aio_create_worker_threads */
2182
2183 /*
2184 * Return the aio task associated with the current activation's uthread.
2185 */
2186 task_t
2187 get_aiotask(void)
2188 {
2189 return ((struct uthread *)get_bsdthread_info(current_act()))->uu_aio_task;
2190 }