1 /*
2 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22
23
24 /*
25 * todo:
26 * 1) ramesh is looking into how to replace taking a reference on
27 * the user's map (vm_map_reference()) since it is believed that
28 * would not hold the process for us.
29 * 2) david is looking into a way for us to set the priority of the
30 * worker threads to match that of the user's thread when the
31 * async IO was queued.
32 */
33
34
35 /*
36 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
37 */
38
39 #include <sys/systm.h>
40 #include <sys/buf.h>
41 #include <sys/fcntl.h>
42 #include <sys/file.h>
43 #include <sys/filedesc.h>
44 #include <sys/kernel.h>
45 #include <sys/vnode.h>
46 #include <sys/malloc.h>
47 #include <sys/mount.h>
48 #include <sys/param.h>
49 #include <sys/proc.h>
50 #include <sys/sysctl.h>
51 #include <sys/unistd.h>
52 #include <sys/user.h>
53
54 #include <sys/aio_kern.h>
55
56 #include <machine/limits.h>
57 #include <kern/zalloc.h>
58 #include <kern/task.h>
59
60 #include <sys/kdebug.h>
61 #define AIO_work_queued 1
62 #define AIO_worker_wake 2
63 #define AIO_completion_sig 3
64 #define AIO_completion_cleanup_wait 4
65 #define AIO_completion_cleanup_wake 5
66 #define AIO_completion_suspend_wake 6
67 #define AIO_fsync_delay 7
68 #define AIO_cancel 10
69 #define AIO_cancel_async_workq 11
70 #define AIO_cancel_sync_workq 12
71 #define AIO_cancel_activeq 13
72 #define AIO_cancel_doneq 14
73 #define AIO_fsync 20
74 #define AIO_read 30
75 #define AIO_write 40
76 #define AIO_listio 50
77 #define AIO_error 60
78 #define AIO_error_val 61
79 #define AIO_error_activeq 62
80 #define AIO_error_workq 63
81 #define AIO_return 70
82 #define AIO_return_val 71
83 #define AIO_return_activeq 72
84 #define AIO_return_workq 73
85 #define AIO_exec 80
86 #define AIO_exit 90
87 #define AIO_exit_sleep 91
88 #define AIO_close 100
89 #define AIO_close_sleep 101
90 #define AIO_suspend 110
91 #define AIO_suspend_sleep 111
92 #define AIO_worker_thread 120
93
94 #if 0
95 #undef KERNEL_DEBUG
96 #define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
97 #endif
98
99 /*
100 * aio requests queue up on the aio_async_workq or lio_sync_workq (for
101 * lio_listio LIO_WAIT). Requests then move to the per process aio_activeq
102  * (proc.aio_activeq) when one of our worker threads starts the IO.
103 * And finally, requests move to the per process aio_doneq (proc.aio_doneq)
104 * when the IO request completes. The request remains on aio_doneq until
105  * the user process calls aio_return or the process exits; either way, that is our
106 * trigger to release aio resources.
107 */
108 struct aio_anchor_cb
109 {
110 int aio_async_workq_count; /* entries on aio_async_workq */
111 int lio_sync_workq_count; /* entries on lio_sync_workq */
112 int aio_active_count; /* entries on all active queues (proc.aio_activeq) */
113 int aio_done_count; /* entries on all done queues (proc.aio_doneq) */
114 TAILQ_HEAD( , aio_workq_entry ) aio_async_workq;
115 TAILQ_HEAD( , aio_workq_entry ) lio_sync_workq;
116 };
117 typedef struct aio_anchor_cb aio_anchor_cb;
118
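/*
 * For orientation, a minimal userland sketch (not part of this file; it
 * assumes the standard POSIX <aio.h> wrappers over these syscalls, and cb
 * stands for a caller-initialized aiocb) of how one request walks the
 * queues described above: aio_read() puts it on aio_async_workq, a worker
 * thread moves it to the per process aio_activeq, completion moves it to
 * the per process aio_doneq, and aio_return() finally releases the kernel
 * resources.
 *
 *	struct aiocb cb;
 *	// ... fill in aio_fildes, aio_buf, aio_nbytes, aio_offset ...
 *	aio_read( &cb );				// -> aio_async_workq
 *	while ( aio_error( &cb ) == EINPROGRESS )	// work queue or aio_activeq
 *		;
 *	ssize_t nread = aio_return( &cb );		// reaped from aio_doneq
 */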
119
120 /*
121 * Notes on aio sleep / wake channels.
122  * We currently pick a couple of fields within the proc structure to use as
123  * sleep channels that do not collide with any other kernel routines.
124 * At this time, for binary compatibility reasons, we cannot create new proc fields.
125 */
126 #define AIO_SUSPEND_SLEEP_CHAN p_estcpu
127 #define AIO_CLEANUP_SLEEP_CHAN p_pctcpu
128
129
130 /*
131  * async IO locking macros used to protect critical sections.
132 */
133 #define AIO_LOCK usimple_lock( &aio_lock )
134 #define AIO_UNLOCK usimple_unlock( &aio_lock )
135
136
137 /*
138 * LOCAL PROTOTYPES
139 */
140 static int aio_active_requests_for_process( struct proc *procp );
141 static boolean_t aio_delay_fsync_request( aio_workq_entry *entryp );
142 static int aio_free_request( aio_workq_entry *entryp, vm_map_t the_map );
143 static int aio_get_all_queues_count( void );
144 static int aio_get_process_count( struct proc *procp );
145 static aio_workq_entry * aio_get_some_work( void );
146 static boolean_t aio_last_group_io( aio_workq_entry *entryp );
147 static void aio_mark_requests( aio_workq_entry *entryp );
148 static int aio_queue_async_request( struct proc *procp,
149 struct aiocb *aiocbp,
150 int kindOfIO );
151 static int aio_validate( aio_workq_entry *entryp );
152 static void aio_work_thread( void );
153 static int do_aio_cancel( struct proc *p,
154 int fd,
155 struct aiocb *aiocbp,
156 boolean_t wait_for_completion,
157 boolean_t disable_notification );
158 static void do_aio_completion( aio_workq_entry *entryp );
159 static int do_aio_fsync( aio_workq_entry *entryp );
160 static int do_aio_read( aio_workq_entry *entryp );
161 static int do_aio_write( aio_workq_entry *entryp );
162 static boolean_t is_already_queued( struct proc *procp,
163 struct aiocb *aiocbp );
164 static int lio_create_async_entry( struct proc *procp,
165 struct aiocb *aiocbp,
166 struct sigevent *sigp,
167 long group_tag,
168 aio_workq_entry **entrypp );
169 static int lio_create_sync_entry( struct proc *procp,
170 struct aiocb *aiocbp,
171 long group_tag,
172 aio_workq_entry **entrypp );
173
174 /*
175 * EXTERNAL PROTOTYPES
176 */
177
178 /* in ...bsd/kern/sys_generic.c */
179 extern struct file* holdfp( struct filedesc* fdp, int fd, int flag );
180 extern int dofileread( struct proc *p, struct file *fp, int fd,
181 void *buf, size_t nbyte, off_t offset,
182 int flags, int *retval );
183 extern int dofilewrite( struct proc *p, struct file *fp, int fd,
184 const void *buf, size_t nbyte, off_t offset,
185 int flags, int *retval );
186 extern vm_map_t vm_map_switch( vm_map_t map );
187
188
189 /*
190 * aio external global variables.
191 */
192 extern int aio_max_requests; /* AIO_MAX - configurable */
193 extern int aio_max_requests_per_process; /* AIO_PROCESS_MAX - configurable */
194 extern int aio_worker_threads; /* AIO_THREAD_COUNT - configurable */
195
196
197 /*
198 * aio static variables.
199 */
200 static aio_anchor_cb aio_anchor;
201 static simple_lock_data_t aio_lock;
202 static struct zone *aio_workq_zonep;
203
204
205 /*
206 * syscall input parameters
207 */
208 #ifndef _SYS_SYSPROTO_H_
209
210 struct aio_cancel_args {
211 int fd;
212 struct aiocb *aiocbp;
213 };
214
215 struct aio_error_args {
216 struct aiocb *aiocbp;
217 };
218
219 struct aio_fsync_args {
220 int op;
221 struct aiocb *aiocbp;
222 };
223
224 struct aio_read_args {
225 struct aiocb *aiocbp;
226 };
227
228 struct aio_return_args {
229 struct aiocb *aiocbp;
230 };
231
232 struct aio_suspend_args {
233 struct aiocb *const *aiocblist;
234 int nent;
235 const struct timespec *timeoutp;
236 };
237
238 struct aio_write_args {
239 struct aiocb *aiocbp;
240 };
241
242 struct lio_listio_args {
243 int mode;
244 struct aiocb *const *aiocblist;
245 int nent;
246 struct sigevent *sigp;
247 };
248
249 #endif /* _SYS_SYSPROTO_H_ */
250
251
252 /*
253 * aio_cancel - attempt to cancel one or more async IO requests currently
254 * outstanding against file descriptor uap->fd. If uap->aiocbp is not
255 * NULL then only one specific IO is cancelled (if possible). If uap->aiocbp
256 * is NULL then all outstanding async IO request for the given file
257 * descriptor are cancelled (if possible).
258 */
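/*
 * Userland sketch (not part of this file; assumes the POSIX <aio.h> wrapper,
 * which hands back the value this routine places in *retval, and cb/fd are
 * caller-supplied placeholders):
 *
 *	switch ( aio_cancel( fd, &cb ) ) {	// pass NULL to target all IOs on fd
 *	case AIO_CANCELED:	// cancelled before a worker thread picked it up
 *		break;
 *	case AIO_NOTCANCELED:	// already active; poll aio_error() until done
 *		break;
 *	case AIO_ALLDONE:	// everything had already completed
 *		break;
 *	case -1:		// no match or no queued IO; errno is EBADF here
 *		break;
 *	}
 */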
259
260 int
261 aio_cancel( struct proc *p, struct aio_cancel_args *uap, int *retval )
262 {
263 struct aiocb my_aiocb;
264 int result;
265 boolean_t funnel_state;
266
267 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START,
268 (int)p, (int)uap->aiocbp, 0, 0, 0 );
269
270 /* quick check to see if there are any async IO requests queued up */
271 AIO_LOCK;
272 result = aio_get_all_queues_count( );
273 AIO_UNLOCK;
274 if ( result < 1 ) {
275 result = EBADF;
276 goto ExitRoutine;
277 }
278
279 *retval = -1;
280 if ( uap->aiocbp != NULL ) {
281 result = copyin( uap->aiocbp, &my_aiocb, sizeof(my_aiocb) );
282 if ( result != 0 ) {
283 result = EAGAIN;
284 goto ExitRoutine;
285 }
286
287 /* NOTE - POSIX standard says a mismatch between the file */
288 /* descriptor passed in and the file descriptor embedded in */
289 /* the aiocb causes unspecified results. We return EBADF in */
290 /* that situation. */
291 if ( uap->fd != my_aiocb.aio_fildes ) {
292 result = EBADF;
293 goto ExitRoutine;
294 }
295 }
296
297 /* current BSD code assumes funnel lock is held */
298 funnel_state = thread_funnel_set( kernel_flock, TRUE );
299 result = do_aio_cancel( p, uap->fd, uap->aiocbp, FALSE, FALSE );
300 (void) thread_funnel_set( kernel_flock, funnel_state );
301
302 if ( result != -1 ) {
303 *retval = result;
304 result = 0;
305 goto ExitRoutine;
306 }
307
308 result = EBADF;
309
310 ExitRoutine:
311 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END,
312 (int)p, (int)uap->aiocbp, result, 0, 0 );
313
314 return( result );
315
316 } /* aio_cancel */
317
318
319 /*
320 * _aio_close - internal function used to clean up async IO requests for
321 * a file descriptor that is closing.
322 * NOTE - kernel funnel lock is held when we get called.
323 * THIS MAY BLOCK.
324 */
325
326 __private_extern__ void
327 _aio_close( struct proc *p, int fd )
328 {
329 int error, count;
330
331 /* quick check to see if there are any async IO requests queued up */
332 AIO_LOCK;
333 count = aio_get_all_queues_count( );
334 AIO_UNLOCK;
335 if ( count < 1 )
336 return;
337
338 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_START,
339 (int)p, fd, 0, 0, 0 );
340
341 /* cancel all async IO requests on our todo queues for this file descriptor */
342 error = do_aio_cancel( p, fd, NULL, TRUE, FALSE );
343 if ( error == AIO_NOTCANCELED ) {
344 /*
345 * AIO_NOTCANCELED is returned when we find an aio request for this process
346 * and file descriptor on the active async IO queue. Active requests cannot
347 * be cancelled so we must wait for them to complete. We will get a special
348 * wake up call on our channel used to sleep for ALL active requests to
349 * complete. This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
350 * when we must wait for all active aio requests.
351 */
352
353 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) | DBG_FUNC_NONE,
354 (int)p, fd, 0, 0, 0 );
355
356 tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_close", 0 );
357 }
358
359 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_END,
360 (int)p, fd, 0, 0, 0 );
361
362 return;
363
364 } /* _aio_close */
365
366
367 /*
368 * aio_error - return the error status associated with the async IO
369 * request referred to by uap->aiocbp. The error status is the errno
370  * value that would be set by the corresponding IO request (read, write,
371 * fdatasync, or sync).
372 */
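/*
 * Userland sketch (not part of this file; assumes the POSIX <aio.h> wrapper
 * and a caller-supplied aiocb named cb):
 *
 *	ssize_t nbytes;
 *	int err = aio_error( &cb );
 *	if ( err == EINPROGRESS )
 *		;				// still on the work or active queue
 *	else if ( err == ECANCELED )
 *		(void) aio_return( &cb );	// cancelled; still must be reaped
 *	else if ( err == 0 )
 *		nbytes = aio_return( &cb );	// done; fetch the result exactly once
 *	else
 *		;				// err is the errno from the failed IO
 */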
373
374 int
375 aio_error( struct proc *p, struct aio_error_args *uap, int *retval )
376 {
377 aio_workq_entry *entryp;
378 int error;
379
380 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_START,
381 (int)p, (int)uap->aiocbp, 0, 0, 0 );
382
383 AIO_LOCK;
384
385 /* quick check to see if there are any async IO requests queued up */
386 if ( aio_get_all_queues_count( ) < 1 ) {
387 error = EINVAL;
388 goto ExitRoutine;
389 }
390
391 /* look for a match on our queue of async IO requests that have completed */
392 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
393 if ( entryp->uaiocbp == uap->aiocbp ) {
394 *retval = entryp->errorval;
395 error = 0;
396 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) | DBG_FUNC_NONE,
397 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
398 goto ExitRoutine;
399 }
400 }
401
402 /* look for a match on our queue of active async IO requests */
403 TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
404 if ( entryp->uaiocbp == uap->aiocbp ) {
405 *retval = EINPROGRESS;
406 error = 0;
407 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) | DBG_FUNC_NONE,
408 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
409 goto ExitRoutine;
410 }
411 }
412
413 /* look for a match on our queue of todo work */
414 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
415 if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
416 *retval = EINPROGRESS;
417 error = 0;
418 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_workq)) | DBG_FUNC_NONE,
419 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
420 goto ExitRoutine;
421 }
422 }
423 error = EINVAL;
424
425 ExitRoutine:
426 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_END,
427 (int)p, (int)uap->aiocbp, error, 0, 0 );
428 AIO_UNLOCK;
429
430 return( error );
431
432 } /* aio_error */
433
434
435 /*
436 * aio_fsync - asynchronously force all IO operations associated
437 * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
438 * queued at the time of the call to the synchronized completion state.
439 * NOTE - we do not support op O_DSYNC at this point since we do not support the
440 * fdatasync() call.
441 */
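/*
 * Userland sketch (not part of this file; assumes the POSIX <aio.h> wrapper,
 * with fd a descriptor that is open for writing). Only O_SYNC is accepted
 * here; O_DSYNC fails with EINVAL until fdatasync() is supported:
 *
 *	struct aiocb sync_cb;
 *	bzero( &sync_cb, sizeof(sync_cb) );
 *	sync_cb.aio_fildes = fd;
 *	if ( aio_fsync( O_SYNC, &sync_cb ) != 0 )
 *		err( 1, "aio_fsync" );
 *	while ( aio_error( &sync_cb ) == EINPROGRESS )
 *		;
 *	(void) aio_return( &sync_cb );
 */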
442
443 int
444 aio_fsync( struct proc *p, struct aio_fsync_args *uap, int *retval )
445 {
446 int error;
447 int fsync_kind;
448
449 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_START,
450 (int)p, (int)uap->aiocbp, uap->op, 0, 0 );
451
452 *retval = 0;
453 if ( uap->op == O_SYNC )
454 fsync_kind = AIO_FSYNC;
455 #if 0 // we don't support fdatasync() call yet
456 else if ( uap->op == O_DSYNC )
457 fsync_kind = AIO_DSYNC;
458 #endif
459 else {
460 *retval = -1;
461 error = EINVAL;
462 goto ExitRoutine;
463 }
464
465 error = aio_queue_async_request( p, uap->aiocbp, fsync_kind );
466 if ( error != 0 )
467 *retval = -1;
468
469 ExitRoutine:
470 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_END,
471 (int)p, (int)uap->aiocbp, error, 0, 0 );
472
473 return( error );
474
475 } /* aio_fsync */
476
477
478 /* aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
479 * file descriptor (uap->aiocbp->aio_fildes) into the buffer
480 * (uap->aiocbp->aio_buf).
481 */
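/*
 * Userland sketch (not part of this file; assumes the POSIX <aio.h> wrapper,
 * with fd and buffer caller-supplied). The request is only queued here;
 * completion is observed later via aio_error()/aio_return() or aio_suspend():
 *
 *	struct aiocb cb;
 *	bzero( &cb, sizeof(cb) );
 *	cb.aio_fildes = fd;				// open for reading
 *	cb.aio_buf = buffer;
 *	cb.aio_nbytes = sizeof(buffer);
 *	cb.aio_offset = 4096;				// read starting at this offset
 *	cb.aio_sigevent.sigev_notify = SIGEV_NONE;
 *	if ( aio_read( &cb ) != 0 )
 *		err( 1, "aio_read" );			// e.g. EAGAIN when aio limits are hit
 */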
482
483 int
484 aio_read( struct proc *p, struct aio_read_args *uap, int *retval )
485 {
486 int error;
487
488 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_START,
489 (int)p, (int)uap->aiocbp, 0, 0, 0 );
490
491 *retval = 0;
492
493 error = aio_queue_async_request( p, uap->aiocbp, AIO_READ );
494 if ( error != 0 )
495 *retval = -1;
496
497 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_END,
498 (int)p, (int)uap->aiocbp, error, 0, 0 );
499
500 return( error );
501
502 } /* aio_read */
503
504
505 /*
506 * aio_return - return the return status associated with the async IO
507 * request referred to by uap->aiocbp. The return status is the value
508  * that would be returned by the corresponding IO request (read, write,
509  * fdatasync, or sync).  This is where we release the kernel resources
510  * held for the async IO call associated with the given aiocb pointer.
511 */
512
513 int
514 aio_return( struct proc *p, struct aio_return_args *uap, register_t *retval )
515 {
516 aio_workq_entry *entryp;
517 int error;
518 boolean_t lock_held;
519
520 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_START,
521 (int)p, (int)uap->aiocbp, 0, 0, 0 );
522
523 AIO_LOCK;
524 lock_held = TRUE;
525 *retval = 0;
526
527 /* quick check to see if there are any async IO requests queued up */
528 if ( aio_get_all_queues_count( ) < 1 ) {
529 error = EINVAL;
530 goto ExitRoutine;
531 }
532
533 /* look for a match on our queue of async IO requests that have completed */
534 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
535 if ( entryp->uaiocbp == uap->aiocbp ) {
536 TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
537 aio_anchor.aio_done_count--;
538 p->aio_done_count--;
539
540 *retval = entryp->returnval;
541
542 /* we cannot free requests that are still completing */
543 if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
544 vm_map_t my_map;
545
546 my_map = entryp->aio_map;
547 entryp->aio_map = VM_MAP_NULL;
548 AIO_UNLOCK;
549 lock_held = FALSE;
550 aio_free_request( entryp, my_map );
551 }
552 else
553 /* tell completion code to free this request */
554 entryp->flags |= AIO_DO_FREE;
555 error = 0;
556 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) | DBG_FUNC_NONE,
557 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
558 goto ExitRoutine;
559 }
560 }
561
562 /* look for a match on our queue of active async IO requests */
563 TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
564 if ( entryp->uaiocbp == uap->aiocbp ) {
565 error = EINPROGRESS;
566 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) | DBG_FUNC_NONE,
567 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
568 goto ExitRoutine;
569 }
570 }
571
572 /* look for a match on our queue of todo work */
573 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
574 if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
575 error = EINPROGRESS;
576 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_workq)) | DBG_FUNC_NONE,
577 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
578 goto ExitRoutine;
579 }
580 }
581 error = EINVAL;
582
583 ExitRoutine:
584 if ( lock_held )
585 AIO_UNLOCK;
586 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_END,
587 (int)p, (int)uap->aiocbp, error, 0, 0 );
588
589 return( error );
590
591 } /* aio_return */
592
593
594 /*
595 * _aio_exec - internal function used to clean up async IO requests for
596 * a process that is going away due to exec(). We cancel any async IOs
597 * we can and wait for those already active. We also disable signaling
598 * for cancelled or active aio requests that complete.
599 * NOTE - kernel funnel lock is held when we get called.
600 * This routine MAY block!
601 */
602
603 __private_extern__ void
604 _aio_exec( struct proc *p )
605 {
606
607 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_START,
608 (int)p, 0, 0, 0, 0 );
609
610 _aio_exit( p );
611
612 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_END,
613 (int)p, 0, 0, 0, 0 );
614
615 return;
616
617 } /* _aio_exec */
618
619
620 /*
621 * _aio_exit - internal function used to clean up async IO requests for
622 * a process that is terminating (via exit() or exec() ). We cancel any async IOs
623 * we can and wait for those already active. We also disable signaling
624 * for cancelled or active aio requests that complete. This routine MAY block!
625 * NOTE - kernel funnel lock is held when we get called.
626 */
627
628 __private_extern__ void
629 _aio_exit( struct proc *p )
630 {
631 int error, count;
632 aio_workq_entry *entryp;
633
634 /* quick check to see if there are any async IO requests queued up */
635 AIO_LOCK;
636 count = aio_get_all_queues_count( );
637 AIO_UNLOCK;
638 if ( count < 1 ) {
639 return;
640 }
641
642 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_START,
643 (int)p, 0, 0, 0, 0 );
644
645 /*
646 * cancel async IO requests on the todo work queue and wait for those
647 * already active to complete.
648 */
649 error = do_aio_cancel( p, 0, NULL, TRUE, TRUE );
650 if ( error == AIO_NOTCANCELED ) {
651 /*
652 * AIO_NOTCANCELED is returned when we find an aio request for this process
653 * on the active async IO queue. Active requests cannot be cancelled so we
654 * must wait for them to complete. We will get a special wake up call on
655 * our channel used to sleep for ALL active requests to complete. This sleep
656 * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
657 * active aio requests.
658 */
659
660 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) | DBG_FUNC_NONE,
661 (int)p, 0, 0, 0, 0 );
662
663 tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_exit", 0 );
664 }
665
666 /* release all aio resources used by this process */
667 AIO_LOCK;
668 entryp = TAILQ_FIRST( &p->aio_doneq );
669 while ( entryp != NULL ) {
670 aio_workq_entry *next_entryp;
671
672 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
673 TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
674 aio_anchor.aio_done_count--;
675 p->aio_done_count--;
676
677 /* we cannot free requests that are still completing */
678 if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
679 vm_map_t my_map;
680
681 my_map = entryp->aio_map;
682 entryp->aio_map = VM_MAP_NULL;
683 AIO_UNLOCK;
684 aio_free_request( entryp, my_map );
685
686 /* need to start over since aio_doneq may have been */
687 /* changed while we were away. */
688 AIO_LOCK;
689 entryp = TAILQ_FIRST( &p->aio_doneq );
690 continue;
691 }
692 else
693 /* tell completion code to free this request */
694 entryp->flags |= AIO_DO_FREE;
695 entryp = next_entryp;
696 }
697 AIO_UNLOCK;
698
699 ExitRoutine:
700 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END,
701 (int)p, 0, 0, 0, 0 );
702
703 return;
704
705 } /* _aio_exit */
706
707
708 /*
709 * do_aio_cancel - cancel async IO requests (if possible). We get called by
710 * aio_cancel, close, and at exit.
711  * There are three modes of operation: 1) cancel all async IOs for a process -
712  * fd is 0 and aiocbp is NULL; 2) cancel all async IOs for a file descriptor -
713  * fd is > 0 and aiocbp is NULL; 3) cancel the one async IO associated with the
714  * given aiocbp.
715 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
716 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
717 * target async IO requests, and AIO_ALLDONE if all target async IO requests
718 * were already complete.
719  * WARNING - do not dereference aiocbp in this routine; it may point to user
720  * land data that has not been copied in (when called from aio_cancel()).
721 * NOTE - kernel funnel lock is held when we get called.
722 */
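/*
 * For reference, the three call sites in this file map onto those modes
 * (this is a summary of existing calls, not new behavior):
 *
 *	aio_cancel():	do_aio_cancel( p, uap->fd, uap->aiocbp, FALSE, FALSE );
 *	_aio_close():	do_aio_cancel( p, fd,      NULL,        TRUE,  FALSE );
 *	_aio_exit():	do_aio_cancel( p, 0,       NULL,        TRUE,  TRUE  );
 */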
723
724 static int
725 do_aio_cancel( struct proc *p, int fd, struct aiocb *aiocbp,
726 boolean_t wait_for_completion, boolean_t disable_notification )
727 {
728 aio_workq_entry *entryp;
729 int result;
730
731 result = -1;
732
733 /* look for a match on our queue of async todo work. */
734 AIO_LOCK;
735 entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
736 while ( entryp != NULL ) {
737 aio_workq_entry *next_entryp;
738
739 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
740 if ( p == entryp->procp ) {
741 if ( (aiocbp == NULL && fd == 0) ||
742 (aiocbp != NULL && entryp->uaiocbp == aiocbp) ||
743 (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) {
744 /* we found a match so we remove the entry from the */
745 /* todo work queue and place it on the done queue */
746 TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
747 aio_anchor.aio_async_workq_count--;
748 entryp->errorval = ECANCELED;
749 entryp->returnval = -1;
750 if ( disable_notification )
751 entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
752 result = AIO_CANCELED;
753
754 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) | DBG_FUNC_NONE,
755 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
756
757 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
758 aio_anchor.aio_done_count++;
759 p->aio_done_count++;
760 entryp->flags |= AIO_COMPLETION;
761 AIO_UNLOCK;
762
763 /* do completion processing for this request */
764 do_aio_completion( entryp );
765
766 AIO_LOCK;
767 entryp->flags &= ~AIO_COMPLETION;
768 if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
769 vm_map_t my_map;
770
771 my_map = entryp->aio_map;
772 entryp->aio_map = VM_MAP_NULL;
773 AIO_UNLOCK;
774 aio_free_request( entryp, my_map );
775 }
776 else
777 AIO_UNLOCK;
778
779 if ( aiocbp != NULL ) {
780 return( result );
781 }
782
783 /* need to start over since aio_async_workq may have been */
784 /* changed while we were away doing completion processing. */
785 AIO_LOCK;
786 entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
787 continue;
788 }
789 }
790 entryp = next_entryp;
791 } /* while... */
792
793 /*
794 * look for a match on our queue of synchronous todo work. This will
795 * be a rare occurrence but could happen if a process is terminated while
796 * processing a lio_listio call.
797 */
798 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
799 while ( entryp != NULL ) {
800 aio_workq_entry *next_entryp;
801
802 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
803 if ( p == entryp->procp ) {
804 if ( (aiocbp == NULL && fd == 0) ||
805 (aiocbp != NULL && entryp->uaiocbp == aiocbp) ||
806 (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) {
807 /* we found a match so we remove the entry from the */
808 /* todo work queue and place it on the done queue */
809 TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
810 aio_anchor.lio_sync_workq_count--;
811 entryp->errorval = ECANCELED;
812 entryp->returnval = -1;
813 if ( disable_notification )
814 entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
815 result = AIO_CANCELED;
816
817 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_sync_workq)) | DBG_FUNC_NONE,
818 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
819
820 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
821 aio_anchor.aio_done_count++;
822 p->aio_done_count++;
823 if ( aiocbp != NULL ) {
824 AIO_UNLOCK;
825 return( result );
826 }
827 }
828 }
829 entryp = next_entryp;
830 } /* while... */
831
832 /*
833 * look for a match on our queue of active async IO requests and
834 * return AIO_NOTCANCELED result.
835 */
836 TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
837 if ( (aiocbp == NULL && fd == 0) ||
838 (aiocbp != NULL && entryp->uaiocbp == aiocbp) ||
839 (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) {
840 result = AIO_NOTCANCELED;
841
842 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE,
843 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
844
845 if ( wait_for_completion )
846 entryp->flags |= AIO_WAITING; /* flag for special completion processing */
847 if ( disable_notification )
848 entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
849 if ( aiocbp != NULL ) {
850 AIO_UNLOCK;
851 return( result );
852 }
853 }
854 }
855
856 /*
857 * if we didn't find any matches on the todo or active queues then look for a
858 * match on our queue of async IO requests that have completed and if found
859 * return AIO_ALLDONE result.
860 */
861 if ( result == -1 ) {
862 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
863 if ( (aiocbp == NULL && fd == 0) ||
864 (aiocbp != NULL && entryp->uaiocbp == aiocbp) ||
865 (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) {
866 result = AIO_ALLDONE;
867
868 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE,
869 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
870
871 if ( aiocbp != NULL ) {
872 AIO_UNLOCK;
873 return( result );
874 }
875 }
876 }
877 }
878 AIO_UNLOCK;
879
880 return( result );
881
882 } /* do_aio_cancel */
883
884
885 /*
886 * aio_suspend - suspend the calling thread until at least one of the async
887 * IO operations referenced by uap->aiocblist has completed, until a signal
888 * interrupts the function, or uap->timeoutp time interval (optional) has
889 * passed.
890  * Returns 0 if one or more async IOs have completed, else -1 with errno
891  * set appropriately - EAGAIN if the timeout elapses or EINTR if a signal
892  * wakes us up.
893 */
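/*
 * Userland sketch (not part of this file; assumes the POSIX <aio.h> wrapper,
 * with cb1/cb2 previously queued requests).  NULL entries in the list are
 * legal and are skipped:
 *
 *	const struct aiocb *const list[2] = { &cb1, &cb2 };
 *	struct timespec ts = { 5, 0 };		// give up after 5 seconds
 *	if ( aio_suspend( list, 2, &ts ) == 0 ) {
 *		// at least one listed request is done; find it with aio_error()
 *	}
 *	else if ( errno == EAGAIN ) {
 *		// the timeout expired before any listed request completed
 *	}
 *	else {
 *		// EINTR - a signal interrupted the wait
 *	}
 */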
894
895 int
896 aio_suspend( struct proc *p, struct aio_suspend_args *uap, int *retval )
897 {
898 int error;
899 int i, count;
900 uint64_t abstime;
901 struct timespec ts;
902 struct timeval tv;
903 aio_workq_entry *entryp;
904 struct aiocb * *aiocbpp;
905
906 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START,
907 (int)p, uap->nent, 0, 0, 0 );
908
909 *retval = -1;
910 abstime = 0;
911 aiocbpp = NULL;
912
913 /* quick check to see if there are any async IO requests queued up */
914 AIO_LOCK;
915 count = aio_get_all_queues_count( );
916 AIO_UNLOCK;
917 if ( count < 1 ) {
918 error = EINVAL;
919 goto ExitThisRoutine;
920 }
921
922 if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
923 error = EINVAL;
924 goto ExitThisRoutine;
925 }
926
927 if ( uap->timeoutp != NULL ) {
928 error = copyin( (void *)uap->timeoutp, &ts, sizeof(ts) );
929 if ( error != 0 ) {
930 error = EAGAIN;
931 goto ExitThisRoutine;
932 }
933
934 if ( ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000 ) {
935 error = EINVAL;
936 goto ExitThisRoutine;
937 }
938
939 nanoseconds_to_absolutetime( (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
940 &abstime );
941 clock_absolutetime_interval_to_deadline( abstime, &abstime );
942 }
943
944 MALLOC( aiocbpp, void *, (uap->nent * sizeof(struct aiocb *)), M_TEMP, M_WAITOK );
945 if ( aiocbpp == NULL ) {
946 error = EAGAIN;
947 goto ExitThisRoutine;
948 }
949
950 /* check list of aio requests to see if any have completed */
951 for ( i = 0; i < uap->nent; i++ ) {
952 struct aiocb *aiocbp;
953
954  /* copy in the aiocb pointer from the list */
955 error = copyin( (void *)(uap->aiocblist + i), (aiocbpp + i), sizeof(aiocbp) );
956 if ( error != 0 ) {
957 error = EAGAIN;
958 goto ExitThisRoutine;
959 }
960
961 /* NULL elements are legal so check for 'em */
962 aiocbp = *(aiocbpp + i);
963 if ( aiocbp == NULL )
964 continue;
965
966 /* return immediately if any aio request in the list is done */
967 AIO_LOCK;
968 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
969 if ( entryp->uaiocbp == aiocbp ) {
970 *retval = 0;
971 error = 0;
972 AIO_UNLOCK;
973 goto ExitThisRoutine;
974 }
975 }
976 AIO_UNLOCK;
977 } /* for ( ; i < uap->nent; ) */
978
979 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE,
980 (int)p, uap->nent, 0, 0, 0 );
981
982 /*
983 * wait for an async IO to complete or a signal fires or timeout expires.
984 * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
985 * interrupts us. If an async IO completes before a signal fires or our
986 * timeout expires, we get a wakeup call from aio_work_thread(). We do not
987 * use tsleep() here in order to avoid getting kernel funnel lock.
988 */
989 assert_wait( (event_t) &p->AIO_SUSPEND_SLEEP_CHAN, THREAD_ABORTSAFE );
990 if ( abstime > 0 ) {
991 thread_set_timer_deadline( abstime );
992 }
993 error = thread_block( THREAD_CONTINUE_NULL );
994 if ( error == THREAD_AWAKENED ) {
995 /* got our wakeup call from aio_work_thread() */
996 if ( abstime > 0 ) {
997 thread_cancel_timer();
998 }
999 *retval = 0;
1000 error = 0;
1001 }
1002 else if ( error == THREAD_TIMED_OUT ) {
1003 /* our timeout expired */
1004 error = EAGAIN;
1005 }
1006 else {
1007 /* we were interrupted */
1008 if ( abstime > 0 ) {
1009 thread_cancel_timer();
1010 }
1011 error = EINTR;
1012 }
1013
1014 ExitThisRoutine:
1015 if ( aiocbpp != NULL )
1016 FREE( aiocbpp, M_TEMP );
1017
1018 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_END,
1019 (int)p, uap->nent, error, 0, 0 );
1020
1021 return( error );
1022
1023 } /* aio_suspend */
1024
1025
1026 /* aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
1027 * file descriptor (uap->aiocbp->aio_fildes) from the buffer
1028 * (uap->aiocbp->aio_buf).
1029 */
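/*
 * Userland sketch (not part of this file; assumes the POSIX <aio.h> wrapper,
 * with fd, data, and data_len caller-supplied):
 *
 *	struct aiocb cb;
 *	bzero( &cb, sizeof(cb) );
 *	cb.aio_fildes = fd;				// open for writing
 *	cb.aio_buf = data;
 *	cb.aio_nbytes = data_len;
 *	cb.aio_offset = 0;				// write at this file offset
 *	cb.aio_sigevent.sigev_notify = SIGEV_NONE;
 *	if ( aio_write( &cb ) != 0 )
 *		err( 1, "aio_write" );			// e.g. EAGAIN when aio limits are hit
 */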
1030
1031 int
1032 aio_write( struct proc *p, struct aio_write_args *uap, int *retval )
1033 {
1034 int error;
1035
1036 *retval = 0;
1037
1038 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_START,
1039 (int)p, (int)uap->aiocbp, 0, 0, 0 );
1040
1041 error = aio_queue_async_request( p, uap->aiocbp, AIO_WRITE );
1042 if ( error != 0 )
1043 *retval = -1;
1044
1045 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_END,
1046 (int)p, (int)uap->aiocbp, error, 0, 0 );
1047
1048 return( error );
1049
1050 } /* aio_write */
1051
1052
1053 /*
1054 * lio_listio - initiate a list of IO requests. We process the list of aiocbs
1055 * either synchronously (mode == LIO_WAIT) or asynchronously (mode == LIO_NOWAIT).
1056 * The caller gets error and return status for each aiocb in the list via aio_error
1057 * and aio_return. We must keep completed requests until released by the
1058 * aio_return call.
1059 */
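/*
 * Userland sketch (not part of this file; assumes the POSIX <aio.h> wrapper,
 * with read_cb/write_cb set up as for aio_read/aio_write plus aio_lio_opcode
 * set to LIO_READ / LIO_WRITE).  With LIO_WAIT the call returns only after the
 * whole group finishes (handled on the lio_sync_workq below); with LIO_NOWAIT
 * it returns at once and the optional sigevent fires after the last request
 * in the group completes:
 *
 *	struct aiocb *list[2] = { &read_cb, &write_cb };
 *	if ( lio_listio( LIO_WAIT, list, 2, NULL ) != 0 )
 *		err( 1, "lio_listio" );
 *	// per request status is still read back via aio_error()/aio_return()
 */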
1060
1061 int
1062 lio_listio( struct proc *p, struct lio_listio_args *uap, int *retval )
1063 {
1064 int i;
1065 int call_result;
1066 int result;
1067 long group_tag;
1068 aio_workq_entry * *entryp_listp;
1069
1070 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START,
1071 (int)p, uap->nent, uap->mode, 0, 0 );
1072
1073 entryp_listp = NULL;
1074 call_result = -1;
1075 *retval = -1;
1076 if ( !(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT) ) {
1077 call_result = EINVAL;
1078 goto ExitRoutine;
1079 }
1080
1081 if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
1082 call_result = EINVAL;
1083 goto ExitRoutine;
1084 }
1085
1086 /*
1087  * we use group_tag to mark IO requests for delayed completion processing,
1088  * which means we wait until all IO requests in the group have completed
1089  * before we either return to the caller (when mode is LIO_WAIT) or signal
1090  * the user (when mode is LIO_NOWAIT).
1091 */
1092 group_tag = random();
1093
1094 /*
1095 * allocate a list of aio_workq_entry pointers that we will use to queue
1096 * up all our requests at once while holding our lock.
1097 */
1098 MALLOC( entryp_listp, void *, (uap->nent * sizeof(struct aiocb *)), M_TEMP, M_WAITOK );
1099 if ( entryp_listp == NULL ) {
1100 call_result = EAGAIN;
1101 goto ExitRoutine;
1102 }
1103
1104 /* process list of aio requests */
1105 for ( i = 0; i < uap->nent; i++ ) {
1106 struct aiocb *my_aiocbp;
1107
1108 *(entryp_listp + i) = NULL;
1109
1110  /* copy in the aiocb pointer from the list */
1111 result = copyin( (void *)(uap->aiocblist + i), &my_aiocbp, sizeof(my_aiocbp) );
1112 if ( result != 0 ) {
1113 call_result = EAGAIN;
1114 continue;
1115 }
1116
1117 /* NULL elements are legal so check for 'em */
1118 if ( my_aiocbp == NULL )
1119 continue;
1120
1121 if ( uap->mode == LIO_NOWAIT )
1122 result = lio_create_async_entry( p, my_aiocbp, uap->sigp,
1123 group_tag, (entryp_listp + i) );
1124 else
1125 result = lio_create_sync_entry( p, my_aiocbp, group_tag,
1126 (entryp_listp + i) );
1127
1128 if ( result != 0 && call_result == -1 )
1129 call_result = result;
1130 }
1131
1132 /*
1133 * we need to protect this section since we do not want any of these grouped
1134 * IO requests to begin until we have them all on the queue.
1135 */
1136 AIO_LOCK;
1137 for ( i = 0; i < uap->nent; i++ ) {
1138 aio_workq_entry *entryp;
1139
1140 /* NULL elements are legal so check for 'em */
1141 entryp = *(entryp_listp + i);
1142 if ( entryp == NULL )
1143 continue;
1144
1145 /* check our aio limits to throttle bad or rude user land behavior */
1146 if ( aio_get_all_queues_count( ) >= aio_max_requests ||
1147 aio_get_process_count( entryp->procp ) >= aio_max_requests_per_process ||
1148 is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
1149 vm_map_t my_map;
1150
1151 my_map = entryp->aio_map;
1152 entryp->aio_map = VM_MAP_NULL;
1153 result = EAGAIN;
1154 AIO_UNLOCK;
1155 aio_free_request( entryp, my_map );
1156 AIO_LOCK;
1157 continue;
1158 }
1159
1160 /* place the request on the appropriate queue */
1161 if ( uap->mode == LIO_NOWAIT ) {
1162 TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1163 aio_anchor.aio_async_workq_count++;
1164
1165 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
1166 (int)p, (int)entryp->uaiocbp, 0, 0, 0 );
1167 }
1168 else {
1169 TAILQ_INSERT_TAIL( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
1170 aio_anchor.lio_sync_workq_count++;
1171 }
1172 }
1173 AIO_UNLOCK;
1174
1175 if ( uap->mode == LIO_NOWAIT )
1176 /* caller does not want to wait so we'll fire off a worker thread and return */
1177 wakeup_one( &aio_anchor.aio_async_workq );
1178 else {
1179 aio_workq_entry *entryp;
1180 int error;
1181
1182 /*
1183 * mode is LIO_WAIT - handle the IO requests now.
1184 */
1185 AIO_LOCK;
1186 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
1187 while ( entryp != NULL ) {
1188 if ( p == entryp->procp && group_tag == entryp->group_tag ) {
1189 boolean_t funnel_state;
1190
1191 TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
1192 aio_anchor.lio_sync_workq_count--;
1193 AIO_UNLOCK;
1194
1195 // file system IO code path requires kernel funnel lock
1196 funnel_state = thread_funnel_set( kernel_flock, TRUE );
1197 if ( (entryp->flags & AIO_READ) != 0 ) {
1198 error = do_aio_read( entryp );
1199 }
1200 else if ( (entryp->flags & AIO_WRITE) != 0 ) {
1201 error = do_aio_write( entryp );
1202 }
1203 else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1204 error = do_aio_fsync( entryp );
1205 }
1206 else {
1207 printf( "%s - unknown aio request - flags 0x%02X \n",
1208 __FUNCTION__, entryp->flags );
1209 error = EINVAL;
1210 }
1211 entryp->errorval = error;
1212 if ( error != 0 && call_result == -1 )
1213 call_result = EIO;
1214 (void) thread_funnel_set( kernel_flock, funnel_state );
1215
1216 AIO_LOCK;
1217  /* we're done with the IO request so move it onto the done queue */
1218 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
1219 aio_anchor.aio_done_count++;
1220 p->aio_done_count++;
1221
1222 /* need to start over since lio_sync_workq may have been changed while we */
1223 /* were away doing the IO. */
1224 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
1225 continue;
1226 } /* p == entryp->procp */
1227
1228 entryp = TAILQ_NEXT( entryp, aio_workq_link );
1229 } /* while ( entryp != NULL ) */
1230 AIO_UNLOCK;
1231 } /* uap->mode == LIO_WAIT */
1232
1233 /* call_result == -1 means we had no trouble queueing up requests */
1234 if ( call_result == -1 ) {
1235 call_result = 0;
1236 *retval = 0;
1237 }
1238
1239 ExitRoutine:
1240 if ( entryp_listp != NULL )
1241 FREE( entryp_listp, M_TEMP );
1242
1243 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END,
1244 (int)p, call_result, 0, 0, 0 );
1245
1246 return( call_result );
1247
1248 } /* lio_listio */
1249
1250
1251 /*
1252 * aio worker thread. this is where all the real work gets done.
1253 * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
1254 * after new work is queued up.
1255 */
1256
1257 static void
1258 aio_work_thread( void )
1259 {
1260 aio_workq_entry *entryp;
1261 struct uthread *uthread = (struct uthread *)get_bsdthread_info(current_act());
1262
1263 for( ;; ) {
1264 entryp = aio_get_some_work();
1265 if ( entryp == NULL ) {
1266 /*
1267 * aio worker threads wait for some work to get queued up
1268 * by aio_queue_async_request. Once some work gets queued
1269 * it will wake up one of these worker threads just before
1270 * returning to our caller in user land. We do not use
1271 * tsleep() here in order to avoid getting kernel funnel lock.
1272 */
1273 assert_wait( (event_t) &aio_anchor.aio_async_workq, THREAD_UNINT );
1274 thread_block( THREAD_CONTINUE_NULL );
1275
1276 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_wake)) | DBG_FUNC_NONE,
1277 0, 0, 0, 0, 0 );
1278 }
1279 else {
1280 int error;
1281 boolean_t funnel_state;
1282 vm_map_t currentmap;
1283 vm_map_t oldmap = VM_MAP_NULL;
1284 task_t oldaiotask = TASK_NULL;
1285
1286 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START,
1287 (int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0 );
1288
1289 /*
1290 * Assume the target's address space identity for the duration
1291 * of the IO.
1292 */
1293 funnel_state = thread_funnel_set( kernel_flock, TRUE );
1294
1295 currentmap = get_task_map( (current_proc())->task );
1296 if ( currentmap != entryp->aio_map ) {
1297 oldaiotask = uthread->uu_aio_task;
1298 uthread->uu_aio_task = entryp->procp->task;
1299 oldmap = vm_map_switch( entryp->aio_map );
1300 }
1301
1302 if ( (entryp->flags & AIO_READ) != 0 ) {
1303 error = do_aio_read( entryp );
1304 }
1305 else if ( (entryp->flags & AIO_WRITE) != 0 ) {
1306 error = do_aio_write( entryp );
1307 }
1308 else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1309 error = do_aio_fsync( entryp );
1310 }
1311 else {
1312 printf( "%s - unknown aio request - flags 0x%02X \n",
1313 __FUNCTION__, entryp->flags );
1314 error = EINVAL;
1315 }
1316 entryp->errorval = error;
1317 if ( currentmap != entryp->aio_map ) {
1318 (void) vm_map_switch( oldmap );
1319 uthread->uu_aio_task = oldaiotask;
1320 }
1321
1322 /* we're done with the IO request so pop it off the active queue and */
1323 /* push it on the done queue */
1324 AIO_LOCK;
1325 TAILQ_REMOVE( &entryp->procp->aio_activeq, entryp, aio_workq_link );
1326 aio_anchor.aio_active_count--;
1327 entryp->procp->aio_active_count--;
1328 TAILQ_INSERT_TAIL( &entryp->procp->aio_doneq, entryp, aio_workq_link );
1329 aio_anchor.aio_done_count++;
1330 entryp->procp->aio_done_count++;
1331 entryp->flags |= AIO_COMPLETION;
1332
1333 /* remove our reference to the user land map. */
1334 if ( VM_MAP_NULL != entryp->aio_map ) {
1335 vm_map_t my_map;
1336
1337 my_map = entryp->aio_map;
1338 entryp->aio_map = VM_MAP_NULL;
1339 AIO_UNLOCK; /* must unlock before calling vm_map_deallocate() */
1340 vm_map_deallocate( my_map );
1341 }
1342 else {
1343 AIO_UNLOCK;
1344 }
1345
1346 do_aio_completion( entryp );
1347 (void) thread_funnel_set( kernel_flock, funnel_state );
1348
1349 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END,
1350 (int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval,
1351 entryp->returnval, 0 );
1352
1353 AIO_LOCK;
1354 entryp->flags &= ~AIO_COMPLETION;
1355 if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
1356 vm_map_t my_map;
1357
1358 my_map = entryp->aio_map;
1359 entryp->aio_map = VM_MAP_NULL;
1360 AIO_UNLOCK;
1361 aio_free_request( entryp, my_map );
1362 }
1363 else
1364 AIO_UNLOCK;
1365 }
1366 } /* for ( ;; ) */
1367
1368 /* NOT REACHED */
1369
1370 } /* aio_work_thread */
1371
1372
1373 /*
1374 * aio_get_some_work - get the next async IO request that is ready to be executed.
1375 * aio_fsync complicates matters a bit since we cannot do the fsync until all async
1376 * IO requests at the time the aio_fsync call came in have completed.
1377 */
1378
1379 static aio_workq_entry *
1380 aio_get_some_work( void )
1381 {
1382 aio_workq_entry *entryp;
1383 int skip_count = 0;
1384
1385 /* pop some work off the work queue and add to our active queue */
1386 AIO_LOCK;
1387 for ( entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
1388 entryp != NULL;
1389 entryp = TAILQ_NEXT( entryp, aio_workq_link ) ) {
1390
1391 if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1392 /* leave aio_fsync calls on the work queue if there are IO */
1393 /* requests on the active queue for the same file descriptor. */
1394 if ( aio_delay_fsync_request( entryp ) ) {
1395
1396 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) | DBG_FUNC_NONE,
1397 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1398 continue;
1399 }
1400 }
1401 break;
1402 }
1403
1404 if ( entryp != NULL ) {
1405 TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1406 aio_anchor.aio_async_workq_count--;
1407 TAILQ_INSERT_TAIL( &entryp->procp->aio_activeq, entryp, aio_workq_link );
1408 aio_anchor.aio_active_count++;
1409 entryp->procp->aio_active_count++;
1410 }
1411 AIO_UNLOCK;
1412
1413 return( entryp );
1414
1415 } /* aio_get_some_work */
1416
1417
1418 /*
1419 * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed at
1420 * this time. Delay will happen when there are any active IOs for the same file
1421  * descriptor that were queued at the time the aio_fsync call was queued.
1422 * NOTE - AIO_LOCK must be held by caller
1423 */
1424 static boolean_t
1425 aio_delay_fsync_request( aio_workq_entry *entryp )
1426 {
1427 aio_workq_entry *my_entryp;
1428
1429 TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1430 if ( my_entryp->fsyncp != NULL &&
1431 entryp->uaiocbp == my_entryp->fsyncp &&
1432 entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1433 return( TRUE );
1434 }
1435 }
1436
1437 return( FALSE );
1438
1439 } /* aio_delay_fsync_request */
1440
1441
1442 /*
1443 * aio_queue_async_request - queue up an async IO request on our work queue then
1444 * wake up one of our worker threads to do the actual work. We get a reference
1445 * to our caller's user land map in order to keep it around while we are
1446 * processing the request.
1447 */
1448
1449 static int
1450 aio_queue_async_request( struct proc *procp, struct aiocb *aiocbp, int kindOfIO )
1451 {
1452 aio_workq_entry *entryp;
1453 int result;
1454
1455 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1456 if ( entryp == NULL ) {
1457 result = EAGAIN;
1458 goto error_exit;
1459 }
1460 bzero( entryp, sizeof(*entryp) );
1461
1462 /* fill in the rest of the aio_workq_entry */
1463 entryp->procp = procp;
1464 entryp->uaiocbp = aiocbp;
1465 entryp->flags |= kindOfIO;
1466 entryp->aio_map = VM_MAP_NULL;
1467 result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1468 if ( result != 0 ) {
1469 result = EAGAIN;
1470 goto error_exit;
1471 }
1472
1473 /* do some more validation on the aiocb and embedded file descriptor */
1474 result = aio_validate( entryp );
1475 if ( result != 0 )
1476 goto error_exit;
1477
1478 /* get a reference to the user land map in order to keep it around */
1479 entryp->aio_map = get_task_map( procp->task );
1480 vm_map_reference( entryp->aio_map );
1481
1482 AIO_LOCK;
1483
1484 if ( is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
1485 AIO_UNLOCK;
1486 result = EAGAIN;
1487 goto error_exit;
1488 }
1489
1490 /* check our aio limits to throttle bad or rude user land behavior */
1491 if ( aio_get_all_queues_count( ) >= aio_max_requests ||
1492 aio_get_process_count( procp ) >= aio_max_requests_per_process ) {
1493 AIO_UNLOCK;
1494 result = EAGAIN;
1495 goto error_exit;
1496 }
1497
1498 /*
1499 * aio_fsync calls sync up all async IO requests queued at the time
1500 * the aio_fsync call was made. So we mark each currently queued async
1501  * IO with a matching file descriptor as one that must complete before we do the
1502 * fsync. We set the fsyncp field of each matching async IO
1503 * request with the aiocb pointer passed in on the aio_fsync call to
1504 * know which IOs must complete before we process the aio_fsync call.
1505 */
1506 if ( (kindOfIO & AIO_FSYNC) != 0 )
1507 aio_mark_requests( entryp );
1508
1509 /* queue up on our aio asynchronous work queue */
1510 TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1511 aio_anchor.aio_async_workq_count++;
1512
1513 AIO_UNLOCK;
1514
1515 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
1516 (int)procp, (int)aiocbp, 0, 0, 0 );
1517
1518 wakeup_one( &aio_anchor.aio_async_workq );
1519
1520 return( 0 );
1521
1522 error_exit:
1523 if ( entryp != NULL ) {
1524 /* this entry has not been queued up so no worries about unlocked */
1525 /* state and aio_map */
1526 aio_free_request( entryp, entryp->aio_map );
1527 }
1528
1529 return( result );
1530
1531 } /* aio_queue_async_request */
1532
1533
1534 /*
1535 * lio_create_async_entry - allocate an aio_workq_entry and fill it in.
1536 * If all goes well return 0 and pass the aio_workq_entry pointer back to
1537 * our caller. We get a reference to our caller's user land map in order to keep
1538 * it around while we are processing the request.
1539  * lio_listio calls behave differently at completion: they do completion notification
1540  * only when all async IO requests in the group have completed.  We use group_tag to
1541  * tag IO requests that behave in this delayed notification manner.
1542 */
1543
1544 static int
1545 lio_create_async_entry( struct proc *procp, struct aiocb *aiocbp,
1546 struct sigevent *sigp, long group_tag,
1547 aio_workq_entry **entrypp )
1548 {
1549 aio_workq_entry *entryp;
1550 int result;
1551
1552 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1553 if ( entryp == NULL ) {
1554 result = EAGAIN;
1555 goto error_exit;
1556 }
1557 bzero( entryp, sizeof(*entryp) );
1558
1559 /* fill in the rest of the aio_workq_entry */
1560 entryp->procp = procp;
1561 entryp->uaiocbp = aiocbp;
1562 entryp->flags |= AIO_LIO;
1563 entryp->group_tag = group_tag;
1564 entryp->aio_map = VM_MAP_NULL;
1565 result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1566 if ( result != 0 ) {
1567 result = EAGAIN;
1568 goto error_exit;
1569 }
1570
1571 /* look for lio_listio LIO_NOP requests and ignore them. */
1572 /* Not really an error, but we need to free our aio_workq_entry. */
1573 if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
1574 result = 0;
1575 goto error_exit;
1576 }
1577
1578 /* use sigevent passed in to lio_listio for each of our calls, but only */
1579 /* do completion notification after the last request completes. */
1580 if ( sigp != NULL ) {
1581 result = copyin( sigp, &entryp->aiocb.aio_sigevent, sizeof(entryp->aiocb.aio_sigevent) );
1582 if ( result != 0 ) {
1583 result = EAGAIN;
1584 goto error_exit;
1585 }
1586 }
1587
1588 /* do some more validation on the aiocb and embedded file descriptor */
1589 result = aio_validate( entryp );
1590 if ( result != 0 )
1591 goto error_exit;
1592
1593 /* get a reference to the user land map in order to keep it around */
1594 entryp->aio_map = get_task_map( procp->task );
1595 vm_map_reference( entryp->aio_map );
1596
1597 *entrypp = entryp;
1598 return( 0 );
1599
1600 error_exit:
1601 if ( entryp != NULL )
1602 zfree( aio_workq_zonep, (vm_offset_t) entryp );
1603
1604 return( result );
1605
1606 } /* lio_create_async_entry */
1607
1608
1609 /*
1610 * aio_mark_requests - aio_fsync calls synchronize file data for all queued async IO
1611 * requests at the moment the aio_fsync call is queued. We use aio_workq_entry.fsyncp
1612 * to mark each async IO that must complete before the fsync is done. We use the uaiocbp
1613 * field from the aio_fsync call as the aio_workq_entry.fsyncp in marked requests.
1614 * NOTE - AIO_LOCK must be held by caller
1615 */
1616
1617 static void
1618 aio_mark_requests( aio_workq_entry *entryp )
1619 {
1620 aio_workq_entry *my_entryp;
1621
1622 TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1623 if ( entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1624 my_entryp->fsyncp = entryp->uaiocbp;
1625 }
1626 }
1627
1628 TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
1629 if ( entryp->procp == my_entryp->procp &&
1630 entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1631 my_entryp->fsyncp = entryp->uaiocbp;
1632 }
1633 }
1634
1635 } /* aio_mark_requests */
1636
1637
1638 /*
1639 * lio_create_sync_entry - allocate an aio_workq_entry and fill it in.
1640 * If all goes well return 0 and pass the aio_workq_entry pointer back to
1641 * our caller.
1642  * lio_listio calls behave differently at completion: they do completion notification
1643  * only when all async IO requests in the group have completed.  We use group_tag to
1644  * tag IO requests that behave in this delayed notification manner.
1645 */
1646
1647 static int
1648 lio_create_sync_entry( struct proc *procp, struct aiocb *aiocbp,
1649 long group_tag, aio_workq_entry **entrypp )
1650 {
1651 aio_workq_entry *entryp;
1652 int result;
1653
1654 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1655 if ( entryp == NULL ) {
1656 result = EAGAIN;
1657 goto error_exit;
1658 }
1659 bzero( entryp, sizeof(*entryp) );
1660
1661 /* fill in the rest of the aio_workq_entry */
1662 entryp->procp = procp;
1663 entryp->uaiocbp = aiocbp;
1664 entryp->flags |= AIO_LIO;
1665 entryp->group_tag = group_tag;
1666 entryp->aio_map = VM_MAP_NULL;
1667 result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1668 if ( result != 0 ) {
1669 result = EAGAIN;
1670 goto error_exit;
1671 }
1672
1673 /* look for lio_listio LIO_NOP requests and ignore them. */
1674 /* Not really an error, but we need to free our aio_workq_entry. */
1675 if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
1676 result = 0;
1677 goto error_exit;
1678 }
1679
1680 result = aio_validate( entryp );
1681 if ( result != 0 ) {
1682 goto error_exit;
1683 }
1684
1685 *entrypp = entryp;
1686 return( 0 );
1687
1688 error_exit:
1689 if ( entryp != NULL )
1690 zfree( aio_workq_zonep, (vm_offset_t) entryp );
1691
1692 return( result );
1693
1694 } /* lio_create_sync_entry */
1695
1696
1697 /*
1698 * aio_free_request - remove our reference on the user land map and
1699 * free the work queue entry resources.
1700  * We are not holding the lock here; thus aio_map is passed in, having been
1701  * zeroed while we did hold the lock.
1702 */
1703
1704 static int
1705 aio_free_request( aio_workq_entry *entryp, vm_map_t the_map )
1706 {
1707 /* remove our reference to the user land map. */
1708 if ( VM_MAP_NULL != the_map ) {
1709 vm_map_deallocate( the_map );
1710 }
1711
1712 zfree( aio_workq_zonep, (vm_offset_t) entryp );
1713
1714 return( 0 );
1715
1716 } /* aio_free_request */
1717
1718
1719 /* aio_validate - validate the aiocb passed in by one of the aio syscalls.
1720 */
1721
1722 static int
1723 aio_validate( aio_workq_entry *entryp )
1724 {
1725 boolean_t funnel_state;
1726 struct file *fp;
1727 int flag;
1728 int result;
1729
1730 result = 0;
1731
1732 if ( (entryp->flags & AIO_LIO) != 0 ) {
1733 if ( entryp->aiocb.aio_lio_opcode == LIO_READ )
1734 entryp->flags |= AIO_READ;
1735 else if ( entryp->aiocb.aio_lio_opcode == LIO_WRITE )
1736 entryp->flags |= AIO_WRITE;
1737 else if ( entryp->aiocb.aio_lio_opcode == LIO_NOP )
1738 return( 0 );
1739 else
1740 return( EINVAL );
1741 }
1742
1743 flag = FREAD;
1744 if ( (entryp->flags & (AIO_WRITE | AIO_FSYNC)) != 0 ) {
1745 flag = FWRITE;
1746 }
1747
1748 if ( (entryp->flags & (AIO_READ | AIO_WRITE)) != 0 ) {
1749 if ( entryp->aiocb.aio_offset < 0 ||
1750 entryp->aiocb.aio_nbytes < 0 ||
1751 entryp->aiocb.aio_nbytes > INT_MAX ||
1752 entryp->aiocb.aio_buf == NULL )
1753 return( EINVAL );
1754 }
1755
1756 /* validate aiocb.aio_sigevent. at this point we only support sigev_notify
1757 * equal to SIGEV_SIGNAL or SIGEV_NONE. this means sigev_value,
1758 * sigev_notify_function, and sigev_notify_attributes are ignored.
1759 */
1760 if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ) {
1761 int signum;
1762 /* make sure we have a valid signal number */
1763 signum = entryp->aiocb.aio_sigevent.sigev_signo;
1764 if ( signum <= 0 || signum >= NSIG ||
1765 signum == SIGKILL || signum == SIGSTOP )
1766 return (EINVAL);
1767 }
1768 else if ( entryp->aiocb.aio_sigevent.sigev_notify != SIGEV_NONE )
1769 return (EINVAL);
1770
1771 /* validate the file descriptor and that the file was opened
1772 * for the appropriate read / write access. This section requires
1773 * kernel funnel lock.
1774 */
1775 funnel_state = thread_funnel_set( kernel_flock, TRUE );
1776
1777 result = fdgetf( entryp->procp, entryp->aiocb.aio_fildes, &fp );
1778 if ( result == 0 ) {
1779 if ( (fp->f_flag & flag) == 0 ) {
1780 /* we don't have read or write access */
1781 result = EBADF;
1782 }
1783 else if ( fp->f_type != DTYPE_VNODE ) {
1784 /* this is not a file */
1785 result = ESPIPE;
1786 }
1787 }
1788 else {
1789 result = EBADF;
1790 }
1791
1792 (void) thread_funnel_set( kernel_flock, funnel_state );
1793
1794 return( result );
1795
1796 } /* aio_validate */
1797
1798
1799 /*
1800 * aio_get_process_count - runs through our queues that hold outstanding
1801  * async IO requests and totals up the number of requests for the given
1802 * process.
1803 * NOTE - caller must hold aio lock!
1804 */
1805
1806 static int
1807 aio_get_process_count( struct proc *procp )
1808 {
1809 aio_workq_entry *entryp;
1810 int error;
1811 int count;
1812
1813 /* begin with count of completed async IO requests for this process */
1814 count = procp->aio_done_count;
1815
1816 /* add in count of active async IO requests for this process */
1817 count += procp->aio_active_count;
1818
1819 /* look for matches on our queue of asynchronous todo work */
1820 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
1821 if ( procp == entryp->procp ) {
1822 count++;
1823 }
1824 }
1825
1826 /* look for matches on our queue of synchronous todo work */
1827 TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
1828 if ( procp == entryp->procp ) {
1829 count++;
1830 }
1831 }
1832
1833 return( count );
1834
1835 } /* aio_get_process_count */
1836
1837
1838 /*
1839 * aio_get_all_queues_count - get total number of entries on all aio work queues.
1840 * NOTE - caller must hold aio lock!
1841 */
1842
1843 static int
1844 aio_get_all_queues_count( void )
1845 {
1846 int count;
1847
1848 count = aio_anchor.aio_async_workq_count;
1849 count += aio_anchor.lio_sync_workq_count;
1850 count += aio_anchor.aio_active_count;
1851 count += aio_anchor.aio_done_count;
1852
1853 return( count );
1854
1855 } /* aio_get_all_queues_count */
1856
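/*
 * These totals are what the submission paths compare against the system-wide
 * and per-process request limits.  A user-space sketch (illustration only)
 * for reading those limits, assuming the kern.aiomax and kern.aioprocmax
 * sysctl names wired to aio_max_requests and aio_max_requests_per_process
 * elsewhere in this file (an assumption -- check the SYSCTL declarations):
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *
 *	void
 *	my_show_aio_limits( void )
 *	{
 *		int	val;
 *		size_t	len;
 *
 *		len = sizeof( val );
 *		if ( sysctlbyname( "kern.aiomax", &val, &len, NULL, 0 ) == 0 )
 *			printf( "system-wide aio request limit %d\n", val );
 *
 *		len = sizeof( val );
 *		if ( sysctlbyname( "kern.aioprocmax", &val, &len, NULL, 0 ) == 0 )
 *			printf( "per-process aio request limit %d\n", val );
 *	}
 */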
1857
1858 /*
1859 * do_aio_completion. Handle async IO completion.
1860 */
1861
1862 static void
1863 do_aio_completion( aio_workq_entry *entryp )
1864 {
1865 /* signal user land process if appropriate */
1866 if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
1867 (entryp->flags & AIO_DISABLE) == 0 ) {
1868
1869 /*
1870 * if group_tag is non-zero then make sure this is the last IO request
1871 * in the group before we signal.
1872 */
1873 if ( entryp->group_tag == 0 ||
1874 (entryp->group_tag != 0 && aio_last_group_io( entryp )) ) {
1875 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) | DBG_FUNC_NONE,
1876 (int)entryp->procp, (int)entryp->uaiocbp,
1877 entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 );
1878
1879 psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );
1880 return;
1881 }
1882 }
1883
1884 /*
1885 * need to handle the case where a process is trying to exit, exec, or close
1886 * and is currently waiting for active aio requests to complete. If
1887 * AIO_WAITING is set then we need to look to see if there are any
1888 * other requests in the active queue for this process. If there are
1889 * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel. If
1890 * there are some still active then do nothing - we only want to wakeup
1891 * when all active aio requests for the process are complete.
1892 */
1893 if ( (entryp->flags & AIO_WAITING) != 0 ) {
1894 int active_requests;
1895
1896 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
1897 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1898
1899 AIO_LOCK;
1900 active_requests = aio_active_requests_for_process( entryp->procp );
1901 AIO_UNLOCK;
1902 if ( active_requests < 1 ) {
1903 /* no active aio requests for this process, continue exiting */
1904
1905 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
1906 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1907
1908 wakeup_one( &entryp->procp->AIO_CLEANUP_SLEEP_CHAN );
1909 }
1910 return;
1911 }
1912
1913 /*
1914 * aio_suspend case when a signal was not requested. In that scenario we
1915 * are sleeping on the AIO_SUSPEND_SLEEP_CHAN channel.
1916 * NOTE - the assumption here is that this wakeup call is inexpensive.
1917 * we really only need to do this when an aio_suspend call is pending.
1918 * If we find the wakeup call should be avoided we could mark the
1919 * async IO requests given in the list provided by aio_suspend and only
1920 * call wakeup for them. If we do mark them we should unmark them after
1921 * the aio_suspend wakes up.
1922 */
1923 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE,
1924 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1925
1926 wakeup_one( &entryp->procp->AIO_SUSPEND_SLEEP_CHAN );
1927
1928 return;
1929
1930 } /* do_aio_completion */
1931
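/*
 * User-space sketch of the group completion case handled above (illustration
 * only; names are made up): lio_listio() with LIO_NOWAIT and a sigevent asks
 * for one signal once every request in the list has finished, which is what
 * the group_tag / aio_last_group_io() check defers the psignal() for.
 *
 *	#include <aio.h>
 *	#include <signal.h>
 *	#include <string.h>
 *
 *	static struct aiocb	my_cb0, my_cb1;
 *	static char		my_b0[512], my_b1[512];
 *
 *	int
 *	my_submit_pair( int fd )
 *	{
 *		struct aiocb		*my_list[2];
 *		struct sigevent		my_ev;
 *
 *		memset( &my_cb0, 0, sizeof( my_cb0 ) );
 *		my_cb0.aio_fildes = fd;
 *		my_cb0.aio_buf = my_b0;
 *		my_cb0.aio_nbytes = sizeof( my_b0 );
 *		my_cb0.aio_offset = 0;
 *		my_cb0.aio_lio_opcode = LIO_READ;
 *
 *		my_cb1 = my_cb0;
 *		my_cb1.aio_buf = my_b1;
 *		my_cb1.aio_offset = sizeof( my_b0 );
 *
 *		my_list[0] = &my_cb0;
 *		my_list[1] = &my_cb1;
 *
 *		memset( &my_ev, 0, sizeof( my_ev ) );
 *		my_ev.sigev_notify = SIGEV_SIGNAL;
 *		my_ev.sigev_signo = SIGUSR1;	-- delivered once, after the last request in the group
 *
 *		return( lio_listio( LIO_NOWAIT, my_list, 2, &my_ev ) );
 *	}
 */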
1932
1933 /*
1934 * aio_last_group_io - checks to see if this is the last unfinished IO request
1935 * for the given group_tag. Returns TRUE if there are no other active IO
1936 * requests for this group or FALSE if there are active IO requests.
1937 * NOTE - AIO_LOCK must be held by caller
1938 */
1939
1940 static boolean_t
1941 aio_last_group_io( aio_workq_entry *entryp )
1942 {
1943 aio_workq_entry *my_entryp;
1944
1945 /* look for matches on our queue of active async IO requests */
1946 TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1947 if ( my_entryp->group_tag == entryp->group_tag )
1948 return( FALSE );
1949 }
1950
1951 /* look for matches on our queue of asynchronous todo work */
1952 TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
1953 if ( my_entryp->group_tag == entryp->group_tag )
1954 return( FALSE );
1955 }
1956
1957 /* look for matches on our queue of synchronous todo work */
1958 TAILQ_FOREACH( my_entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
1959 if ( my_entryp->group_tag == entryp->group_tag )
1960 return( FALSE );
1961 }
1962
1963 return( TRUE );
1964
1965 } /* aio_last_group_io */
1966
1967
1968 /*
1969 * do_aio_read
1970 */
1971 static int
1972 do_aio_read( aio_workq_entry *entryp )
1973 {
1974 struct file *fp;
1975 int error;
1976
1977 fp = holdfp( entryp->procp->p_fd, entryp->aiocb.aio_fildes, FREAD );
1978 if ( fp != NULL ) {
1979 error = dofileread( entryp->procp, fp, entryp->aiocb.aio_fildes,
1980 (void *)entryp->aiocb.aio_buf,
1981 entryp->aiocb.aio_nbytes,
1982 entryp->aiocb.aio_offset, FOF_OFFSET,
1983 &entryp->returnval );
1984 frele( fp );
1985 }
1986 else
1987 error = EBADF;
1988
1989 return( error );
1990
1991 } /* do_aio_read */
1992
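/*
 * Note that the read is issued at aiocb.aio_offset with FOF_OFFSET, so the
 * descriptor's file offset is not advanced.  A user-space sketch (names made
 * up; illustration only) of reaping a queued request -- aio_return() is what
 * finally removes the entry from the per-process done queue:
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *	#include <unistd.h>
 *
 *	ssize_t
 *	my_reap( struct aiocb *cbp )
 *	{
 *		int	err;
 *
 *		while ( (err = aio_error( cbp )) == EINPROGRESS )
 *			usleep( 1000 );		-- or block in aio_suspend() instead of polling
 *
 *		if ( err != 0 ) {
 *			errno = err;
 *			return( -1 );
 *		}
 *		return( aio_return( cbp ) );	-- bytes transferred for a read or write
 *	}
 */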
1993
1994 /*
1995 * do_aio_write
1996 */
1997 static int
1998 do_aio_write( aio_workq_entry *entryp )
1999 {
2000 struct file *fp;
2001 int error;
2002
2003 fp = holdfp( entryp->procp->p_fd, entryp->aiocb.aio_fildes, FWRITE );
2004 if ( fp != NULL ) {
2005 error = dofilewrite( entryp->procp, fp, entryp->aiocb.aio_fildes,
2006 (const void *)entryp->aiocb.aio_buf,
2007 entryp->aiocb.aio_nbytes,
2008 entryp->aiocb.aio_offset, FOF_OFFSET,
2009 &entryp->returnval );
2010 frele( fp );
2011 }
2012 else
2013 error = EBADF;
2014
2015 return( error );
2016
2017 } /* do_aio_write */
2018
2019
2020 /*
2021 * aio_active_requests_for_process - return the number of active async IO
2022 * requests for the given process.
2023 * NOTE - caller must hold aio lock!
2024 */
2025
2026 static int
2027 aio_active_requests_for_process( struct proc *procp )
2028 {
2029
2030 return( procp->aio_active_count );
2031
2032 } /* aio_active_requests_for_process */
2033
2034
2035 /*
2036 * do_aio_fsync
2037 */
2038 static int
2039 do_aio_fsync( aio_workq_entry *entryp )
2040 {
2041 register struct vnode *vp;
2042 struct file *fp;
2043 int error;
2044
2045 /*
2046 * NOTE - we will not support AIO_DSYNC until fdatasync() is supported.
2047 * AIO_DSYNC is caught before we queue up a request and flagged as an error.
2048 * The following was shamelessly extracted from the fsync() implementation.
2049 */
2050 error = getvnode( entryp->procp, entryp->aiocb.aio_fildes, &fp );
2051 if ( error == 0 ) {
2052 vp = (struct vnode *)fp->f_data;
2053 vn_lock( vp, LK_EXCLUSIVE | LK_RETRY, entryp->procp );
2054 error = VOP_FSYNC( vp, fp->f_cred, MNT_WAIT, entryp->procp );
2055 VOP_UNLOCK( vp, 0, entryp->procp );
2056 }
2057 if ( error != 0 )
2058 entryp->returnval = -1;
2059
2060 return( error );
2061
2062 } /* do_aio_fsync */
2063
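/*
 * User-space sketch of queueing the fsync handled above (illustration only;
 * names are made up).  O_SYNC maps to the MNT_WAIT VOP_FSYNC() path; O_DSYNC
 * is rejected at queue time in this version since fdatasync() is not
 * supported.
 *
 *	#include <aio.h>
 *	#include <fcntl.h>
 *	#include <signal.h>
 *	#include <string.h>
 *
 *	static struct aiocb	my_sync_cb;
 *
 *	int
 *	my_flush_async( int fd )
 *	{
 *		memset( &my_sync_cb, 0, sizeof( my_sync_cb ) );
 *		my_sync_cb.aio_fildes = fd;
 *		my_sync_cb.aio_sigevent.sigev_notify = SIGEV_NONE;
 *		return( aio_fsync( O_SYNC, &my_sync_cb ) );
 *	}
 */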
2064
2065 /*
2066 * is_already_queued - runs through our queues to see if the given
2067 * aiocbp / process is there. Returns TRUE if there is a match
2068 * on any of our aio queues.
2069 * NOTE - callers must hold aio lock!
2070 */
2071
2072 static boolean_t
2073 is_already_queued( struct proc *procp,
2074 struct aiocb *aiocbp )
2075 {
2076 aio_workq_entry *entryp;
2077 boolean_t result;
2078
2079 result = FALSE;
2080
2081 /* look for matches on our queue of async IO requests that have completed */
2082 TAILQ_FOREACH( entryp, &procp->aio_doneq, aio_workq_link ) {
2083 if ( aiocbp == entryp->uaiocbp ) {
2084 result = TRUE;
2085 goto ExitThisRoutine;
2086 }
2087 }
2088
2089 /* look for matches on our queue of active async IO requests */
2090 TAILQ_FOREACH( entryp, &procp->aio_activeq, aio_workq_link ) {
2091 if ( aiocbp == entryp->uaiocbp ) {
2092 result = TRUE;
2093 goto ExitThisRoutine;
2094 }
2095 }
2096
2097 /* look for matches on our queue of asynchronous todo work */
2098 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
2099 if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
2100 result = TRUE;
2101 goto ExitThisRoutine;
2102 }
2103 }
2104
2105 /* look for matches on our queue of synchronous todo work */
2106 TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
2107 if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
2108 result = TRUE;
2109 goto ExitThisRoutine;
2110 }
2111 }
2112
2113 ExitThisRoutine:
2114 return( result );
2115
2116 } /* is_already_queued */
2117
2118
2119 /*
2120 * aio initialization
2121 */
2122 __private_extern__ void
2123 aio_init( void )
2124 {
2125 int i;
2126
2127 simple_lock_init( &aio_lock );
2128
2129 AIO_LOCK;
2130 TAILQ_INIT( &aio_anchor.aio_async_workq );
2131 TAILQ_INIT( &aio_anchor.lio_sync_workq );
2132 aio_anchor.aio_async_workq_count = 0;
2133 aio_anchor.lio_sync_workq_count = 0;
2134 aio_anchor.aio_active_count = 0;
2135 aio_anchor.aio_done_count = 0;
2136 AIO_UNLOCK;
2137
2138 i = sizeof( aio_workq_entry );
2139 aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );
2140
2141 _aio_create_worker_threads( aio_worker_threads );
2142
2143 return;
2144
2145 } /* aio_init */
2146
2147
2148 /*
2149 * aio worker threads are created here.
2150 */
2151 __private_extern__ void
2152 _aio_create_worker_threads( int num )
2153 {
2154 int i;
2155
2156 /* create some worker threads to handle the async IO requests */
2157 for ( i = 0; i < num; i++ ) {
2158 thread_t myThread;
2159
2160 myThread = kernel_thread( kernel_task, aio_work_thread );
2161 if ( THREAD_NULL == myThread ) {
2162 printf( "%s - failed to create a work thread \n", __FUNCTION__ );
2163 }
2164 }
2165
2166 return;
2167
2168 } /* _aio_create_worker_threads */
2169
2170 /*
2171 * Return the aio task for the current activation's uthread.
2172 */
2173 task_t
2174 get_aiotask(void)
2175 {
2176 return ((struct uthread *)get_bsdthread_info(current_act()))->uu_aio_task;
2177 }