1 /*
2 * Copyright (c) 2003-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24
25 /*
26 * todo:
27 * 1) ramesh is looking into how to replace taking a reference on
28 * the user's map (vm_map_reference()) since it is believed that
29 * it would not hold the process for us.
30 * 2) david is looking into a way for us to set the priority of the
31 * worker threads to match that of the user's thread when the
32 * async IO was queued.
33 */
34
35
36 /*
37 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
38 */
39
40 #include <sys/systm.h>
41 #include <sys/fcntl.h>
42 #include <sys/file_internal.h>
43 #include <sys/filedesc.h>
44 #include <sys/kernel.h>
45 #include <sys/vnode_internal.h>
46 #include <sys/malloc.h>
47 #include <sys/mount_internal.h>
48 #include <sys/param.h>
49 #include <sys/proc_internal.h>
50 #include <sys/sysctl.h>
51 #include <sys/unistd.h>
52 #include <sys/user.h>
53
54 #include <sys/aio_kern.h>
55 #include <sys/sysproto.h>
56
57 #include <machine/limits.h>
58
59 #include <mach/mach_types.h>
60 #include <kern/kern_types.h>
61 #include <kern/zalloc.h>
62 #include <kern/task.h>
63 #include <kern/sched_prim.h>
64
65 #include <vm/vm_map.h>
66
67 #include <sys/kdebug.h>
68 #define AIO_work_queued 1
69 #define AIO_worker_wake 2
70 #define AIO_completion_sig 3
71 #define AIO_completion_cleanup_wait 4
72 #define AIO_completion_cleanup_wake 5
73 #define AIO_completion_suspend_wake 6
74 #define AIO_fsync_delay 7
75 #define AIO_cancel 10
76 #define AIO_cancel_async_workq 11
77 #define AIO_cancel_sync_workq 12
78 #define AIO_cancel_activeq 13
79 #define AIO_cancel_doneq 14
80 #define AIO_fsync 20
81 #define AIO_read 30
82 #define AIO_write 40
83 #define AIO_listio 50
84 #define AIO_error 60
85 #define AIO_error_val 61
86 #define AIO_error_activeq 62
87 #define AIO_error_workq 63
88 #define AIO_return 70
89 #define AIO_return_val 71
90 #define AIO_return_activeq 72
91 #define AIO_return_workq 73
92 #define AIO_exec 80
93 #define AIO_exit 90
94 #define AIO_exit_sleep 91
95 #define AIO_close 100
96 #define AIO_close_sleep 101
97 #define AIO_suspend 110
98 #define AIO_suspend_sleep 111
99 #define AIO_worker_thread 120
100
101 #if 0
102 #undef KERNEL_DEBUG
103 #define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
104 #endif
105
106 /*
107 * aio requests queue up on the aio_async_workq or lio_sync_workq (for
108 * lio_listio LIO_WAIT). Requests then move to the per process aio_activeq
109 * (proc.aio_activeq) when one of our worker threads starts the IO.
110 * And finally, requests move to the per process aio_doneq (proc.aio_doneq)
111 * when the IO request completes. The request remains on aio_doneq until
112 * the user process calls aio_return or the process exits; either way, that is our
113 * trigger to release aio resources.
114 */
115 struct aio_anchor_cb
116 {
117 int aio_async_workq_count; /* entries on aio_async_workq */
118 int lio_sync_workq_count; /* entries on lio_sync_workq */
119 int aio_active_count; /* entries on all active queues (proc.aio_activeq) */
120 int aio_done_count; /* entries on all done queues (proc.aio_doneq) */
121 TAILQ_HEAD( , aio_workq_entry ) aio_async_workq;
122 TAILQ_HEAD( , aio_workq_entry ) lio_sync_workq;
123 };
124 typedef struct aio_anchor_cb aio_anchor_cb;
125
126
127 /*
128 * Notes on aio sleep / wake channels.
129 * We currently pick a couple of fields within the proc structure to use as
130 * sleep channels that do not collide with any other kernel routines.
131 * At this time, for binary compatibility reasons, we cannot create new proc fields.
132 */
133 #define AIO_SUSPEND_SLEEP_CHAN p_estcpu
134 #define AIO_CLEANUP_SLEEP_CHAN p_pctcpu
135
136
137 /*
138 * async IO locking macros used to protect critical sections.
139 */
140 #define AIO_LOCK lck_mtx_lock(aio_lock)
141 #define AIO_UNLOCK lck_mtx_unlock(aio_lock)
142
143
144 /*
145 * LOCAL PROTOTYPES
146 */
147 static int aio_active_requests_for_process( struct proc *procp );
148 static boolean_t aio_delay_fsync_request( aio_workq_entry *entryp );
149 static int aio_free_request( aio_workq_entry *entryp, vm_map_t the_map );
150 static int aio_get_all_queues_count( void );
151 static int aio_get_process_count( struct proc *procp );
152 static aio_workq_entry * aio_get_some_work( void );
153 static boolean_t aio_last_group_io( aio_workq_entry *entryp );
154 static void aio_mark_requests( aio_workq_entry *entryp );
155 static int aio_queue_async_request( struct proc *procp,
156 user_addr_t aiocbp,
157 int kindOfIO );
158 static int aio_validate( aio_workq_entry *entryp );
159 static void aio_work_thread( void );
160 static int do_aio_cancel( struct proc *p,
161 int fd,
162 user_addr_t aiocbp,
163 boolean_t wait_for_completion,
164 boolean_t disable_notification );
165 static void do_aio_completion( aio_workq_entry *entryp );
166 static int do_aio_fsync( aio_workq_entry *entryp );
167 static int do_aio_read( aio_workq_entry *entryp );
168 static int do_aio_write( aio_workq_entry *entryp );
169 static void do_munge_aiocb( struct aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
170 static boolean_t is_already_queued( struct proc *procp,
171 user_addr_t aiocbp );
172 static int lio_create_async_entry( struct proc *procp,
173 user_addr_t aiocbp,
174 user_addr_t sigp,
175 long group_tag,
176 aio_workq_entry **entrypp );
177 static int lio_create_sync_entry( struct proc *procp,
178 user_addr_t aiocbp,
179 long group_tag,
180 aio_workq_entry **entrypp );
181
182
183 /*
184 * EXTERNAL PROTOTYPES
185 */
186
187 /* in ...bsd/kern/sys_generic.c */
188 extern int dofileread( struct proc *p, struct fileproc *fp, int fd,
189 user_addr_t bufp, user_size_t nbyte,
190 off_t offset, int flags, user_ssize_t *retval );
191 extern int dofilewrite( struct proc *p, struct fileproc *fp, int fd,
192 user_addr_t bufp, user_size_t nbyte, off_t offset,
193 int flags, user_ssize_t *retval );
194
195 /*
196 * aio external global variables.
197 */
198 extern int aio_max_requests; /* AIO_MAX - configurable */
199 extern int aio_max_requests_per_process; /* AIO_PROCESS_MAX - configurable */
200 extern int aio_worker_threads; /* AIO_THREAD_COUNT - configurable */
201
202
203 /*
204 * aio static variables.
205 */
206 static aio_anchor_cb aio_anchor;
207 static lck_mtx_t * aio_lock;
208 static lck_grp_t * aio_lock_grp;
209 static lck_attr_t * aio_lock_attr;
210 static lck_grp_attr_t * aio_lock_grp_attr;
211 static struct zone *aio_workq_zonep;
212
213
214
215
216 /*
217 * aio_cancel - attempt to cancel one or more async IO requests currently
218 * outstanding against file descriptor uap->fd. If uap->aiocbp is not
219 * NULL then only one specific IO is cancelled (if possible). If uap->aiocbp
220 * is NULL then all outstanding async IO requests for the given file
221 * descriptor are cancelled (if possible).
222 */
223
224 int
225 aio_cancel( struct proc *p, struct aio_cancel_args *uap, int *retval )
226 {
227 struct user_aiocb my_aiocb;
228 int result;
229
230 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START,
231 (int)p, (int)uap->aiocbp, 0, 0, 0 );
232
233 /* quick check to see if there are any async IO requests queued up */
234 AIO_LOCK;
235 result = aio_get_all_queues_count( );
236 AIO_UNLOCK;
237 if ( result < 1 ) {
238 result = EBADF;
239 goto ExitRoutine;
240 }
241
242 *retval = -1;
243 if ( uap->aiocbp != USER_ADDR_NULL ) {
244 if ( !IS_64BIT_PROCESS(p) ) {
245 struct aiocb aiocb32;
246
247 result = copyin( uap->aiocbp, &aiocb32, sizeof(aiocb32) );
248 if ( result == 0 )
249 do_munge_aiocb( &aiocb32, &my_aiocb );
250 } else
251 result = copyin( uap->aiocbp, &my_aiocb, sizeof(my_aiocb) );
252
253 if ( result != 0 ) {
254 result = EAGAIN;
255 goto ExitRoutine;
256 }
257
258 /* NOTE - POSIX standard says a mismatch between the file */
259 /* descriptor passed in and the file descriptor embedded in */
260 /* the aiocb causes unspecified results. We return EBADF in */
261 /* that situation. */
262 if ( uap->fd != my_aiocb.aio_fildes ) {
263 result = EBADF;
264 goto ExitRoutine;
265 }
266 }
267 result = do_aio_cancel( p, uap->fd, uap->aiocbp, FALSE, FALSE );
268
269 if ( result != -1 ) {
270 *retval = result;
271 result = 0;
272 goto ExitRoutine;
273 }
274
275 result = EBADF;
276
277 ExitRoutine:
278 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END,
279 (int)p, (int)uap->aiocbp, result, 0, 0 );
280
281 return( result );
282
283 } /* aio_cancel */
284
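/*
 * Illustrative user-land sketch (not part of this kernel file; the #if 0
 * block is never compiled). It shows how a caller might drive aio_cancel()
 * against the semantics above and react to the three completion codes.
 * The fd, aiocb, and helper name are assumptions made only for the example.
 */
#if 0
#include <aio.h>
#include <errno.h>
#include <stdio.h>

/* attempt to cancel one outstanding request; fall back to waiting for it */
static void
cancel_or_wait( int fd, struct aiocb *cbp )
{
	switch ( aio_cancel( fd, cbp ) ) {
	case AIO_CANCELED:
		/* request was pulled off the work queue; reap it - the */
		/* error status will be ECANCELED */
		(void) aio_return( cbp );
		break;
	case AIO_NOTCANCELED: {
		/* already active in a worker thread - wait for it to finish */
		const struct aiocb *list[ 1 ] = { cbp };
		while ( aio_error( cbp ) == EINPROGRESS )
			(void) aio_suspend( list, 1, NULL );
		(void) aio_return( cbp );
		break;
	}
	case AIO_ALLDONE:
		/* request had already completed; just reap it */
		(void) aio_return( cbp );
		break;
	default:
		perror( "aio_cancel" );
	}
}
#endif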
285
286 /*
287 * _aio_close - internal function used to clean up async IO requests for
288 * a file descriptor that is closing.
289 * THIS MAY BLOCK.
290 */
291
292 __private_extern__ void
293 _aio_close( struct proc *p, int fd )
294 {
295 int error, count;
296
297 /* quick check to see if there are any async IO requests queued up */
298 AIO_LOCK;
299 count = aio_get_all_queues_count( );
300 AIO_UNLOCK;
301 if ( count < 1 )
302 return;
303
304 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_START,
305 (int)p, fd, 0, 0, 0 );
306
307 /* cancel all async IO requests on our todo queues for this file descriptor */
308 error = do_aio_cancel( p, fd, 0, TRUE, FALSE );
309 if ( error == AIO_NOTCANCELED ) {
310 /*
311 * AIO_NOTCANCELED is returned when we find an aio request for this process
312 * and file descriptor on the active async IO queue. Active requests cannot
313 * be cancelled so we must wait for them to complete. We will get a special
314 * wake up call on our channel used to sleep for ALL active requests to
315 * complete. This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
316 * when we must wait for all active aio requests.
317 */
318
319 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) | DBG_FUNC_NONE,
320 (int)p, fd, 0, 0, 0 );
321
322 tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_close", 0 );
323 }
324
325 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_END,
326 (int)p, fd, 0, 0, 0 );
327
328 return;
329
330 } /* _aio_close */
331
332
333 /*
334 * aio_error - return the error status associated with the async IO
335 * request referred to by uap->aiocbp. The error status is the errno
336 * value that would be set by the corresponding IO request (read, write,
337 * fdatasync, or fsync).
338 */
339
340 int
341 aio_error( struct proc *p, struct aio_error_args *uap, int *retval )
342 {
343 aio_workq_entry *entryp;
344 int error;
345
346 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_START,
347 (int)p, (int)uap->aiocbp, 0, 0, 0 );
348
349 AIO_LOCK;
350
351 /* quick check to see if there are any async IO requests queued up */
352 if ( aio_get_all_queues_count( ) < 1 ) {
353 error = EINVAL;
354 goto ExitRoutine;
355 }
356
357 /* look for a match on our queue of async IO requests that have completed */
358 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
359 if ( entryp->uaiocbp == uap->aiocbp ) {
360 *retval = entryp->errorval;
361 error = 0;
362 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) | DBG_FUNC_NONE,
363 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
364 goto ExitRoutine;
365 }
366 }
367
368 /* look for a match on our queue of active async IO requests */
369 TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
370 if ( entryp->uaiocbp == uap->aiocbp ) {
371 *retval = EINPROGRESS;
372 error = 0;
373 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) | DBG_FUNC_NONE,
374 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
375 goto ExitRoutine;
376 }
377 }
378
379 /* look for a match on our queue of todo work */
380 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
381 if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
382 *retval = EINPROGRESS;
383 error = 0;
384 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_workq)) | DBG_FUNC_NONE,
385 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
386 goto ExitRoutine;
387 }
388 }
389 error = EINVAL;
390
391 ExitRoutine:
392 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_END,
393 (int)p, (int)uap->aiocbp, error, 0, 0 );
394 AIO_UNLOCK;
395
396 return( error );
397
398 } /* aio_error */
399
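/*
 * Illustrative user-land sketch (not part of this kernel file; the #if 0
 * block is never compiled). It shows the aio_error / aio_return protocol
 * described above: poll aio_error() until it stops returning EINPROGRESS,
 * then call aio_return() exactly once so the kernel can release the
 * request. The helper name is an assumption made only for the example.
 */
#if 0
#include <aio.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>

/* reap one request: poll its status, then collect the result once */
static ssize_t
reap_request( struct aiocb *cbp )
{
	int err;

	/* still on the todo or active queue while this returns EINPROGRESS; */
	/* a real caller would sleep in aio_suspend() instead of spinning */
	while ( (err = aio_error( cbp )) == EINPROGRESS )
		;

	if ( err != 0 )
		fprintf( stderr, "async IO failed: %s\n", strerror( err ) );

	/* aio_return() is what lets the kernel free its aio_workq_entry */
	return( aio_return( cbp ) );
}
#endif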
400
401 /*
402 * aio_fsync - asynchronously force all IO operations associated
403 * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
404 * queued at the time of the call to the synchronized completion state.
405 * NOTE - we do not support op O_DSYNC at this point since we do not support the
406 * fdatasync() call.
407 */
408
409 int
410 aio_fsync( struct proc *p, struct aio_fsync_args *uap, int *retval )
411 {
412 int error;
413 int fsync_kind;
414
415 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_START,
416 (int)p, (int)uap->aiocbp, uap->op, 0, 0 );
417
418 *retval = 0;
419 /* 0 := O_SYNC for binary backward compatibility with Panther */
420 if (uap->op == O_SYNC || uap->op == 0)
421 fsync_kind = AIO_FSYNC;
422 #if 0 // we don't support fdatasync() call yet
423 else if ( uap->op == O_DSYNC )
424 fsync_kind = AIO_DSYNC;
425 #endif
426 else {
427 *retval = -1;
428 error = EINVAL;
429 goto ExitRoutine;
430 }
431
432 error = aio_queue_async_request( p, uap->aiocbp, fsync_kind );
433 if ( error != 0 )
434 *retval = -1;
435
436 ExitRoutine:
437 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_END,
438 (int)p, (int)uap->aiocbp, error, 0, 0 );
439
440 return( error );
441
442 } /* aio_fsync */
443
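/*
 * Illustrative user-land sketch (not part of this kernel file; the #if 0
 * block is never compiled). It queues an asynchronous fsync as described
 * above; note that this implementation only accepts O_SYNC (or 0), and
 * O_DSYNC is rejected with EINVAL. The fd and helper name are assumptions
 * made only for the example.
 */
#if 0
#include <aio.h>
#include <fcntl.h>
#include <signal.h>
#include <string.h>

/* queue an async fsync; IO queued before this call gets synchronized */
static int
queue_async_fsync( int fd, struct aiocb *cbp )
{
	memset( cbp, 0, sizeof(*cbp) );
	cbp->aio_fildes = fd;
	cbp->aio_sigevent.sigev_notify = SIGEV_NONE;

	return( aio_fsync( O_SYNC, cbp ) );
}
#endif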
444
445 /* aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
446 * file descriptor (uap->aiocbp->aio_fildes) into the buffer
447 * (uap->aiocbp->aio_buf).
448 */
449
450 int
451 aio_read( struct proc *p, struct aio_read_args *uap, int *retval )
452 {
453 int error;
454
455 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_START,
456 (int)p, (int)uap->aiocbp, 0, 0, 0 );
457
458 *retval = 0;
459
460 error = aio_queue_async_request( p, uap->aiocbp, AIO_READ );
461 if ( error != 0 )
462 *retval = -1;
463
464 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_END,
465 (int)p, (int)uap->aiocbp, error, 0, 0 );
466
467 return( error );
468
469 } /* aio_read */
470
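/*
 * Illustrative user-land sketch (not part of this kernel file; the #if 0
 * block is never compiled). It fills in the aiocb fields this path copies
 * in and validates (aio_fildes, aio_buf, aio_nbytes, aio_offset and
 * aio_sigevent) and queues the read. The path name and buffer size are
 * assumptions made only for the example.
 */
#if 0
#include <aio.h>
#include <fcntl.h>
#include <signal.h>
#include <string.h>

static char read_buffer[ 4096 ];
static struct aiocb read_cb;

/* queue an async read of the first 4KB of a (hypothetical) file */
static int
queue_async_read( void )
{
	int fd = open( "/tmp/example.dat", O_RDONLY );	/* path is hypothetical */
	if ( fd < 0 )
		return( -1 );

	memset( &read_cb, 0, sizeof(read_cb) );
	read_cb.aio_fildes = fd;
	read_cb.aio_buf = read_buffer;
	read_cb.aio_nbytes = sizeof(read_buffer);
	read_cb.aio_offset = 0;
	read_cb.aio_sigevent.sigev_notify = SIGEV_NONE;

	/* returns 0 if queued, or -1 with errno set (EAGAIN, EBADF, ...) */
	return( aio_read( &read_cb ) );
}
#endif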
471
472 /*
473 * aio_return - return the return status associated with the async IO
474 * request referred to by uap->aiocbp. The return status is the value
475 * that would be returned by the corresponding IO request (read, write,
476 * fdatasync, or fsync). This is where we release kernel resources
477 * held for async IO call associated with the given aiocb pointer.
478 */
479
480 int
481 aio_return( struct proc *p, struct aio_return_args *uap, user_ssize_t *retval )
482 {
483 aio_workq_entry *entryp;
484 int error;
485 boolean_t lock_held;
486
487 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_START,
488 (int)p, (int)uap->aiocbp, 0, 0, 0 );
489
490 AIO_LOCK;
491 lock_held = TRUE;
492 *retval = 0;
493
494 /* quick check to see if there are any async IO requests queued up */
495 if ( aio_get_all_queues_count( ) < 1 ) {
496 error = EINVAL;
497 goto ExitRoutine;
498 }
499
500 /* look for a match on our queue of async IO requests that have completed */
501 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
502 if ( entryp->uaiocbp == uap->aiocbp ) {
503 TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
504 aio_anchor.aio_done_count--;
505 p->aio_done_count--;
506
507 *retval = entryp->returnval;
508
509 /* we cannot free requests that are still completing */
510 if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
511 vm_map_t my_map;
512
513 my_map = entryp->aio_map;
514 entryp->aio_map = VM_MAP_NULL;
515 AIO_UNLOCK;
516 lock_held = FALSE;
517 aio_free_request( entryp, my_map );
518 }
519 else
520 /* tell completion code to free this request */
521 entryp->flags |= AIO_DO_FREE;
522 error = 0;
523 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) | DBG_FUNC_NONE,
524 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
525 goto ExitRoutine;
526 }
527 }
528
529 /* look for a match on our queue of active async IO requests */
530 TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
531 if ( entryp->uaiocbp == uap->aiocbp ) {
532 error = EINPROGRESS;
533 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) | DBG_FUNC_NONE,
534 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
535 goto ExitRoutine;
536 }
537 }
538
539 /* look for a match on our queue of todo work */
540 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
541 if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
542 error = EINPROGRESS;
543 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_workq)) | DBG_FUNC_NONE,
544 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
545 goto ExitRoutine;
546 }
547 }
548 error = EINVAL;
549
550 ExitRoutine:
551 if ( lock_held )
552 AIO_UNLOCK;
553 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_END,
554 (int)p, (int)uap->aiocbp, error, 0, 0 );
555
556 return( error );
557
558 } /* aio_return */
559
560
561 /*
562 * _aio_exec - internal function used to clean up async IO requests for
563 * a process that is going away due to exec(). We cancel any async IOs
564 * we can and wait for those already active. We also disable signaling
565 * for cancelled or active aio requests that complete.
566 * This routine MAY block!
567 */
568
569 __private_extern__ void
570 _aio_exec( struct proc *p )
571 {
572
573 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_START,
574 (int)p, 0, 0, 0, 0 );
575
576 _aio_exit( p );
577
578 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_END,
579 (int)p, 0, 0, 0, 0 );
580
581 return;
582
583 } /* _aio_exec */
584
585
586 /*
587 * _aio_exit - internal function used to clean up async IO requests for
588 * a process that is terminating (via exit() or exec() ). We cancel any async IOs
589 * we can and wait for those already active. We also disable signaling
590 * for cancelled or active aio requests that complete. This routine MAY block!
591 */
592
593 __private_extern__ void
594 _aio_exit( struct proc *p )
595 {
596 int error, count;
597 aio_workq_entry *entryp;
598
599 /* quick check to see if there are any async IO requests queued up */
600 AIO_LOCK;
601 count = aio_get_all_queues_count( );
602 AIO_UNLOCK;
603 if ( count < 1 ) {
604 return;
605 }
606
607 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_START,
608 (int)p, 0, 0, 0, 0 );
609
610 /*
611 * cancel async IO requests on the todo work queue and wait for those
612 * already active to complete.
613 */
614 error = do_aio_cancel( p, 0, 0, TRUE, TRUE );
615 if ( error == AIO_NOTCANCELED ) {
616 /*
617 * AIO_NOTCANCELED is returned when we find an aio request for this process
618 * on the active async IO queue. Active requests cannot be cancelled so we
619 * must wait for them to complete. We will get a special wake up call on
620 * our channel used to sleep for ALL active requests to complete. This sleep
621 * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
622 * active aio requests.
623 */
624
625 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) | DBG_FUNC_NONE,
626 (int)p, 0, 0, 0, 0 );
627
628 tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_exit", 0 );
629 }
630
631 /* release all aio resources used by this process */
632 AIO_LOCK;
633 entryp = TAILQ_FIRST( &p->aio_doneq );
634 while ( entryp != NULL ) {
635 aio_workq_entry *next_entryp;
636
637 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
638 TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
639 aio_anchor.aio_done_count--;
640 p->aio_done_count--;
641
642 /* we cannot free requests that are still completing */
643 if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
644 vm_map_t my_map;
645
646 my_map = entryp->aio_map;
647 entryp->aio_map = VM_MAP_NULL;
648 AIO_UNLOCK;
649 aio_free_request( entryp, my_map );
650
651 /* need to start over since aio_doneq may have been */
652 /* changed while we were away. */
653 AIO_LOCK;
654 entryp = TAILQ_FIRST( &p->aio_doneq );
655 continue;
656 }
657 else
658 /* tell completion code to free this request */
659 entryp->flags |= AIO_DO_FREE;
660 entryp = next_entryp;
661 }
662 AIO_UNLOCK;
663
664 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END,
665 (int)p, 0, 0, 0, 0 );
666
667 return;
668
669 } /* _aio_exit */
670
671
672 /*
673 * do_aio_cancel - cancel async IO requests (if possible). We get called by
674 * aio_cancel, close, and at exit.
675 * There are three modes of operation: 1) cancel all async IOs for a process -
676 * fd is 0 and aiocbp is NULL; 2) cancel all async IOs for a file descriptor - fd
677 * is > 0 and aiocbp is NULL; 3) cancel one async IO associated with the given
678 * aiocbp.
679 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
680 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
681 * target async IO requests, and AIO_ALLDONE if all target async IO requests
682 * were already complete.
683 * WARNING - do not dereference aiocbp in this routine; it may point to user
684 * land data that has not been copied in (when called from aio_cancel() )
685 */
686
687 static int
688 do_aio_cancel( struct proc *p, int fd, user_addr_t aiocbp,
689 boolean_t wait_for_completion, boolean_t disable_notification )
690 {
691 aio_workq_entry *entryp;
692 int result;
693
694 result = -1;
695
696 /* look for a match on our queue of async todo work. */
697 AIO_LOCK;
698 entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
699 while ( entryp != NULL ) {
700 aio_workq_entry *next_entryp;
701
702 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
703 if ( p == entryp->procp ) {
704 if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
705 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
706 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
707 /* we found a match so we remove the entry from the */
708 /* todo work queue and place it on the done queue */
709 TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
710 aio_anchor.aio_async_workq_count--;
711 entryp->errorval = ECANCELED;
712 entryp->returnval = -1;
713 if ( disable_notification )
714 entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
715 result = AIO_CANCELED;
716
717 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) | DBG_FUNC_NONE,
718 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
719
720 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
721 aio_anchor.aio_done_count++;
722 p->aio_done_count++;
723 entryp->flags |= AIO_COMPLETION;
724 AIO_UNLOCK;
725
726 /* do completion processing for this request */
727 do_aio_completion( entryp );
728
729 AIO_LOCK;
730 entryp->flags &= ~AIO_COMPLETION;
731 if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
732 vm_map_t my_map;
733
734 my_map = entryp->aio_map;
735 entryp->aio_map = VM_MAP_NULL;
736 AIO_UNLOCK;
737 aio_free_request( entryp, my_map );
738 }
739 else
740 AIO_UNLOCK;
741
742 if ( aiocbp != USER_ADDR_NULL ) {
743 return( result );
744 }
745
746 /* need to start over since aio_async_workq may have been */
747 /* changed while we were away doing completion processing. */
748 AIO_LOCK;
749 entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
750 continue;
751 }
752 }
753 entryp = next_entryp;
754 } /* while... */
755
756 /*
757 * look for a match on our queue of synchronous todo work. This will
758 * be a rare occurrence but could happen if a process is terminated while
759 * processing a lio_listio call.
760 */
761 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
762 while ( entryp != NULL ) {
763 aio_workq_entry *next_entryp;
764
765 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
766 if ( p == entryp->procp ) {
767 if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
768 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
769 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
770 /* we found a match so we remove the entry from the */
771 /* todo work queue and place it on the done queue */
772 TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
773 aio_anchor.lio_sync_workq_count--;
774 entryp->errorval = ECANCELED;
775 entryp->returnval = -1;
776 if ( disable_notification )
777 entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
778 result = AIO_CANCELED;
779
780 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_sync_workq)) | DBG_FUNC_NONE,
781 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
782
783 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
784 aio_anchor.aio_done_count++;
785 p->aio_done_count++;
786 if ( aiocbp != USER_ADDR_NULL ) {
787 AIO_UNLOCK;
788 return( result );
789 }
790 }
791 }
792 entryp = next_entryp;
793 } /* while... */
794
795 /*
796 * look for a match on our queue of active async IO requests and
797 * return AIO_NOTCANCELED result.
798 */
799 TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
800 if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
801 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
802 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
803 result = AIO_NOTCANCELED;
804
805 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE,
806 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
807
808 if ( wait_for_completion )
809 entryp->flags |= AIO_WAITING; /* flag for special completion processing */
810 if ( disable_notification )
811 entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
812 if ( aiocbp != USER_ADDR_NULL ) {
813 AIO_UNLOCK;
814 return( result );
815 }
816 }
817 }
818
819 /*
820 * if we didn't find any matches on the todo or active queues then look for a
821 * match on our queue of async IO requests that have completed and if found
822 * return AIO_ALLDONE result.
823 */
824 if ( result == -1 ) {
825 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
826 if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
827 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
828 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
829 result = AIO_ALLDONE;
830
831 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE,
832 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
833
834 if ( aiocbp != USER_ADDR_NULL ) {
835 AIO_UNLOCK;
836 return( result );
837 }
838 }
839 }
840 }
841 AIO_UNLOCK;
842
843 return( result );
844
845 } /* do_aio_cancel */
846
847
848 /*
849 * aio_suspend - suspend the calling thread until at least one of the async
850 * IO operations referenced by uap->aiocblist has completed, until a signal
851 * interrupts the function, or uap->timeoutp time interval (optional) has
852 * passed.
853 * Returns 0 if one or more async IOs have completed else -1 and errno is
854 * set appropriately - EAGAIN if timeout elapses or EINTR if an interrupt
855 * woke us up.
856 */
857
858 int
859 aio_suspend( struct proc *p, struct aio_suspend_args *uap, int *retval )
860 {
861 int error;
862 int i, count;
863 uint64_t abstime;
864 struct user_timespec ts;
865 aio_workq_entry *entryp;
866 user_addr_t *aiocbpp;
867
868 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START,
869 (int)p, uap->nent, 0, 0, 0 );
870
871 *retval = -1;
872 abstime = 0;
873 aiocbpp = NULL;
874
875 /* quick check to see if there are any async IO requests queued up */
876 AIO_LOCK;
877 count = aio_get_all_queues_count( );
878 AIO_UNLOCK;
879 if ( count < 1 ) {
880 error = EINVAL;
881 goto ExitThisRoutine;
882 }
883
884 if ( uap->nent < 1 || uap->nent > aio_max_requests_per_process ) {
885 error = EINVAL;
886 goto ExitThisRoutine;
887 }
888
889 if ( uap->timeoutp != USER_ADDR_NULL ) {
890 if ( proc_is64bit(p) ) {
891 error = copyin( uap->timeoutp, &ts, sizeof(ts) );
892 }
893 else {
894 struct timespec temp;
895 error = copyin( uap->timeoutp, &temp, sizeof(temp) );
896 if ( error == 0 ) {
897 ts.tv_sec = temp.tv_sec;
898 ts.tv_nsec = temp.tv_nsec;
899 }
900 }
901 if ( error != 0 ) {
902 error = EAGAIN;
903 goto ExitThisRoutine;
904 }
905
906 if ( ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000 ) {
907 error = EINVAL;
908 goto ExitThisRoutine;
909 }
910
911 nanoseconds_to_absolutetime( (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
912 &abstime );
913 clock_absolutetime_interval_to_deadline( abstime, &abstime );
914 }
915
916 /* we reserve enough space for largest possible pointer size */
917 MALLOC( aiocbpp, user_addr_t *, (uap->nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK );
918 if ( aiocbpp == NULL ) {
919 error = EAGAIN;
920 goto ExitThisRoutine;
921 }
922
923 /* copyin our aiocb pointers from list */
924 error = copyin( uap->aiocblist, aiocbpp,
925 proc_is64bit(p) ? (uap->nent * sizeof(user_addr_t))
926 : (uap->nent * sizeof(uintptr_t)) );
927 if ( error != 0 ) {
928 error = EAGAIN;
929 goto ExitThisRoutine;
930 }
931
932 /* we depend on a list of user_addr_t's so we need to munge and expand */
933 /* when these pointers came from a 32-bit process */
934 if ( !proc_is64bit(p) && sizeof(uintptr_t) < sizeof(user_addr_t) ) {
935 /* position to the last entry and work back from there */
936 uintptr_t *my_ptrp = ((uintptr_t *)aiocbpp) + (uap->nent - 1);
937 user_addr_t *my_addrp = aiocbpp + (uap->nent - 1);
938 for (i = 0; i < uap->nent; i++, my_ptrp--, my_addrp--) {
939 *my_addrp = (user_addr_t) (*my_ptrp);
940 }
941 }
942
943 /* check list of aio requests to see if any have completed */
944 AIO_LOCK;
945 for ( i = 0; i < uap->nent; i++ ) {
946 user_addr_t aiocbp;
947
948 /* NULL elements are legal so check for 'em */
949 aiocbp = *(aiocbpp + i);
950 if ( aiocbp == USER_ADDR_NULL )
951 continue;
952
953 /* return immediately if any aio request in the list is done */
954 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
955 if ( entryp->uaiocbp == aiocbp ) {
956 *retval = 0;
957 error = 0;
958 AIO_UNLOCK;
959 goto ExitThisRoutine;
960 }
961 }
962 } /* for ( ; i < uap->nent; ) */
963
964 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE,
965 (int)p, uap->nent, 0, 0, 0 );
966
967 /*
968 * wait for an async IO to complete or a signal fires or timeout expires.
969 * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
970 * interrupts us. If an async IO completes before a signal fires or our
971 * timeout expires, we get a wakeup call from aio_work_thread().
972 */
973 assert_wait_deadline( (event_t) &p->AIO_SUSPEND_SLEEP_CHAN, THREAD_ABORTSAFE, abstime );
974 AIO_UNLOCK;
975
976 error = thread_block( THREAD_CONTINUE_NULL );
977
978 if ( error == THREAD_AWAKENED ) {
979 /* got our wakeup call from aio_work_thread() */
980 *retval = 0;
981 error = 0;
982 }
983 else if ( error == THREAD_TIMED_OUT ) {
984 /* our timeout expired */
985 error = EAGAIN;
986 }
987 else {
988 /* we were interrupted */
989 error = EINTR;
990 }
991
992 ExitThisRoutine:
993 if ( aiocbpp != NULL )
994 FREE( aiocbpp, M_TEMP );
995
996 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_END,
997 (int)p, uap->nent, error, 0, 0 );
998
999 return( error );
1000
1001 } /* aio_suspend */
1002
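/*
 * Illustrative user-land sketch (not part of this kernel file; the #if 0
 * block is never compiled). It waits for either of two queued requests
 * using the timeout semantics described above (EAGAIN on timeout, EINTR
 * when a signal wakes us). The two second timeout and helper name are
 * arbitrary values chosen for the example.
 */
#if 0
#include <aio.h>
#include <errno.h>
#include <stdio.h>
#include <time.h>

/* wait up to two seconds for one of two outstanding requests to finish */
static void
wait_for_either( struct aiocb *a, struct aiocb *b )
{
	const struct aiocb *list[ 2 ] = { a, b };
	struct timespec timeout = { 2, 0 };

	if ( aio_suspend( list, 2, &timeout ) == 0 ) {
		/* at least one request is on the done queue; find and reap it */
		if ( aio_error( a ) != EINPROGRESS )
			printf( "first request done: %zd\n", aio_return( a ) );
		else
			printf( "second request done: %zd\n", aio_return( b ) );
	}
	else if ( errno == EAGAIN )
		printf( "timeout expired\n" );
	else if ( errno == EINTR )
		printf( "interrupted by a signal\n" );
}
#endif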
1003
1004 /* aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
1005 * file descriptor (uap->aiocbp->aio_fildes) from the buffer
1006 * (uap->aiocbp->aio_buf).
1007 */
1008
1009 int
1010 aio_write( struct proc *p, struct aio_write_args *uap, int *retval )
1011 {
1012 int error;
1013
1014 *retval = 0;
1015
1016 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_START,
1017 (int)p, (int)uap->aiocbp, 0, 0, 0 );
1018
1019 error = aio_queue_async_request( p, uap->aiocbp, AIO_WRITE );
1020 if ( error != 0 )
1021 *retval = -1;
1022
1023 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_END,
1024 (int)p, (int)uap->aiocbp, error, 0, 0 );
1025
1026 return( error );
1027
1028 } /* aio_write */
1029
1030
1031 /*
1032 * lio_listio - initiate a list of IO requests. We process the list of aiocbs
1033 * either synchronously (mode == LIO_WAIT) or asynchronously (mode == LIO_NOWAIT).
1034 * The caller gets error and return status for each aiocb in the list via aio_error
1035 * and aio_return. We must keep completed requests until released by the
1036 * aio_return call.
1037 */
1038
1039 int
1040 lio_listio( struct proc *p, struct lio_listio_args *uap, int *retval )
1041 {
1042 int i;
1043 int call_result;
1044 int result;
1045 long group_tag;
1046 aio_workq_entry * *entryp_listp;
1047 user_addr_t *aiocbpp;
1048
1049 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START,
1050 (int)p, uap->nent, uap->mode, 0, 0 );
1051
1052 entryp_listp = NULL;
1053 aiocbpp = NULL;
1054 call_result = -1;
1055 *retval = -1;
1056 if ( !(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT) ) {
1057 call_result = EINVAL;
1058 goto ExitRoutine;
1059 }
1060
1061 if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
1062 call_result = EINVAL;
1063 goto ExitRoutine;
1064 }
1065
1066 /*
1067 * we use group_tag to mark IO requests for delayed completion processing
1068 * which means we wait until all IO requests in the group have completed
1069 * before we either return to the caller when mode is LIO_WAIT or signal
1070 * user when mode is LIO_NOWAIT.
1071 */
1072 group_tag = random();
1073
1074 /*
1075 * allocate a list of aio_workq_entry pointers that we will use to queue
1076 * up all our requests at once while holding our lock.
1077 */
1078 MALLOC( entryp_listp, void *, (uap->nent * sizeof(aio_workq_entry *)), M_TEMP, M_WAITOK );
1079 if ( entryp_listp == NULL ) {
1080 call_result = EAGAIN;
1081 goto ExitRoutine;
1082 }
1083
1084 /* we reserve enough space for largest possible pointer size */
1085 MALLOC( aiocbpp, user_addr_t *, (uap->nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK );
1086 if ( aiocbpp == NULL ) {
1087 call_result = EAGAIN;
1088 goto ExitRoutine;
1089 }
1090
1091 /* copyin our aiocb pointers from list */
1092 result = copyin( uap->aiocblist, aiocbpp,
1093 IS_64BIT_PROCESS(p) ? (uap->nent * sizeof(user_addr_t))
1094 : (uap->nent * sizeof(uintptr_t)) );
1095 if ( result != 0 ) {
1096 call_result = EAGAIN;
1097 goto ExitRoutine;
1098 }
1099
1100 /* we depend on a list of user_addr_t's so we need to munge and expand */
1101 /* when these pointers came from a 32-bit process */
1102 if ( !IS_64BIT_PROCESS(p) && sizeof(uintptr_t) < sizeof(user_addr_t) ) {
1103 /* position to the last entry and work back from there */
1104 uintptr_t *my_ptrp = ((uintptr_t *)aiocbpp) + (uap->nent - 1);
1105 user_addr_t *my_addrp = aiocbpp + (uap->nent - 1);
1106 for (i = 0; i < uap->nent; i++, my_ptrp--, my_addrp--) {
1107 *my_addrp = (user_addr_t) (*my_ptrp);
1108 }
1109 }
1110
1111 /* process list of aio requests */
1112 for ( i = 0; i < uap->nent; i++ ) {
1113 user_addr_t my_aiocbp;
1114
1115 *(entryp_listp + i) = NULL;
1116 my_aiocbp = *(aiocbpp + i);
1117
1118 /* NULL elements are legal so check for 'em */
1119 if ( my_aiocbp == USER_ADDR_NULL )
1120 continue;
1121
1122 if ( uap->mode == LIO_NOWAIT )
1123 result = lio_create_async_entry( p, my_aiocbp, uap->sigp,
1124 group_tag, (entryp_listp + i) );
1125 else
1126 result = lio_create_sync_entry( p, my_aiocbp, group_tag,
1127 (entryp_listp + i) );
1128
1129 if ( result != 0 && call_result == -1 )
1130 call_result = result;
1131 }
1132
1133 /*
1134 * we need to protect this section since we do not want any of these grouped
1135 * IO requests to begin until we have them all on the queue.
1136 */
1137 AIO_LOCK;
1138 for ( i = 0; i < uap->nent; i++ ) {
1139 aio_workq_entry *entryp;
1140
1141 /* NULL elements are legal so check for 'em */
1142 entryp = *(entryp_listp + i);
1143 if ( entryp == NULL )
1144 continue;
1145
1146 /* check our aio limits to throttle bad or rude user land behavior */
1147 if ( aio_get_all_queues_count( ) >= aio_max_requests ||
1148 aio_get_process_count( entryp->procp ) >= aio_max_requests_per_process ||
1149 is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
1150 vm_map_t my_map;
1151
1152 my_map = entryp->aio_map;
1153 entryp->aio_map = VM_MAP_NULL;
1154 if ( call_result == -1 )
1155 call_result = EAGAIN;
1156 AIO_UNLOCK;
1157 aio_free_request( entryp, my_map );
1158 AIO_LOCK;
1159 continue;
1160 }
1161
1162 /* place the request on the appropriate queue */
1163 if ( uap->mode == LIO_NOWAIT ) {
1164 TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1165 aio_anchor.aio_async_workq_count++;
1166
1167 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
1168 (int)p, (int)entryp->uaiocbp, 0, 0, 0 );
1169 }
1170 else {
1171 TAILQ_INSERT_TAIL( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
1172 aio_anchor.lio_sync_workq_count++;
1173 }
1174 }
1175
1176 if ( uap->mode == LIO_NOWAIT ) {
1177 /* caller does not want to wait so we'll fire off a worker thread and return */
1178 wakeup_one( (caddr_t) &aio_anchor.aio_async_workq );
1179 }
1180 else {
1181 aio_workq_entry *entryp;
1182 int error;
1183
1184 /*
1185 * mode is LIO_WAIT - handle the IO requests now.
1186 */
1187 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
1188 while ( entryp != NULL ) {
1189 if ( p == entryp->procp && group_tag == entryp->group_tag ) {
1190
1191 TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
1192 aio_anchor.lio_sync_workq_count--;
1193 AIO_UNLOCK;
1194
1195 if ( (entryp->flags & AIO_READ) != 0 ) {
1196 error = do_aio_read( entryp );
1197 }
1198 else if ( (entryp->flags & AIO_WRITE) != 0 ) {
1199 error = do_aio_write( entryp );
1200 }
1201 else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1202 error = do_aio_fsync( entryp );
1203 }
1204 else {
1205 printf( "%s - unknown aio request - flags 0x%02X \n",
1206 __FUNCTION__, entryp->flags );
1207 error = EINVAL;
1208 }
1209 entryp->errorval = error;
1210 if ( error != 0 && call_result == -1 )
1211 call_result = EIO;
1212
1213 AIO_LOCK;
1214 /* we're done with the IO request so move it on the done queue */
1215 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
1216 aio_anchor.aio_done_count++;
1217 p->aio_done_count++;
1218
1219 /* need to start over since lio_sync_workq may have been changed while we */
1220 /* were away doing the IO. */
1221 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
1222 continue;
1223 } /* p == entryp->procp */
1224
1225 entryp = TAILQ_NEXT( entryp, aio_workq_link );
1226 } /* while ( entryp != NULL ) */
1227 } /* uap->mode == LIO_WAIT */
1228 AIO_UNLOCK;
1229
1230 /* call_result == -1 means we had no trouble queueing up requests */
1231 if ( call_result == -1 ) {
1232 call_result = 0;
1233 *retval = 0;
1234 }
1235
1236 ExitRoutine:
1237 if ( entryp_listp != NULL )
1238 FREE( entryp_listp, M_TEMP );
1239 if ( aiocbpp != NULL )
1240 FREE( aiocbpp, M_TEMP );
1241
1242 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END,
1243 (int)p, call_result, 0, 0, 0 );
1244
1245 return( call_result );
1246
1247 } /* lio_listio */
1248
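/*
 * Illustrative user-land sketch (not part of this kernel file; the #if 0
 * block is never compiled). It issues one read and one write as a single
 * LIO_WAIT batch as described above; per-request status must still be
 * collected through aio_error / aio_return. The descriptors, buffers and
 * helper name are assumptions made only for the example.
 */
#if 0
#include <aio.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>

/* issue one read and one write as a batch and wait for both to finish */
static int
batch_read_write( int read_fd, int write_fd, char *inbuf, char *outbuf, size_t len )
{
	struct aiocb rd, wr;
	struct aiocb *list[ 2 ] = { &rd, &wr };
	int i;

	memset( &rd, 0, sizeof(rd) );
	rd.aio_fildes = read_fd;
	rd.aio_buf = inbuf;
	rd.aio_nbytes = len;
	rd.aio_lio_opcode = LIO_READ;

	memset( &wr, 0, sizeof(wr) );
	wr.aio_fildes = write_fd;
	wr.aio_buf = outbuf;
	wr.aio_nbytes = len;
	wr.aio_lio_opcode = LIO_WRITE;

	/* LIO_WAIT: lio_listio does not return until both requests are done */
	if ( lio_listio( LIO_WAIT, list, 2, NULL ) != 0 )
		fprintf( stderr, "lio_listio: %s\n", strerror( errno ) );

	/* each request still reports its own status */
	for ( i = 0; i < 2; i++ ) {
		if ( aio_error( list[ i ] ) == 0 )
			printf( "request %d transferred %zd bytes\n", i, aio_return( list[ i ] ) );
	}
	return( 0 );
}
#endif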
1249
1250 /*
1251 * aio worker thread. This is where all the real work gets done.
1252 * We get a wake up call on sleep channel &aio_anchor.aio_async_workq
1253 * after new work is queued up.
1254 */
1255
1256 static void
1257 aio_work_thread( void )
1258 {
1259 aio_workq_entry *entryp;
1260
1261 for( ;; ) {
1262 AIO_LOCK;
1263 entryp = aio_get_some_work();
1264 if ( entryp == NULL ) {
1265 /*
1266 * aio worker threads wait for some work to get queued up
1267 * by aio_queue_async_request. Once some work gets queued
1268 * it will wake up one of these worker threads just before
1269 * returning to our caller in user land.
1270 */
1271 assert_wait( (event_t) &aio_anchor.aio_async_workq, THREAD_UNINT );
1272 AIO_UNLOCK;
1273
1274 thread_block( (thread_continue_t)aio_work_thread );
1275 /* NOT REACHED */
1276 }
1277 else {
1278 int error;
1279 vm_map_t currentmap;
1280 vm_map_t oldmap = VM_MAP_NULL;
1281 task_t oldaiotask = TASK_NULL;
1282 struct uthread *uthreadp = NULL;
1283
1284 AIO_UNLOCK;
1285
1286 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START,
1287 (int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0 );
1288
1289 /*
1290 * Assume the target's address space identity for the duration
1291 * of the IO.
1292 */
1293 currentmap = get_task_map( (current_proc())->task );
1294 if ( currentmap != entryp->aio_map ) {
1295 uthreadp = (struct uthread *) get_bsdthread_info(current_thread());
1296 oldaiotask = uthreadp->uu_aio_task;
1297 uthreadp->uu_aio_task = entryp->procp->task;
1298 oldmap = vm_map_switch( entryp->aio_map );
1299 }
1300
1301 if ( (entryp->flags & AIO_READ) != 0 ) {
1302 error = do_aio_read( entryp );
1303 }
1304 else if ( (entryp->flags & AIO_WRITE) != 0 ) {
1305 error = do_aio_write( entryp );
1306 }
1307 else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1308 error = do_aio_fsync( entryp );
1309 }
1310 else {
1311 printf( "%s - unknown aio request - flags 0x%02X \n",
1312 __FUNCTION__, entryp->flags );
1313 error = EINVAL;
1314 }
1315 entryp->errorval = error;
1316 if ( currentmap != entryp->aio_map ) {
1317 (void) vm_map_switch( oldmap );
1318 uthreadp->uu_aio_task = oldaiotask;
1319 }
1320
1321 /* we're done with the IO request so pop it off the active queue and */
1322 /* push it on the done queue */
1323 AIO_LOCK;
1324 TAILQ_REMOVE( &entryp->procp->aio_activeq, entryp, aio_workq_link );
1325 aio_anchor.aio_active_count--;
1326 entryp->procp->aio_active_count--;
1327 TAILQ_INSERT_TAIL( &entryp->procp->aio_doneq, entryp, aio_workq_link );
1328 aio_anchor.aio_done_count++;
1329 entryp->procp->aio_done_count++;
1330 entryp->flags |= AIO_COMPLETION;
1331
1332 /* remove our reference to the user land map. */
1333 if ( VM_MAP_NULL != entryp->aio_map ) {
1334 vm_map_t my_map;
1335
1336 my_map = entryp->aio_map;
1337 entryp->aio_map = VM_MAP_NULL;
1338 AIO_UNLOCK; /* must unlock before calling vm_map_deallocate() */
1339 vm_map_deallocate( my_map );
1340 }
1341 else {
1342 AIO_UNLOCK;
1343 }
1344
1345 do_aio_completion( entryp );
1346
1347 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END,
1348 (int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval,
1349 entryp->returnval, 0 );
1350
1351 AIO_LOCK;
1352 entryp->flags &= ~AIO_COMPLETION;
1353 if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
1354 vm_map_t my_map;
1355
1356 my_map = entryp->aio_map;
1357 entryp->aio_map = VM_MAP_NULL;
1358 AIO_UNLOCK;
1359 aio_free_request( entryp, my_map );
1360 }
1361 else
1362 AIO_UNLOCK;
1363 }
1364 } /* for ( ;; ) */
1365
1366 /* NOT REACHED */
1367
1368 } /* aio_work_thread */
1369
1370
1371 /*
1372 * aio_get_some_work - get the next async IO request that is ready to be executed.
1373 * aio_fsync complicates matters a bit since we cannot do the fsync until all async
1374 * IO requests at the time the aio_fsync call came in have completed.
1375 * NOTE - AIO_LOCK must be held by caller
1376 */
1377
1378 static aio_workq_entry *
1379 aio_get_some_work( void )
1380 {
1381 aio_workq_entry *entryp;
1382
1383 /* pop some work off the work queue and add to our active queue */
1384 for ( entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
1385 entryp != NULL;
1386 entryp = TAILQ_NEXT( entryp, aio_workq_link ) ) {
1387
1388 if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1389 /* leave aio_fsync calls on the work queue if there are IO */
1390 /* requests on the active queue for the same file descriptor. */
1391 if ( aio_delay_fsync_request( entryp ) ) {
1392
1393 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) | DBG_FUNC_NONE,
1394 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1395 continue;
1396 }
1397 }
1398 break;
1399 }
1400
1401 if ( entryp != NULL ) {
1402 TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1403 aio_anchor.aio_async_workq_count--;
1404 TAILQ_INSERT_TAIL( &entryp->procp->aio_activeq, entryp, aio_workq_link );
1405 aio_anchor.aio_active_count++;
1406 entryp->procp->aio_active_count++;
1407 }
1408
1409 return( entryp );
1410
1411 } /* aio_get_some_work */
1412
1413
1414 /*
1415 * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed at
1416 * this time. Delay will happen when there are any active IOs for the same file
1417 * descriptor that were queued at the time the aio_fsync call was queued.
1418 * NOTE - AIO_LOCK must be held by caller
1419 */
1420 static boolean_t
1421 aio_delay_fsync_request( aio_workq_entry *entryp )
1422 {
1423 aio_workq_entry *my_entryp;
1424
1425 TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1426 if ( my_entryp->fsyncp != USER_ADDR_NULL &&
1427 entryp->uaiocbp == my_entryp->fsyncp &&
1428 entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1429 return( TRUE );
1430 }
1431 }
1432
1433 return( FALSE );
1434
1435 } /* aio_delay_fsync_request */
1436
1437
1438 /*
1439 * aio_queue_async_request - queue up an async IO request on our work queue then
1440 * wake up one of our worker threads to do the actual work. We get a reference
1441 * to our caller's user land map in order to keep it around while we are
1442 * processing the request.
1443 */
1444
1445 static int
1446 aio_queue_async_request( struct proc *procp, user_addr_t aiocbp, int kindOfIO )
1447 {
1448 aio_workq_entry *entryp;
1449 int result;
1450
1451 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1452 if ( entryp == NULL ) {
1453 result = EAGAIN;
1454 goto error_exit;
1455 }
1456 bzero( entryp, sizeof(*entryp) );
1457
1458 /* fill in the rest of the aio_workq_entry */
1459 entryp->procp = procp;
1460 entryp->uaiocbp = aiocbp;
1461 entryp->flags |= kindOfIO;
1462 entryp->aio_map = VM_MAP_NULL;
1463
1464 if ( !IS_64BIT_PROCESS(procp) ) {
1465 struct aiocb aiocb32;
1466
1467 result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
1468 if ( result == 0 )
1469 do_munge_aiocb( &aiocb32, &entryp->aiocb );
1470 } else
1471 result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1472
1473 if ( result != 0 ) {
1474 result = EAGAIN;
1475 goto error_exit;
1476 }
1477
1478 /* do some more validation on the aiocb and embedded file descriptor */
1479 result = aio_validate( entryp );
1480 if ( result != 0 )
1481 goto error_exit;
1482
1483 /* get a reference to the user land map in order to keep it around */
1484 entryp->aio_map = get_task_map( procp->task );
1485 vm_map_reference( entryp->aio_map );
1486
1487 AIO_LOCK;
1488
1489 if ( is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
1490 AIO_UNLOCK;
1491 result = EAGAIN;
1492 goto error_exit;
1493 }
1494
1495 /* check our aio limits to throttle bad or rude user land behavior */
1496 if ( aio_get_all_queues_count( ) >= aio_max_requests ||
1497 aio_get_process_count( procp ) >= aio_max_requests_per_process ) {
1498 AIO_UNLOCK;
1499 result = EAGAIN;
1500 goto error_exit;
1501 }
1502
1503 /*
1504 * aio_fsync calls sync up all async IO requests queued at the time
1505 * the aio_fsync call was made. So we mark each currently queued async
1506 * IO with a matching file descriptor as must complete before we do the
1507 * fsync. We set the fsyncp field of each matching async IO
1508 * request with the aiocb pointer passed in on the aio_fsync call to
1509 * know which IOs must complete before we process the aio_fsync call.
1510 */
1511 if ( (kindOfIO & AIO_FSYNC) != 0 )
1512 aio_mark_requests( entryp );
1513
1514 /* queue up on our aio asynchronous work queue */
1515 TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1516 aio_anchor.aio_async_workq_count++;
1517
1518 wakeup_one( (caddr_t) &aio_anchor.aio_async_workq );
1519 AIO_UNLOCK;
1520
1521 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
1522 (int)procp, (int)aiocbp, 0, 0, 0 );
1523
1524 return( 0 );
1525
1526 error_exit:
1527 if ( entryp != NULL ) {
1528 /* this entry has not been queued up so no worries about unlocked */
1529 /* state and aio_map */
1530 aio_free_request( entryp, entryp->aio_map );
1531 }
1532
1533 return( result );
1534
1535 } /* aio_queue_async_request */
1536
1537
1538 /*
1539 * lio_create_async_entry - allocate an aio_workq_entry and fill it in.
1540 * If all goes well return 0 and pass the aio_workq_entry pointer back to
1541 * our caller. We get a reference to our caller's user land map in order to keep
1542 * it around while we are processing the request.
1543 * lio_listio calls behave differently at completion: they do completion notification
1544 * only when all async IO requests have completed. We use group_tag to tag IO requests
1545 * that behave in this delayed notification manner.
1546 */
1547
1548 static int
1549 lio_create_async_entry( struct proc *procp, user_addr_t aiocbp,
1550 user_addr_t sigp, long group_tag,
1551 aio_workq_entry **entrypp )
1552 {
1553 aio_workq_entry *entryp;
1554 int result;
1555
1556 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1557 if ( entryp == NULL ) {
1558 result = EAGAIN;
1559 goto error_exit;
1560 }
1561 bzero( entryp, sizeof(*entryp) );
1562
1563 /* fill in the rest of the aio_workq_entry */
1564 entryp->procp = procp;
1565 entryp->uaiocbp = aiocbp;
1566 entryp->flags |= AIO_LIO;
1567 entryp->group_tag = group_tag;
1568 entryp->aio_map = VM_MAP_NULL;
1569
1570 if ( !IS_64BIT_PROCESS(procp) ) {
1571 struct aiocb aiocb32;
1572
1573 result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
1574 if ( result == 0 )
1575 do_munge_aiocb( &aiocb32, &entryp->aiocb );
1576 } else
1577 result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1578
1579 if ( result != 0 ) {
1580 result = EAGAIN;
1581 goto error_exit;
1582 }
1583
1584 /* look for lio_listio LIO_NOP requests and ignore them. */
1585 /* Not really an error, but we need to free our aio_workq_entry. */
1586 if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
1587 result = 0;
1588 goto error_exit;
1589 }
1590
1591 /* use sigevent passed in to lio_listio for each of our calls, but only */
1592 /* do completion notification after the last request completes. */
1593 if ( sigp != USER_ADDR_NULL ) {
1594 if ( !IS_64BIT_PROCESS(procp) ) {
1595 struct sigevent sigevent32;
1596
1597 result = copyin( sigp, &sigevent32, sizeof(sigevent32) );
1598 if ( result == 0 ) {
1599 /* also need to munge aio_sigevent since it contains pointers */
1600 /* special case here. since we do not know if sigev_value is an */
1601 /* int or a ptr we do NOT cast the ptr to a user_addr_t. This */
1602 /* means if we send this info back to user space we need to remember */
1603 /* sigev_value was not expanded for the 32-bit case. */
1604 /* NOTE - this does NOT affect us since we don't support sigev_value */
1605 /* yet in the aio context. */
1606 //LP64
1607 entryp->aiocb.aio_sigevent.sigev_notify = sigevent32.sigev_notify;
1608 entryp->aiocb.aio_sigevent.sigev_signo = sigevent32.sigev_signo;
1609 entryp->aiocb.aio_sigevent.sigev_value.size_equivalent.sival_int =
1610 sigevent32.sigev_value.sival_int;
1611 entryp->aiocb.aio_sigevent.sigev_notify_function =
1612 CAST_USER_ADDR_T(sigevent32.sigev_notify_function);
1613 entryp->aiocb.aio_sigevent.sigev_notify_attributes =
1614 CAST_USER_ADDR_T(sigevent32.sigev_notify_attributes);
1615 }
1616 } else
1617 result = copyin( sigp, &entryp->aiocb.aio_sigevent, sizeof(entryp->aiocb.aio_sigevent) );
1618
1619 if ( result != 0 ) {
1620 result = EAGAIN;
1621 goto error_exit;
1622 }
1623 }
1624
1625 /* do some more validation on the aiocb and embedded file descriptor */
1626 result = aio_validate( entryp );
1627 if ( result != 0 )
1628 goto error_exit;
1629
1630 /* get a reference to the user land map in order to keep it around */
1631 entryp->aio_map = get_task_map( procp->task );
1632 vm_map_reference( entryp->aio_map );
1633
1634 *entrypp = entryp;
1635 return( 0 );
1636
1637 error_exit:
1638 if ( entryp != NULL )
1639 zfree( aio_workq_zonep, entryp );
1640
1641 return( result );
1642
1643 } /* lio_create_async_entry */
1644
1645
1646 /*
1647 * aio_mark_requests - aio_fsync calls synchronize file data for all queued async IO
1648 * requests at the moment the aio_fsync call is queued. We use aio_workq_entry.fsyncp
1649 * to mark each async IO that must complete before the fsync is done. We use the uaiocbp
1650 * field from the aio_fsync call as the aio_workq_entry.fsyncp in marked requests.
1651 * NOTE - AIO_LOCK must be held by caller
1652 */
1653
1654 static void
1655 aio_mark_requests( aio_workq_entry *entryp )
1656 {
1657 aio_workq_entry *my_entryp;
1658
1659 TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1660 if ( entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1661 my_entryp->fsyncp = entryp->uaiocbp;
1662 }
1663 }
1664
1665 TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
1666 if ( entryp->procp == my_entryp->procp &&
1667 entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1668 my_entryp->fsyncp = entryp->uaiocbp;
1669 }
1670 }
1671
1672 } /* aio_mark_requests */
1673
1674
1675 /*
1676 * lio_create_sync_entry - allocate an aio_workq_entry and fill it in.
1677 * If all goes well return 0 and pass the aio_workq_entry pointer back to
1678 * our caller.
1679 * lio_listio calls behave differently at completion: they do completion notification
1680 * only when all async IO requests have completed. We use group_tag to tag IO requests
1681 * that behave in this delayed notification manner.
1682 */
1683
1684 static int
1685 lio_create_sync_entry( struct proc *procp, user_addr_t aiocbp,
1686 long group_tag, aio_workq_entry **entrypp )
1687 {
1688 aio_workq_entry *entryp;
1689 int result;
1690
1691 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1692 if ( entryp == NULL ) {
1693 result = EAGAIN;
1694 goto error_exit;
1695 }
1696 bzero( entryp, sizeof(*entryp) );
1697
1698 /* fill in the rest of the aio_workq_entry */
1699 entryp->procp = procp;
1700 entryp->uaiocbp = aiocbp;
1701 entryp->flags |= AIO_LIO;
1702 entryp->group_tag = group_tag;
1703 entryp->aio_map = VM_MAP_NULL;
1704
1705 if ( !IS_64BIT_PROCESS(procp) ) {
1706 struct aiocb aiocb32;
1707
1708 result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
1709 if ( result == 0 )
1710 do_munge_aiocb( &aiocb32, &entryp->aiocb );
1711 } else
1712 result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1713
1714 if ( result != 0 ) {
1715 result = EAGAIN;
1716 goto error_exit;
1717 }
1718
1719 /* look for lio_listio LIO_NOP requests and ignore them. */
1720 /* Not really an error, but we need to free our aio_workq_entry. */
1721 if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
1722 result = 0;
1723 goto error_exit;
1724 }
1725
1726 result = aio_validate( entryp );
1727 if ( result != 0 ) {
1728 goto error_exit;
1729 }
1730
1731 *entrypp = entryp;
1732 return( 0 );
1733
1734 error_exit:
1735 if ( entryp != NULL )
1736 zfree( aio_workq_zonep, entryp );
1737
1738 return( result );
1739
1740 } /* lio_create_sync_entry */
1741
1742
1743 /*
1744 * aio_free_request - remove our reference on the user land map and
1745 * free the work queue entry resources.
1746 * We are not holding the lock here; thus aio_map is passed in, having been
1747 * zeroed while we did hold the lock.
1748 */
1749
1750 static int
1751 aio_free_request( aio_workq_entry *entryp, vm_map_t the_map )
1752 {
1753 /* remove our reference to the user land map. */
1754 if ( VM_MAP_NULL != the_map ) {
1755 vm_map_deallocate( the_map );
1756 }
1757
1758 zfree( aio_workq_zonep, entryp );
1759
1760 return( 0 );
1761
1762 } /* aio_free_request */
1763
1764
1765 /* aio_validate - validate the aiocb passed in by one of the aio syscalls.
1766 */
1767
1768 static int
1769 aio_validate( aio_workq_entry *entryp )
1770 {
1771 struct fileproc *fp;
1772 int flag;
1773 int result;
1774
1775 result = 0;
1776
1777 if ( (entryp->flags & AIO_LIO) != 0 ) {
1778 if ( entryp->aiocb.aio_lio_opcode == LIO_READ )
1779 entryp->flags |= AIO_READ;
1780 else if ( entryp->aiocb.aio_lio_opcode == LIO_WRITE )
1781 entryp->flags |= AIO_WRITE;
1782 else if ( entryp->aiocb.aio_lio_opcode == LIO_NOP )
1783 return( 0 );
1784 else
1785 return( EINVAL );
1786 }
1787
1788 flag = FREAD;
1789 if ( (entryp->flags & (AIO_WRITE | AIO_FSYNC)) != 0 ) {
1790 flag = FWRITE;
1791 }
1792
1793 if ( (entryp->flags & (AIO_READ | AIO_WRITE)) != 0 ) {
1794 // LP64todo - does max value for aio_nbytes need to grow?
1795 if ( entryp->aiocb.aio_nbytes > INT_MAX ||
1796 entryp->aiocb.aio_buf == USER_ADDR_NULL ||
1797 entryp->aiocb.aio_offset < 0 )
1798 return( EINVAL );
1799 }
1800
1801 /* validate aiocb.aio_sigevent. at this point we only support sigev_notify
1802 * equal to SIGEV_SIGNAL or SIGEV_NONE. this means sigev_value,
1803 * sigev_notify_function, and sigev_notify_attributes are ignored.
1804 */
1805 if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ) {
1806 int signum;
1807 /* make sure we have a valid signal number */
1808 signum = entryp->aiocb.aio_sigevent.sigev_signo;
1809 if ( signum <= 0 || signum >= NSIG ||
1810 signum == SIGKILL || signum == SIGSTOP )
1811 return (EINVAL);
1812 }
1813 else if ( entryp->aiocb.aio_sigevent.sigev_notify != SIGEV_NONE )
1814 return (EINVAL);
1815
1816 /* validate the file descriptor and that the file was opened
1817 * for the appropriate read / write access.
1818 */
1819 proc_fdlock(entryp->procp);
1820
1821 result = fp_lookup( entryp->procp, entryp->aiocb.aio_fildes, &fp , 1);
1822 if ( result == 0 ) {
1823 if ( (fp->f_fglob->fg_flag & flag) == 0 ) {
1824 /* we don't have read or write access */
1825 result = EBADF;
1826 }
1827 else if ( fp->f_fglob->fg_type != DTYPE_VNODE ) {
1828 /* this is not a file */
1829 result = ESPIPE;
1830 } else
1831 fp->f_flags |= FP_AIOISSUED;
1832
1833 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp , 1);
1834 }
1835 else {
1836 result = EBADF;
1837 }
1838
1839 proc_fdunlock(entryp->procp);
1840
1841 return( result );
1842
1843 } /* aio_validate */
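
/*
 * Illustrative sketch only (user space): an aiocb that satisfies the checks
 * performed by aio_validate() above -- a vnode-backed descriptor opened with
 * the matching read/write access, aio_nbytes no larger than INT_MAX, a
 * non-NULL buffer, a non-negative offset, and sigev_notify of SIGEV_NONE or
 * SIGEV_SIGNAL with a catchable signal.  "fd" and "buf" are hypothetical.
 *
 *	struct aiocb cb;
 *	memset( &cb, 0, sizeof(cb) );
 *	cb.aio_fildes = fd;				// regular file, opened for reading
 *	cb.aio_buf = buf;				// must not be NULL
 *	cb.aio_nbytes = sizeof(buf);			// must not exceed INT_MAX
 *	cb.aio_offset = 0;				// must not be negative
 *	cb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;	// or SIGEV_NONE
 *	cb.aio_sigevent.sigev_signo = SIGUSR1;		// not SIGKILL or SIGSTOP
 */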
1844
1845
1846 /*
1847 * aio_get_process_count - runs through our queues that hold outstanding
1848 * async IO requests and totals up the number of requests for the given
1849 * process.
1850 * NOTE - caller must hold aio lock!
1851 */
1852
1853 static int
1854 aio_get_process_count( struct proc *procp )
1855 {
1856 aio_workq_entry *entryp;
1857 int count;
1858
1859 /* begin with count of completed async IO requests for this process */
1860 count = procp->aio_done_count;
1861
1862 /* add in count of active async IO requests for this process */
1863 count += procp->aio_active_count;
1864
1865 /* look for matches on our queue of asynchronous todo work */
1866 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
1867 if ( procp == entryp->procp ) {
1868 count++;
1869 }
1870 }
1871
1872 /* look for matches on our queue of synchronous todo work */
1873 TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
1874 if ( procp == entryp->procp ) {
1875 count++;
1876 }
1877 }
1878
1879 return( count );
1880
1881 } /* aio_get_process_count */
1882
1883
1884 /*
1885 * aio_get_all_queues_count - get total number of entries on all aio work queues.
1886 * NOTE - caller must hold aio lock!
1887 */
1888
1889 static int
1890 aio_get_all_queues_count( void )
1891 {
1892 int count;
1893
1894 count = aio_anchor.aio_async_workq_count;
1895 count += aio_anchor.lio_sync_workq_count;
1896 count += aio_anchor.aio_active_count;
1897 count += aio_anchor.aio_done_count;
1898
1899 return( count );
1900
1901 } /* aio_get_all_queues_count */
1902
1903
1904 /*
1905 * do_aio_completion. Handle async IO completion.
1906 */
1907
1908 static void
1909 do_aio_completion( aio_workq_entry *entryp )
1910 {
1911 /* signal user land process if appropriate */
1912 if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
1913 (entryp->flags & AIO_DISABLE) == 0 ) {
1914
1915 /*
1916 * if group_tag is non zero then make sure this is the last IO request
1917 * in the group before we signal.
1918 */
1919 if ( entryp->group_tag == 0 ||
1920 (entryp->group_tag != 0 && aio_last_group_io( entryp )) ) {
1921 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) | DBG_FUNC_NONE,
1922 (int)entryp->procp, (int)entryp->uaiocbp,
1923 entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 );
1924
1925 psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );
1926 return;
1927 }
1928 }
1929
1930 /*
1931 * need to handle case where a process is trying to exit, exec, or close
1932 * and is currently waiting for active aio requests to complete. If
1933 * AIO_WAITING is set then we need to look to see if there are any
1934 * other requests in the active queue for this process. If there are
1935 * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel. If
1936 * there are some still active then do nothing - we only want to wakeup
1937 * when all active aio requests for the process are complete.
1938 */
1939 if ( (entryp->flags & AIO_WAITING) != 0 ) {
1940 int active_requests;
1941
1942 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
1943 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1944
1945 AIO_LOCK;
1946 active_requests = aio_active_requests_for_process( entryp->procp );
1947 //AIO_UNLOCK;
1948 if ( active_requests < 1 ) {
1949 /* no active aio requests for this process, continue exiting */
1950 wakeup_one( (caddr_t) &entryp->procp->AIO_CLEANUP_SLEEP_CHAN );
1951
1952 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
1953 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1954 }
1955 AIO_UNLOCK;
1956 return;
1957 }
1958
1959 /*
1960 * aio_suspend case when a signal was not requested. In that scenario we
1961 * are sleeping on the AIO_SUSPEND_SLEEP_CHAN channel.
1962 * NOTE - the assumption here is that this wakeup call is inexpensive.
1963 * we really only need to do this when an aio_suspend call is pending.
1964 * If we find the wakeup call should be avoided we could mark the
1965 * async IO requests given in the list provided by aio_suspend and only
1966 * call wakeup for them. If we do mark them we should unmark them after
1967 * the aio_suspend wakes up.
1968 */
1969 AIO_LOCK;
1970 wakeup_one( (caddr_t) &entryp->procp->AIO_SUSPEND_SLEEP_CHAN );
1971 AIO_UNLOCK;
1972
1973 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE,
1974 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1975
1976 return;
1977
1978 } /* do_aio_completion */
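
/*
 * Illustrative sketch only (user space): one way a caller might arrange to
 * receive the SIGEV_SIGNAL completion that psignal() delivers above.  The
 * handler name and the choice of SIGUSR1 are hypothetical.
 *
 *	static void aio_done_handler( int signo )
 *	{
 *		// completion noted; results are collected later with
 *		// aio_error() and aio_return()
 *	}
 *	...
 *	struct sigaction sa;
 *	memset( &sa, 0, sizeof(sa) );
 *	sa.sa_handler = aio_done_handler;
 *	sigaction( SIGUSR1, &sa, NULL );
 *
 *	cb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
 *	cb.aio_sigevent.sigev_signo = SIGUSR1;
 *	aio_write( &cb );
 */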
1979
1980
1981 /*
1982 * aio_last_group_io - checks to see if this is the last unfinished IO request
1983 * for the given group_tag. Returns TRUE if there are no other active IO
1984 * requests for this group, or FALSE if there are other active IO requests.
1985 * NOTE - AIO_LOCK must be held by caller
1986 */
1987
1988 static boolean_t
1989 aio_last_group_io( aio_workq_entry *entryp )
1990 {
1991 aio_workq_entry *my_entryp;
1992
1993 /* look for matches on our queue of active async IO requests */
1994 TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1995 if ( my_entryp->group_tag == entryp->group_tag )
1996 return( FALSE );
1997 }
1998
1999 /* look for matches on our queue of asynchronous todo work */
2000 TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
2001 if ( my_entryp->group_tag == entryp->group_tag )
2002 return( FALSE );
2003 }
2004
2005 /* look for matches on our queue of synchronous todo work */
2006 TAILQ_FOREACH( my_entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
2007 if ( my_entryp->group_tag == entryp->group_tag )
2008 return( FALSE );
2009 }
2010
2011 return( TRUE );
2012
2013 } /* aio_last_group_io */
2014
2015
2016 /*
2017 * do_aio_read
2018 */
2019 static int
2020 do_aio_read( aio_workq_entry *entryp )
2021 {
2022 struct fileproc *fp;
2023 int error;
2024
2025 if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp , 0)) )
2026 return(error);
2027 if ( (fp->f_fglob->fg_flag & FREAD) == 0 ) {
2028 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2029 return(EBADF);
2030 }
2031 if ( fp != NULL ) {
2032 error = dofileread( entryp->procp, fp, entryp->aiocb.aio_fildes,
2033 entryp->aiocb.aio_buf,
2034 entryp->aiocb.aio_nbytes,
2035 entryp->aiocb.aio_offset, FOF_OFFSET,
2036 &entryp->returnval );
2037 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2038 }
2039 else {
2040 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2041 error = EBADF;
2042 }
2043
2044 return( error );
2045
2046 } /* do_aio_read */
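
/*
 * Illustrative sketch only (user space): the aio_read() request that ends up
 * being serviced by do_aio_read() above.  Note that the transfer always uses
 * the offset in the aiocb (FOF_OFFSET), not the descriptor's file position.
 * "fd" and "buf" are hypothetical.
 *
 *	struct aiocb cb;
 *	memset( &cb, 0, sizeof(cb) );
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = sizeof(buf);
 *	cb.aio_offset = 0;
 *
 *	if ( aio_read( &cb ) == 0 ) {
 *		const struct aiocb *waitlist[1] = { &cb };
 *		aio_suspend( waitlist, 1, NULL );	// block until completion
 *		if ( aio_error( &cb ) == 0 ) {
 *			ssize_t nread = aio_return( &cb );
 *			// ... use nread ...
 *		}
 *	}
 */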
2047
2048
2049 /*
2050 * do_aio_write
2051 */
2052 static int
2053 do_aio_write( aio_workq_entry *entryp )
2054 {
2055 struct fileproc *fp;
2056 int error;
2057
2058 if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp , 0)) )
2059 return(error);
2060 if ( (fp->f_fglob->fg_flag & FWRITE) == 0 ) {
2061 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2062 return(EBADF);
2063 }
2064 if ( fp != NULL ) {
2065 error = dofilewrite( entryp->procp, fp, entryp->aiocb.aio_fildes,
2066 entryp->aiocb.aio_buf,
2067 entryp->aiocb.aio_nbytes,
2068 entryp->aiocb.aio_offset, FOF_OFFSET,
2069 &entryp->returnval );
2070
2071 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2072 }
2073 else {
2074 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2075 error = EBADF;
2076 }
2077
2078 return( error );
2079
2080 } /* do_aio_write */
2081
2082
2083 /*
2084 * aio_active_requests_for_process - return number of active async IO
2085 * requests for the given process.
2086 * NOTE - caller must hold aio lock!
2087 */
2088
2089 static int
2090 aio_active_requests_for_process( struct proc *procp )
2091 {
2092
2093 return( procp->aio_active_count );
2094
2095 } /* aio_active_requests_for_process */
2096
2097
2098 /*
2099 * do_aio_fsync
2100 */
2101 static int
2102 do_aio_fsync( aio_workq_entry *entryp )
2103 {
2104 struct vfs_context context;
2105 struct vnode *vp;
2106 struct fileproc *fp;
2107 int error;
2108
2109 /*
2110 * NOTE - we will not support AIO_DSYNC until fdatasync() is supported.
2111 * AIO_DSYNC is caught before we queue up a request and flagged as an error.
2112 * The following was shamelessly extracted from the fsync() implementation.
2113 */
2114
2115 error = fp_getfvp( entryp->procp, entryp->aiocb.aio_fildes, &fp, &vp);
2116 if ( error == 0 ) {
2117 if ( (error = vnode_getwithref(vp)) ) {
2118 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2119 entryp->returnval = -1;
2120 return(error);
2121 }
2122 context.vc_proc = entryp->procp;
2123 context.vc_ucred = fp->f_fglob->fg_cred;
2124
2125 error = VNOP_FSYNC( vp, MNT_WAIT, &context);
2126
2127 (void)vnode_put(vp);
2128
2129 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2130 }
2131 if ( error != 0 )
2132 entryp->returnval = -1;
2133
2134 return( error );
2135
2136 } /* do_aio_fsync */
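
/*
 * Illustrative sketch only (user space): queueing the asynchronous fsync
 * serviced by do_aio_fsync() above.  As noted in the comment above, only the
 * O_SYNC style is supported; a fdatasync()-style request is flagged as an
 * error before it is ever queued.  "fd" is a hypothetical descriptor with
 * pending writes.
 *
 *	struct aiocb cb;
 *	memset( &cb, 0, sizeof(cb) );
 *	cb.aio_fildes = fd;
 *
 *	if ( aio_fsync( O_SYNC, &cb ) == 0 ) {
 *		while ( aio_error( &cb ) == EINPROGRESS )
 *			;			// or wait with aio_suspend()
 *		(void) aio_return( &cb );	// collect the final status
 *	}
 */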
2137
2138
2139 /*
2140 * is_already_queued - runs through our queues to see if the given
2141 * aiocbp / process is there. Returns TRUE if there is a match
2142 * on any of our aio queues.
2143 * NOTE - callers must hold aio lock!
2144 */
2145
2146 static boolean_t
2147 is_already_queued( struct proc *procp,
2148 user_addr_t aiocbp )
2149 {
2150 aio_workq_entry *entryp;
2151 boolean_t result;
2152
2153 result = FALSE;
2154
2155 /* look for matches on our queue of async IO requests that have completed */
2156 TAILQ_FOREACH( entryp, &procp->aio_doneq, aio_workq_link ) {
2157 if ( aiocbp == entryp->uaiocbp ) {
2158 result = TRUE;
2159 goto ExitThisRoutine;
2160 }
2161 }
2162
2163 /* look for matches on our queue of active async IO requests */
2164 TAILQ_FOREACH( entryp, &procp->aio_activeq, aio_workq_link ) {
2165 if ( aiocbp == entryp->uaiocbp ) {
2166 result = TRUE;
2167 goto ExitThisRoutine;
2168 }
2169 }
2170
2171 /* look for matches on our queue of asynchronous todo work */
2172 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
2173 if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
2174 result = TRUE;
2175 goto ExitThisRoutine;
2176 }
2177 }
2178
2179 /* look for matches on our queue of synchronous todo work */
2180 TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
2181 if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
2182 result = TRUE;
2183 goto ExitThisRoutine;
2184 }
2185 }
2186
2187 ExitThisRoutine:
2188 return( result );
2189
2190 } /* is_already_queued */
2191
2192
2193 /*
2194 * aio initialization
2195 */
2196 __private_extern__ void
2197 aio_init( void )
2198 {
2199 int i;
2200
2201 aio_lock_grp_attr = lck_grp_attr_alloc_init();
2202 lck_grp_attr_setstat(aio_lock_grp_attr);
2203 aio_lock_grp = lck_grp_alloc_init("aio", aio_lock_grp_attr);
2204 aio_lock_attr = lck_attr_alloc_init();
2205 //lck_attr_setdebug(aio_lock_attr);
2206
2207 aio_lock = lck_mtx_alloc_init(aio_lock_grp, aio_lock_attr);
2208
2209 AIO_LOCK;
2210 TAILQ_INIT( &aio_anchor.aio_async_workq );
2211 TAILQ_INIT( &aio_anchor.lio_sync_workq );
2212 aio_anchor.aio_async_workq_count = 0;
2213 aio_anchor.lio_sync_workq_count = 0;
2214 aio_anchor.aio_active_count = 0;
2215 aio_anchor.aio_done_count = 0;
2216 AIO_UNLOCK;
2217
2218 i = sizeof( aio_workq_entry );
2219 aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );
2220
2221 _aio_create_worker_threads( aio_worker_threads );
2222
2223 return;
2224
2225 } /* aio_init */
2226
2227
2228 /*
2229 * aio worker threads created here.
2230 */
2231 __private_extern__ void
2232 _aio_create_worker_threads( int num )
2233 {
2234 int i;
2235
2236 /* create some worker threads to handle the async IO requests */
2237 for ( i = 0; i < num; i++ ) {
2238 thread_t myThread;
2239
2240 myThread = kernel_thread( kernel_task, aio_work_thread );
2241 if ( THREAD_NULL == myThread ) {
2242 printf( "%s - failed to create a work thread \n", __FUNCTION__ );
2243 }
2244 }
2245
2246 return;
2247
2248 } /* _aio_create_worker_threads */
2249
2250 /*
2251 * Return the current activation utask
2252 */
2253 task_t
2254 get_aiotask(void)
2255 {
2256 return ((struct uthread *)get_bsdthread_info(current_thread()))->uu_aio_task;
2257 }
2258
2259
2260 /*
2261 * In the case of an aiocb from a 32-bit process we need to expand some
2262 * longs and pointers to the correct sizes so that downstream code can
2263 * always work on the same type of aiocb (in our case that is a
2264 * user_aiocb).
2265 */
2266 static void
2267 do_munge_aiocb( struct aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
2268 {
2269 the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2270 the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2271 the_user_aiocbp->aio_buf = CAST_USER_ADDR_T(my_aiocbp->aio_buf);
2272 the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2273 the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2274 the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
2275
2276 /* special case here. since we do not know if sigev_value is an */
2277 /* int or a ptr we do NOT cast the ptr to a user_addr_t. This */
2278 /* means if we send this info back to user space we need to remember */
2279 /* sigev_value was not expanded for the 32-bit case. */
2280 /* NOTE - this does NOT affect us since we don't support sigev_value */
2281 /* yet in the aio context. */
2282 //LP64
2283 the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2284 the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
2285 the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2286 my_aiocbp->aio_sigevent.sigev_value.sival_int;
2287 the_user_aiocbp->aio_sigevent.sigev_notify_function =
2288 CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function);
2289 the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2290 CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_attributes);
2291 }