bsd/kern/kern_aio.c (apple/xnu, xnu-792.1.5)
1 /*
2 * Copyright (c) 2003-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22
23
24 /*
25 * todo:
26 * 1) ramesh is looking into how to replace taking a reference on
27 * the user's map (vm_map_reference()) since it is believed that this
28 * would not hold the process for us.
29 * 2) david is looking into a way for us to set the priority of the
30 * worker threads to match that of the user's thread when the
31 * async IO was queued.
32 */
33
34
35 /*
36 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
37 */
38
39 #include <sys/systm.h>
40 #include <sys/fcntl.h>
41 #include <sys/file_internal.h>
42 #include <sys/filedesc.h>
43 #include <sys/kernel.h>
44 #include <sys/vnode_internal.h>
45 #include <sys/malloc.h>
46 #include <sys/mount_internal.h>
47 #include <sys/param.h>
48 #include <sys/proc_internal.h>
49 #include <sys/sysctl.h>
50 #include <sys/unistd.h>
51 #include <sys/user.h>
52
53 #include <sys/aio_kern.h>
54 #include <sys/sysproto.h>
55
56 #include <machine/limits.h>
57
58 #include <mach/mach_types.h>
59 #include <kern/kern_types.h>
60 #include <kern/zalloc.h>
61 #include <kern/task.h>
62 #include <kern/sched_prim.h>
63
64 #include <vm/vm_map.h>
65
66 #include <sys/kdebug.h>
67 #define AIO_work_queued 1
68 #define AIO_worker_wake 2
69 #define AIO_completion_sig 3
70 #define AIO_completion_cleanup_wait 4
71 #define AIO_completion_cleanup_wake 5
72 #define AIO_completion_suspend_wake 6
73 #define AIO_fsync_delay 7
74 #define AIO_cancel 10
75 #define AIO_cancel_async_workq 11
76 #define AIO_cancel_sync_workq 12
77 #define AIO_cancel_activeq 13
78 #define AIO_cancel_doneq 14
79 #define AIO_fsync 20
80 #define AIO_read 30
81 #define AIO_write 40
82 #define AIO_listio 50
83 #define AIO_error 60
84 #define AIO_error_val 61
85 #define AIO_error_activeq 62
86 #define AIO_error_workq 63
87 #define AIO_return 70
88 #define AIO_return_val 71
89 #define AIO_return_activeq 72
90 #define AIO_return_workq 73
91 #define AIO_exec 80
92 #define AIO_exit 90
93 #define AIO_exit_sleep 91
94 #define AIO_close 100
95 #define AIO_close_sleep 101
96 #define AIO_suspend 110
97 #define AIO_suspend_sleep 111
98 #define AIO_worker_thread 120
99
100 #if 0
101 #undef KERNEL_DEBUG
102 #define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
103 #endif
104
105 /*
106 * aio requests queue up on the aio_async_workq or lio_sync_workq (for
107 * lio_listio LIO_WAIT). Requests then move to the per process aio_activeq
108 * (proc.aio_activeq) when one of our worker threads starts the IO.
109 * And finally, requests move to the per process aio_doneq (proc.aio_doneq)
110 * when the IO request completes. The request remains on aio_doneq until
111 * the user process calls aio_return or the process exits; either way, that is our
112 * trigger to release aio resources.
113 */
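/*
 * A compact restatement of that lifecycle (illustrative only; it mirrors the
 * comment above and adds nothing new):
 *
 *	aio_async_workq / lio_sync_workq    request queued by aio_read(), aio_write(),
 *	         |                          aio_fsync(), or lio_listio()
 *	         v
 *	proc.aio_activeq                    a worker thread has started the IO
 *	         |
 *	         v
 *	proc.aio_doneq                      IO complete; entry released by aio_return()
 *	                                    or when the process exits
 */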
114 struct aio_anchor_cb
115 {
116 int aio_async_workq_count; /* entries on aio_async_workq */
117 int lio_sync_workq_count; /* entries on lio_sync_workq */
118 int aio_active_count; /* entries on all active queues (proc.aio_activeq) */
119 int aio_done_count; /* entries on all done queues (proc.aio_doneq) */
120 TAILQ_HEAD( , aio_workq_entry ) aio_async_workq;
121 TAILQ_HEAD( , aio_workq_entry ) lio_sync_workq;
122 };
123 typedef struct aio_anchor_cb aio_anchor_cb;
124
125
126 /*
127 * Notes on aio sleep / wake channels.
128 * We currently pick a couple of fields within the proc structure to use as
129 * sleep channels that do not collide with any other kernel routines.
130 * At this time, for binary compatibility reasons, we cannot create new proc fields.
131 */
132 #define AIO_SUSPEND_SLEEP_CHAN p_estcpu
133 #define AIO_CLEANUP_SLEEP_CHAN p_pctcpu
134
135
136 /*
137 * async IO locking macros used to protect critical sections.
138 */
139 #define AIO_LOCK lck_mtx_lock(aio_lock)
140 #define AIO_UNLOCK lck_mtx_unlock(aio_lock)
141
142
143 /*
144 * LOCAL PROTOTYPES
145 */
146 static int aio_active_requests_for_process( struct proc *procp );
147 static boolean_t aio_delay_fsync_request( aio_workq_entry *entryp );
148 static int aio_free_request( aio_workq_entry *entryp, vm_map_t the_map );
149 static int aio_get_all_queues_count( void );
150 static int aio_get_process_count( struct proc *procp );
151 static aio_workq_entry * aio_get_some_work( void );
152 static boolean_t aio_last_group_io( aio_workq_entry *entryp );
153 static void aio_mark_requests( aio_workq_entry *entryp );
154 static int aio_queue_async_request( struct proc *procp,
155 user_addr_t aiocbp,
156 int kindOfIO );
157 static int aio_validate( aio_workq_entry *entryp );
158 static void aio_work_thread( void );
159 static int do_aio_cancel( struct proc *p,
160 int fd,
161 user_addr_t aiocbp,
162 boolean_t wait_for_completion,
163 boolean_t disable_notification );
164 static void do_aio_completion( aio_workq_entry *entryp );
165 static int do_aio_fsync( aio_workq_entry *entryp );
166 static int do_aio_read( aio_workq_entry *entryp );
167 static int do_aio_write( aio_workq_entry *entryp );
168 static void do_munge_aiocb( struct aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
169 static boolean_t is_already_queued( struct proc *procp,
170 user_addr_t aiocbp );
171 static int lio_create_async_entry( struct proc *procp,
172 user_addr_t aiocbp,
173 user_addr_t sigp,
174 long group_tag,
175 aio_workq_entry **entrypp );
176 static int lio_create_sync_entry( struct proc *procp,
177 user_addr_t aiocbp,
178 long group_tag,
179 aio_workq_entry **entrypp );
180
181
182 /*
183 * EXTERNAL PROTOTYPES
184 */
185
186 /* in ...bsd/kern/sys_generic.c */
187 extern int dofileread( struct proc *p, struct fileproc *fp, int fd,
188 user_addr_t bufp, user_size_t nbyte,
189 off_t offset, int flags, user_ssize_t *retval );
190 extern int dofilewrite( struct proc *p, struct fileproc *fp, int fd,
191 user_addr_t bufp, user_size_t nbyte, off_t offset,
192 int flags, user_ssize_t *retval );
193
194 /*
195 * aio external global variables.
196 */
197 extern int aio_max_requests; /* AIO_MAX - configurable */
198 extern int aio_max_requests_per_process; /* AIO_PROCESS_MAX - configurable */
199 extern int aio_worker_threads; /* AIO_THREAD_COUNT - configurable */
200
201
202 /*
203 * aio static variables.
204 */
205 static aio_anchor_cb aio_anchor;
206 static lck_mtx_t * aio_lock;
207 static lck_grp_t * aio_lock_grp;
208 static lck_attr_t * aio_lock_attr;
209 static lck_grp_attr_t * aio_lock_grp_attr;
210 static struct zone *aio_workq_zonep;
211
212
213
214
215 /*
216 * aio_cancel - attempt to cancel one or more async IO requests currently
217 * outstanding against file descriptor uap->fd. If uap->aiocbp is not
218 * NULL then only one specific IO is cancelled (if possible). If uap->aiocbp
219 * is NULL then all outstanding async IO requests for the given file
220 * descriptor are cancelled (if possible).
221 */
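/*
 * Illustrative user-land sketch (not part of this file; standard POSIX usage
 * with an aiocb named "cb" assumed for the example) showing how the results
 * of this system call are typically interpreted:
 *
 *	struct aiocb cb;                        // submitted earlier via aio_read()/aio_write()
 *	switch ( aio_cancel( cb.aio_fildes, &cb ) ) {
 *	case AIO_CANCELED:     // removed from the todo queue before it ever ran
 *	        break;
 *	case AIO_NOTCANCELED:  // already active; poll aio_error() until it finishes
 *	        break;
 *	case AIO_ALLDONE:      // already complete; reap it with aio_return()
 *	        break;
 *	default:               // -1 with errno set (e.g. EBADF) - no match found
 *	        break;
 *	}
 */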
222
223 int
224 aio_cancel( struct proc *p, struct aio_cancel_args *uap, int *retval )
225 {
226 struct user_aiocb my_aiocb;
227 int result;
228
229 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START,
230 (int)p, (int)uap->aiocbp, 0, 0, 0 );
231
232 /* quick check to see if there are any async IO requests queued up */
233 AIO_LOCK;
234 result = aio_get_all_queues_count( );
235 AIO_UNLOCK;
236 if ( result < 1 ) {
237 result = EBADF;
238 goto ExitRoutine;
239 }
240
241 *retval = -1;
242 if ( uap->aiocbp != USER_ADDR_NULL ) {
243 if ( !IS_64BIT_PROCESS(p) ) {
244 struct aiocb aiocb32;
245
246 result = copyin( uap->aiocbp, &aiocb32, sizeof(aiocb32) );
247 if ( result == 0 )
248 do_munge_aiocb( &aiocb32, &my_aiocb );
249 } else
250 result = copyin( uap->aiocbp, &my_aiocb, sizeof(my_aiocb) );
251
252 if ( result != 0 ) {
253 result = EAGAIN;
254 goto ExitRoutine;
255 }
256
257 /* NOTE - POSIX standard says a mismatch between the file */
258 /* descriptor passed in and the file descriptor embedded in */
259 /* the aiocb causes unspecified results. We return EBADF in */
260 /* that situation. */
261 if ( uap->fd != my_aiocb.aio_fildes ) {
262 result = EBADF;
263 goto ExitRoutine;
264 }
265 }
266 result = do_aio_cancel( p, uap->fd, uap->aiocbp, FALSE, FALSE );
267
268 if ( result != -1 ) {
269 *retval = result;
270 result = 0;
271 goto ExitRoutine;
272 }
273
274 result = EBADF;
275
276 ExitRoutine:
277 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END,
278 (int)p, (int)uap->aiocbp, result, 0, 0 );
279
280 return( result );
281
282 } /* aio_cancel */
283
284
285 /*
286 * _aio_close - internal function used to clean up async IO requests for
287 * a file descriptor that is closing.
288 * THIS MAY BLOCK.
289 */
290
291 __private_extern__ void
292 _aio_close( struct proc *p, int fd )
293 {
294 int error, count;
295
296 /* quick check to see if there are any async IO requests queued up */
297 AIO_LOCK;
298 count = aio_get_all_queues_count( );
299 AIO_UNLOCK;
300 if ( count < 1 )
301 return;
302
303 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_START,
304 (int)p, fd, 0, 0, 0 );
305
306 /* cancel all async IO requests on our todo queues for this file descriptor */
307 error = do_aio_cancel( p, fd, 0, TRUE, FALSE );
308 if ( error == AIO_NOTCANCELED ) {
309 /*
310 * AIO_NOTCANCELED is returned when we find an aio request for this process
311 * and file descriptor on the active async IO queue. Active requests cannot
312 * be cancelled so we must wait for them to complete. We will get a special
313 * wake up call on our channel used to sleep for ALL active requests to
314 * complete. This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
315 * when we must wait for all active aio requests.
316 */
317
318 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) | DBG_FUNC_NONE,
319 (int)p, fd, 0, 0, 0 );
320
321 tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_close", 0 );
322 }
323
324 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_END,
325 (int)p, fd, 0, 0, 0 );
326
327 return;
328
329 } /* _aio_close */
330
331
332 /*
333 * aio_error - return the error status associated with the async IO
334 * request referred to by uap->aiocbp. The error status is the errno
335 * value that would be set by the corresponding IO request (read, write,
336 * fdatasync, or sync).
337 */
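/*
 * Illustrative user-land sketch (not part of this file): the usual completion
 * poll built on aio_error() and aio_return().  The aiocb name "cb" is assumed
 * for the example.
 *
 *	int err;
 *	while ( (err = aio_error( &cb )) == EINPROGRESS )
 *	        ;                               // or block in aio_suspend() instead of spinning
 *	if ( err == 0 ) {
 *	        ssize_t nbytes = aio_return( &cb );  // reaps the request and frees kernel resources
 *	}
 *	else {
 *	        // err is the errno the equivalent read(), write(), or fsync() would have set
 *	}
 */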
338
339 int
340 aio_error( struct proc *p, struct aio_error_args *uap, int *retval )
341 {
342 aio_workq_entry *entryp;
343 int error;
344
345 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_START,
346 (int)p, (int)uap->aiocbp, 0, 0, 0 );
347
348 AIO_LOCK;
349
350 /* quick check to see if there are any async IO requests queued up */
351 if ( aio_get_all_queues_count( ) < 1 ) {
352 error = EINVAL;
353 goto ExitRoutine;
354 }
355
356 /* look for a match on our queue of async IO requests that have completed */
357 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
358 if ( entryp->uaiocbp == uap->aiocbp ) {
359 *retval = entryp->errorval;
360 error = 0;
361 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) | DBG_FUNC_NONE,
362 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
363 goto ExitRoutine;
364 }
365 }
366
367 /* look for a match on our queue of active async IO requests */
368 TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
369 if ( entryp->uaiocbp == uap->aiocbp ) {
370 *retval = EINPROGRESS;
371 error = 0;
372 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) | DBG_FUNC_NONE,
373 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
374 goto ExitRoutine;
375 }
376 }
377
378 /* look for a match on our queue of todo work */
379 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
380 if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
381 *retval = EINPROGRESS;
382 error = 0;
383 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_workq)) | DBG_FUNC_NONE,
384 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
385 goto ExitRoutine;
386 }
387 }
388 error = EINVAL;
389
390 ExitRoutine:
391 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_END,
392 (int)p, (int)uap->aiocbp, error, 0, 0 );
393 AIO_UNLOCK;
394
395 return( error );
396
397 } /* aio_error */
398
399
400 /*
401 * aio_fsync - asynchronously force all IO operations associated
402 * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
403 * queued at the time of the call, to the synchronized IO completion state.
404 * NOTE - we do not support op O_DSYNC at this point since we do not support the
405 * fdatasync() call.
406 */
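/*
 * Illustrative user-land sketch (not part of this file): queueing an fsync
 * behind earlier writes on the same descriptor.  The names "sync_cb" and "fd"
 * are assumed for the example; as noted above, only O_SYNC (or 0) is accepted.
 *
 *	struct aiocb sync_cb = { 0 };
 *	sync_cb.aio_fildes = fd;                // same fd the earlier aio_write() calls used
 *	if ( aio_fsync( O_SYNC, &sync_cb ) == 0 ) {
 *	        // completion is reported through aio_error()/aio_return() on sync_cb,
 *	        // after every write queued before this call has finished
 *	}
 */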
407
408 int
409 aio_fsync( struct proc *p, struct aio_fsync_args *uap, int *retval )
410 {
411 int error;
412 int fsync_kind;
413
414 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_START,
415 (int)p, (int)uap->aiocbp, uap->op, 0, 0 );
416
417 *retval = 0;
418 /* 0 := O_SYNC for binary backward compatibility with Panther */
419 if (uap->op == O_SYNC || uap->op == 0)
420 fsync_kind = AIO_FSYNC;
421 #if 0 // we don't support fdatasync() call yet
422 else if ( uap->op == O_DSYNC )
423 fsync_kind = AIO_DSYNC;
424 #endif
425 else {
426 *retval = -1;
427 error = EINVAL;
428 goto ExitRoutine;
429 }
430
431 error = aio_queue_async_request( p, uap->aiocbp, fsync_kind );
432 if ( error != 0 )
433 *retval = -1;
434
435 ExitRoutine:
436 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_END,
437 (int)p, (int)uap->aiocbp, error, 0, 0 );
438
439 return( error );
440
441 } /* aio_fsync */
442
443
444 /* aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
445 * file descriptor (uap->aiocbp->aio_fildes) into the buffer
446 * (uap->aiocbp->aio_buf).
447 */
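/*
 * Illustrative user-land sketch (not part of this file): how the aiocb fields
 * named above are filled in before submission.  The names "cb", "fd", and
 * "buf" are assumed for the example.
 *
 *	struct aiocb cb = { 0 };
 *	cb.aio_fildes = fd;                     // file to read from
 *	cb.aio_buf    = buf;                    // destination buffer in the caller's address space
 *	cb.aio_nbytes = sizeof(buf);            // how many bytes to read
 *	cb.aio_offset = 0;                      // absolute file offset for this request
 *	if ( aio_read( &cb ) != 0 )
 *	        ;                               // errno says why the request was not queued
 */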
448
449 int
450 aio_read( struct proc *p, struct aio_read_args *uap, int *retval )
451 {
452 int error;
453
454 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_START,
455 (int)p, (int)uap->aiocbp, 0, 0, 0 );
456
457 *retval = 0;
458
459 error = aio_queue_async_request( p, uap->aiocbp, AIO_READ );
460 if ( error != 0 )
461 *retval = -1;
462
463 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_END,
464 (int)p, (int)uap->aiocbp, error, 0, 0 );
465
466 return( error );
467
468 } /* aio_read */
469
470
471 /*
472 * aio_return - return the return status associated with the async IO
473 * request referred to by uap->aiocbp. The return status is the value
474 * that would be returned by the corresponding IO request (read, write,
475 * fdatasync, or sync). This is where we release kernel resources
476 * held for async IO call associated with the given aiocb pointer.
477 */
478
479 int
480 aio_return( struct proc *p, struct aio_return_args *uap, user_ssize_t *retval )
481 {
482 aio_workq_entry *entryp;
483 int error;
484 boolean_t lock_held;
485
486 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_START,
487 (int)p, (int)uap->aiocbp, 0, 0, 0 );
488
489 AIO_LOCK;
490 lock_held = TRUE;
491 *retval = 0;
492
493 /* quick check to see if there are any async IO requests queued up */
494 if ( aio_get_all_queues_count( ) < 1 ) {
495 error = EINVAL;
496 goto ExitRoutine;
497 }
498
499 /* look for a match on our queue of async IO requests that have completed */
500 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
501 if ( entryp->uaiocbp == uap->aiocbp ) {
502 TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
503 aio_anchor.aio_done_count--;
504 p->aio_done_count--;
505
506 *retval = entryp->returnval;
507
508 /* we cannot free requests that are still completing */
509 if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
510 vm_map_t my_map;
511
512 my_map = entryp->aio_map;
513 entryp->aio_map = VM_MAP_NULL;
514 AIO_UNLOCK;
515 lock_held = FALSE;
516 aio_free_request( entryp, my_map );
517 }
518 else
519 /* tell completion code to free this request */
520 entryp->flags |= AIO_DO_FREE;
521 error = 0;
522 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) | DBG_FUNC_NONE,
523 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
524 goto ExitRoutine;
525 }
526 }
527
528 /* look for a match on our queue of active async IO requests */
529 TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
530 if ( entryp->uaiocbp == uap->aiocbp ) {
531 error = EINPROGRESS;
532 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) | DBG_FUNC_NONE,
533 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
534 goto ExitRoutine;
535 }
536 }
537
538 /* look for a match on our queue of todo work */
539 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
540 if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
541 error = EINPROGRESS;
542 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_workq)) | DBG_FUNC_NONE,
543 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
544 goto ExitRoutine;
545 }
546 }
547 error = EINVAL;
548
549 ExitRoutine:
550 if ( lock_held )
551 AIO_UNLOCK;
552 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_END,
553 (int)p, (int)uap->aiocbp, error, 0, 0 );
554
555 return( error );
556
557 } /* aio_return */
558
559
560 /*
561 * _aio_exec - internal function used to clean up async IO requests for
562 * a process that is going away due to exec(). We cancel any async IOs
563 * we can and wait for those already active. We also disable signaling
564 * for cancelled or active aio requests that complete.
565 * This routine MAY block!
566 */
567
568 __private_extern__ void
569 _aio_exec( struct proc *p )
570 {
571
572 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_START,
573 (int)p, 0, 0, 0, 0 );
574
575 _aio_exit( p );
576
577 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_END,
578 (int)p, 0, 0, 0, 0 );
579
580 return;
581
582 } /* _aio_exec */
583
584
585 /*
586 * _aio_exit - internal function used to clean up async IO requests for
587 * a process that is terminating (via exit() or exec() ). We cancel any async IOs
588 * we can and wait for those already active. We also disable signaling
589 * for cancelled or active aio requests that complete. This routine MAY block!
590 */
591
592 __private_extern__ void
593 _aio_exit( struct proc *p )
594 {
595 int error, count;
596 aio_workq_entry *entryp;
597
598 /* quick check to see if there are any async IO requests queued up */
599 AIO_LOCK;
600 count = aio_get_all_queues_count( );
601 AIO_UNLOCK;
602 if ( count < 1 ) {
603 return;
604 }
605
606 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_START,
607 (int)p, 0, 0, 0, 0 );
608
609 /*
610 * cancel async IO requests on the todo work queue and wait for those
611 * already active to complete.
612 */
613 error = do_aio_cancel( p, 0, 0, TRUE, TRUE );
614 if ( error == AIO_NOTCANCELED ) {
615 /*
616 * AIO_NOTCANCELED is returned when we find an aio request for this process
617 * on the active async IO queue. Active requests cannot be cancelled so we
618 * must wait for them to complete. We will get a special wake up call on
619 * our channel used to sleep for ALL active requests to complete. This sleep
620 * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
621 * active aio requests.
622 */
623
624 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) | DBG_FUNC_NONE,
625 (int)p, 0, 0, 0, 0 );
626
627 tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_exit", 0 );
628 }
629
630 /* release all aio resources used by this process */
631 AIO_LOCK;
632 entryp = TAILQ_FIRST( &p->aio_doneq );
633 while ( entryp != NULL ) {
634 aio_workq_entry *next_entryp;
635
636 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
637 TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
638 aio_anchor.aio_done_count--;
639 p->aio_done_count--;
640
641 /* we cannot free requests that are still completing */
642 if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
643 vm_map_t my_map;
644
645 my_map = entryp->aio_map;
646 entryp->aio_map = VM_MAP_NULL;
647 AIO_UNLOCK;
648 aio_free_request( entryp, my_map );
649
650 /* need to start over since aio_doneq may have been */
651 /* changed while we were away. */
652 AIO_LOCK;
653 entryp = TAILQ_FIRST( &p->aio_doneq );
654 continue;
655 }
656 else
657 /* tell completion code to free this request */
658 entryp->flags |= AIO_DO_FREE;
659 entryp = next_entryp;
660 }
661 AIO_UNLOCK;
662
663 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END,
664 (int)p, 0, 0, 0, 0 );
665
666 return;
667
668 } /* _aio_exit */
669
670
671 /*
672 * do_aio_cancel - cancel async IO requests (if possible). We get called by
673 * aio_cancel, close, and at exit.
674 * There are three modes of operation: 1) cancel all async IOs for a process
675 * (fd is 0 and aiocbp is NULL); 2) cancel all async IOs for a file descriptor
676 * (fd is > 0 and aiocbp is NULL); 3) cancel the one async IO associated with
677 * the given aiocbp (summarized in the table below).
678 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
679 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
680 * target async IO requests, and AIO_ALLDONE if all target async IO requests
681 * were already complete.
682 * WARNING - do not dereference aiocbp in this routine; it may point to user
683 * land data that has not been copied in (when called from aio_cancel())
684 */
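/*
 * Summary of the three modes described above:
 *
 *	fd	aiocbp			scope of the cancel
 *	0	USER_ADDR_NULL		every async IO owned by the process
 *	> 0	USER_ADDR_NULL		every async IO on that file descriptor
 *	any	non-NULL		only the request whose uaiocbp matches aiocbp
 */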
685
686 static int
687 do_aio_cancel( struct proc *p, int fd, user_addr_t aiocbp,
688 boolean_t wait_for_completion, boolean_t disable_notification )
689 {
690 aio_workq_entry *entryp;
691 int result;
692
693 result = -1;
694
695 /* look for a match on our queue of async todo work. */
696 AIO_LOCK;
697 entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
698 while ( entryp != NULL ) {
699 aio_workq_entry *next_entryp;
700
701 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
702 if ( p == entryp->procp ) {
703 if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
704 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
705 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
706 /* we found a match so we remove the entry from the */
707 /* todo work queue and place it on the done queue */
708 TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
709 aio_anchor.aio_async_workq_count--;
710 entryp->errorval = ECANCELED;
711 entryp->returnval = -1;
712 if ( disable_notification )
713 entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
714 result = AIO_CANCELED;
715
716 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) | DBG_FUNC_NONE,
717 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
718
719 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
720 aio_anchor.aio_done_count++;
721 p->aio_done_count++;
722 entryp->flags |= AIO_COMPLETION;
723 AIO_UNLOCK;
724
725 /* do completion processing for this request */
726 do_aio_completion( entryp );
727
728 AIO_LOCK;
729 entryp->flags &= ~AIO_COMPLETION;
730 if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
731 vm_map_t my_map;
732
733 my_map = entryp->aio_map;
734 entryp->aio_map = VM_MAP_NULL;
735 AIO_UNLOCK;
736 aio_free_request( entryp, my_map );
737 }
738 else
739 AIO_UNLOCK;
740
741 if ( aiocbp != USER_ADDR_NULL ) {
742 return( result );
743 }
744
745 /* need to start over since aio_async_workq may have been */
746 /* changed while we were away doing completion processing. */
747 AIO_LOCK;
748 entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
749 continue;
750 }
751 }
752 entryp = next_entryp;
753 } /* while... */
754
755 /*
756 * look for a match on our queue of synchronous todo work. This will
757 * be a rare occurrence but could happen if a process is terminated while
758 * processing a lio_listio call.
759 */
760 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
761 while ( entryp != NULL ) {
762 aio_workq_entry *next_entryp;
763
764 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
765 if ( p == entryp->procp ) {
766 if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
767 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
768 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
769 /* we found a match so we remove the entry from the */
770 /* todo work queue and place it on the done queue */
771 TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
772 aio_anchor.lio_sync_workq_count--;
773 entryp->errorval = ECANCELED;
774 entryp->returnval = -1;
775 if ( disable_notification )
776 entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
777 result = AIO_CANCELED;
778
779 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_sync_workq)) | DBG_FUNC_NONE,
780 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
781
782 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
783 aio_anchor.aio_done_count++;
784 p->aio_done_count++;
785 if ( aiocbp != USER_ADDR_NULL ) {
786 AIO_UNLOCK;
787 return( result );
788 }
789 }
790 }
791 entryp = next_entryp;
792 } /* while... */
793
794 /*
795 * look for a match on our queue of active async IO requests and
796 * return AIO_NOTCANCELED result.
797 */
798 TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
799 if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
800 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
801 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
802 result = AIO_NOTCANCELED;
803
804 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE,
805 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
806
807 if ( wait_for_completion )
808 entryp->flags |= AIO_WAITING; /* flag for special completion processing */
809 if ( disable_notification )
810 entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
811 if ( aiocbp != USER_ADDR_NULL ) {
812 AIO_UNLOCK;
813 return( result );
814 }
815 }
816 }
817
818 /*
819 * if we didn't find any matches on the todo or active queues then look for a
820 * match on our queue of async IO requests that have completed and if found
821 * return AIO_ALLDONE result.
822 */
823 if ( result == -1 ) {
824 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
825 if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
826 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
827 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
828 result = AIO_ALLDONE;
829
830 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE,
831 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
832
833 if ( aiocbp != USER_ADDR_NULL ) {
834 AIO_UNLOCK;
835 return( result );
836 }
837 }
838 }
839 }
840 AIO_UNLOCK;
841
842 return( result );
843
844 } /* do_aio_cancel */
845
846
847 /*
848 * aio_suspend - suspend the calling thread until at least one of the async
849 * IO operations referenced by uap->aiocblist has completed, until a signal
850 * interrupts the function, or the optional uap->timeoutp time interval has
851 * passed.
852 * Returns 0 if one or more async IOs have completed, else -1 with errno
853 * set appropriately: EAGAIN if the timeout elapses or EINTR if a signal
854 * woke us up.
855 */
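/*
 * Illustrative user-land sketch (not part of this file): waiting up to one
 * second for either of two outstanding requests.  The aiocb names "cb1" and
 * "cb2" are assumed for the example.
 *
 *	const struct aiocb *list[2] = { &cb1, &cb2 };
 *	struct timespec ts = { 1, 0 };          // 1 second, 0 nanoseconds
 *	if ( aio_suspend( list, 2, &ts ) == 0 ) {
 *	        // at least one listed request is done; find it with aio_error()/aio_return()
 *	}
 *	else if ( errno == EAGAIN ) {
 *	        // the timeout expired before anything completed
 *	}
 *	else if ( errno == EINTR ) {
 *	        // a signal woke us up first
 *	}
 */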
856
857 int
858 aio_suspend( struct proc *p, struct aio_suspend_args *uap, int *retval )
859 {
860 int error;
861 int i, count;
862 uint64_t abstime;
863 struct user_timespec ts;
864 aio_workq_entry *entryp;
865 user_addr_t *aiocbpp;
866
867 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START,
868 (int)p, uap->nent, 0, 0, 0 );
869
870 *retval = -1;
871 abstime = 0;
872 aiocbpp = NULL;
873
874 /* quick check to see if there are any async IO requests queued up */
875 AIO_LOCK;
876 count = aio_get_all_queues_count( );
877 AIO_UNLOCK;
878 if ( count < 1 ) {
879 error = EINVAL;
880 goto ExitThisRoutine;
881 }
882
883 if ( uap->nent < 1 || uap->nent > aio_max_requests_per_process ) {
884 error = EINVAL;
885 goto ExitThisRoutine;
886 }
887
888 if ( uap->timeoutp != USER_ADDR_NULL ) {
889 if ( proc_is64bit(p) ) {
890 error = copyin( uap->timeoutp, &ts, sizeof(ts) );
891 }
892 else {
893 struct timespec temp;
894 error = copyin( uap->timeoutp, &temp, sizeof(temp) );
895 if ( error == 0 ) {
896 ts.tv_sec = temp.tv_sec;
897 ts.tv_nsec = temp.tv_nsec;
898 }
899 }
900 if ( error != 0 ) {
901 error = EAGAIN;
902 goto ExitThisRoutine;
903 }
904
905 if ( ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000 ) {
906 error = EINVAL;
907 goto ExitThisRoutine;
908 }
909
910 nanoseconds_to_absolutetime( (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
911 &abstime );
912 clock_absolutetime_interval_to_deadline( abstime, &abstime );
913 }
914
915 /* we reserve enough space for largest possible pointer size */
916 MALLOC( aiocbpp, user_addr_t *, (uap->nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK );
917 if ( aiocbpp == NULL ) {
918 error = EAGAIN;
919 goto ExitThisRoutine;
920 }
921
922 /* copyin our aiocb pointers from list */
923 error = copyin( uap->aiocblist, aiocbpp,
924 proc_is64bit(p) ? (uap->nent * sizeof(user_addr_t))
925 : (uap->nent * sizeof(uintptr_t)) );
926 if ( error != 0 ) {
927 error = EAGAIN;
928 goto ExitThisRoutine;
929 }
930
931 /* we depend on a list of user_addr_t's so we need to munge and expand */
932 /* when these pointers came from a 32-bit process */
933 if ( !proc_is64bit(p) && sizeof(uintptr_t) < sizeof(user_addr_t) ) {
934 /* position to the last entry and work back from there */
935 uintptr_t *my_ptrp = ((uintptr_t *)aiocbpp) + (uap->nent - 1);
936 user_addr_t *my_addrp = aiocbpp + (uap->nent - 1);
937 for (i = 0; i < uap->nent; i++, my_ptrp--, my_addrp--) {
938 *my_addrp = (user_addr_t) (*my_ptrp);
939 }
940 }
941
942 /* check list of aio requests to see if any have completed */
943 AIO_LOCK;
944 for ( i = 0; i < uap->nent; i++ ) {
945 user_addr_t aiocbp;
946
947 /* NULL elements are legal so check for 'em */
948 aiocbp = *(aiocbpp + i);
949 if ( aiocbp == USER_ADDR_NULL )
950 continue;
951
952 /* return immediately if any aio request in the list is done */
953 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
954 if ( entryp->uaiocbp == aiocbp ) {
955 *retval = 0;
956 error = 0;
957 AIO_UNLOCK;
958 goto ExitThisRoutine;
959 }
960 }
961 } /* for ( ; i < uap->nent; ) */
962
963 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE,
964 (int)p, uap->nent, 0, 0, 0 );
965
966 /*
967 * wait for an async IO to complete or a signal fires or timeout expires.
968 * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
969 * interrupts us. If an async IO completes before a signal fires or our
970 * timeout expires, we get a wakeup call from aio_work_thread().
971 */
972 assert_wait_deadline( (event_t) &p->AIO_SUSPEND_SLEEP_CHAN, THREAD_ABORTSAFE, abstime );
973 AIO_UNLOCK;
974
975 error = thread_block( THREAD_CONTINUE_NULL );
976
977 if ( error == THREAD_AWAKENED ) {
978 /* got our wakeup call from aio_work_thread() */
979 *retval = 0;
980 error = 0;
981 }
982 else if ( error == THREAD_TIMED_OUT ) {
983 /* our timeout expired */
984 error = EAGAIN;
985 }
986 else {
987 /* we were interrupted */
988 error = EINTR;
989 }
990
991 ExitThisRoutine:
992 if ( aiocbpp != NULL )
993 FREE( aiocbpp, M_TEMP );
994
995 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_END,
996 (int)p, uap->nent, error, 0, 0 );
997
998 return( error );
999
1000 } /* aio_suspend */
1001
1002
1003 /* aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
1004 * file descriptor (uap->aiocbp->aio_fildes) from the buffer
1005 * (uap->aiocbp->aio_buf).
1006 */
1007
1008 int
1009 aio_write( struct proc *p, struct aio_write_args *uap, int *retval )
1010 {
1011 int error;
1012
1013 *retval = 0;
1014
1015 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_START,
1016 (int)p, (int)uap->aiocbp, 0, 0, 0 );
1017
1018 error = aio_queue_async_request( p, uap->aiocbp, AIO_WRITE );
1019 if ( error != 0 )
1020 *retval = -1;
1021
1022 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_END,
1023 (int)p, (int)uap->aiocbp, error, 0, 0 );
1024
1025 return( error );
1026
1027 } /* aio_write */
1028
1029
1030 /*
1031 * lio_listio - initiate a list of IO requests. We process the list of aiocbs
1032 * either synchronously (mode == LIO_WAIT) or asynchronously (mode == LIO_NOWAIT).
1033 * The caller gets error and return status for each aiocb in the list via aio_error
1034 * and aio_return. We must keep completed requests until released by the
1035 * aio_return call.
1036 */
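/*
 * Illustrative user-land sketch (not part of this file): submitting one read
 * and one write as a single synchronous batch.  The aiocb names "rd" and "wr"
 * are assumed for the example; each entry carries its own aio_lio_opcode.
 *
 *	rd.aio_lio_opcode = LIO_READ;
 *	wr.aio_lio_opcode = LIO_WRITE;
 *	struct aiocb *list[2] = { &rd, &wr };
 *	if ( lio_listio( LIO_WAIT, list, 2, NULL ) == 0 ) {
 *	        // both requests have finished; per-request status still comes from
 *	        // aio_error()/aio_return()
 *	}
 *	// with LIO_NOWAIT the call returns as soon as the requests are queued and
 *	// the optional sigevent (last argument) fires after the whole group completes
 */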
1037
1038 int
1039 lio_listio( struct proc *p, struct lio_listio_args *uap, int *retval )
1040 {
1041 int i;
1042 int call_result;
1043 int result;
1044 long group_tag;
1045 aio_workq_entry * *entryp_listp;
1046 user_addr_t *aiocbpp;
1047
1048 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START,
1049 (int)p, uap->nent, uap->mode, 0, 0 );
1050
1051 entryp_listp = NULL;
1052 aiocbpp = NULL;
1053 call_result = -1;
1054 *retval = -1;
1055 if ( !(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT) ) {
1056 call_result = EINVAL;
1057 goto ExitRoutine;
1058 }
1059
1060 if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
1061 call_result = EINVAL;
1062 goto ExitRoutine;
1063 }
1064
1065 /*
1066 * we use group_tag to mark IO requests for delayed completion processing
1067 * which means we wait until all IO requests in the group have completed
1068 * before we either return to the caller when mode is LIO_WAIT or signal
1069 * user when mode is LIO_NOWAIT.
1070 */
1071 group_tag = random();
1072
1073 /*
1074 * allocate a list of aio_workq_entry pointers that we will use to queue
1075 * up all our requests at once while holding our lock.
1076 */
1077 MALLOC( entryp_listp, void *, (uap->nent * sizeof(aio_workq_entry *)), M_TEMP, M_WAITOK );
1078 if ( entryp_listp == NULL ) {
1079 call_result = EAGAIN;
1080 goto ExitRoutine;
1081 }
1082
1083 /* we reserve enough space for largest possible pointer size */
1084 MALLOC( aiocbpp, user_addr_t *, (uap->nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK );
1085 if ( aiocbpp == NULL ) {
1086 call_result = EAGAIN;
1087 goto ExitRoutine;
1088 }
1089
1090 /* copyin our aiocb pointers from list */
1091 result = copyin( uap->aiocblist, aiocbpp,
1092 IS_64BIT_PROCESS(p) ? (uap->nent * sizeof(user_addr_t))
1093 : (uap->nent * sizeof(uintptr_t)) );
1094 if ( result != 0 ) {
1095 call_result = EAGAIN;
1096 goto ExitRoutine;
1097 }
1098
1099 /* we depend on a list of user_addr_t's so we need to munge and expand */
1100 /* when these pointers came from a 32-bit process */
1101 if ( !IS_64BIT_PROCESS(p) && sizeof(uintptr_t) < sizeof(user_addr_t) ) {
1102 /* position to the last entry and work back from there */
1103 uintptr_t *my_ptrp = ((uintptr_t *)aiocbpp) + (uap->nent - 1);
1104 user_addr_t *my_addrp = aiocbpp + (uap->nent - 1);
1105 for (i = 0; i < uap->nent; i++, my_ptrp--, my_addrp--) {
1106 *my_addrp = (user_addr_t) (*my_ptrp);
1107 }
1108 }
1109
1110 /* process list of aio requests */
1111 for ( i = 0; i < uap->nent; i++ ) {
1112 user_addr_t my_aiocbp;
1113
1114 *(entryp_listp + i) = NULL;
1115 my_aiocbp = *(aiocbpp + i);
1116
1117 /* NULL elements are legal so check for 'em */
1118 if ( my_aiocbp == USER_ADDR_NULL )
1119 continue;
1120
1121 if ( uap->mode == LIO_NOWAIT )
1122 result = lio_create_async_entry( p, my_aiocbp, uap->sigp,
1123 group_tag, (entryp_listp + i) );
1124 else
1125 result = lio_create_sync_entry( p, my_aiocbp, group_tag,
1126 (entryp_listp + i) );
1127
1128 if ( result != 0 && call_result == -1 )
1129 call_result = result;
1130 }
1131
1132 /*
1133 * we need to protect this section since we do not want any of these grouped
1134 * IO requests to begin until we have them all on the queue.
1135 */
1136 AIO_LOCK;
1137 for ( i = 0; i < uap->nent; i++ ) {
1138 aio_workq_entry *entryp;
1139
1140 /* NULL elements are legal so check for 'em */
1141 entryp = *(entryp_listp + i);
1142 if ( entryp == NULL )
1143 continue;
1144
1145 /* check our aio limits to throttle bad or rude user land behavior */
1146 if ( aio_get_all_queues_count( ) >= aio_max_requests ||
1147 aio_get_process_count( entryp->procp ) >= aio_max_requests_per_process ||
1148 is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
1149 vm_map_t my_map;
1150
1151 my_map = entryp->aio_map;
1152 entryp->aio_map = VM_MAP_NULL;
1153 if ( call_result == -1 )
1154 call_result = EAGAIN;
1155 AIO_UNLOCK;
1156 aio_free_request( entryp, my_map );
1157 AIO_LOCK;
1158 continue;
1159 }
1160
1161 /* place the request on the appropriate queue */
1162 if ( uap->mode == LIO_NOWAIT ) {
1163 TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1164 aio_anchor.aio_async_workq_count++;
1165
1166 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
1167 (int)p, (int)entryp->uaiocbp, 0, 0, 0 );
1168 }
1169 else {
1170 TAILQ_INSERT_TAIL( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
1171 aio_anchor.lio_sync_workq_count++;
1172 }
1173 }
1174
1175 if ( uap->mode == LIO_NOWAIT ) {
1176 /* caller does not want to wait so we'll fire off a worker thread and return */
1177 wakeup_one( (caddr_t) &aio_anchor.aio_async_workq );
1178 }
1179 else {
1180 aio_workq_entry *entryp;
1181 int error;
1182
1183 /*
1184 * mode is LIO_WAIT - handle the IO requests now.
1185 */
1186 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
1187 while ( entryp != NULL ) {
1188 if ( p == entryp->procp && group_tag == entryp->group_tag ) {
1189
1190 TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
1191 aio_anchor.lio_sync_workq_count--;
1192 AIO_UNLOCK;
1193
1194 if ( (entryp->flags & AIO_READ) != 0 ) {
1195 error = do_aio_read( entryp );
1196 }
1197 else if ( (entryp->flags & AIO_WRITE) != 0 ) {
1198 error = do_aio_write( entryp );
1199 }
1200 else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1201 error = do_aio_fsync( entryp );
1202 }
1203 else {
1204 printf( "%s - unknown aio request - flags 0x%02X \n",
1205 __FUNCTION__, entryp->flags );
1206 error = EINVAL;
1207 }
1208 entryp->errorval = error;
1209 if ( error != 0 && call_result == -1 )
1210 call_result = EIO;
1211
1212 AIO_LOCK;
1213 /* we're done with the IO request so move it on the done queue */
1214 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
1215 aio_anchor.aio_done_count++;
1216 p->aio_done_count++;
1217
1218 /* need to start over since lio_sync_workq may have been changed while we */
1219 /* were away doing the IO. */
1220 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
1221 continue;
1222 } /* p == entryp->procp */
1223
1224 entryp = TAILQ_NEXT( entryp, aio_workq_link );
1225 } /* while ( entryp != NULL ) */
1226 } /* uap->mode == LIO_WAIT */
1227 AIO_UNLOCK;
1228
1229 /* call_result == -1 means we had no trouble queueing up requests */
1230 if ( call_result == -1 ) {
1231 call_result = 0;
1232 *retval = 0;
1233 }
1234
1235 ExitRoutine:
1236 if ( entryp_listp != NULL )
1237 FREE( entryp_listp, M_TEMP );
1238 if ( aiocbpp != NULL )
1239 FREE( aiocbpp, M_TEMP );
1240
1241 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END,
1242 (int)p, call_result, 0, 0, 0 );
1243
1244 return( call_result );
1245
1246 } /* lio_listio */
1247
1248
1249 /*
1250 * aio worker thread. this is where all the real work gets done.
1251 * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
1252 * after new work is queued up.
1253 */
1254
1255 static void
1256 aio_work_thread( void )
1257 {
1258 aio_workq_entry *entryp;
1259
1260 for( ;; ) {
1261 AIO_LOCK;
1262 entryp = aio_get_some_work();
1263 if ( entryp == NULL ) {
1264 /*
1265 * aio worker threads wait for some work to get queued up
1266 * by aio_queue_async_request. Once some work gets queued
1267 * it will wake up one of these worker threads just before
1268 * returning to our caller in user land.
1269 */
1270 assert_wait( (event_t) &aio_anchor.aio_async_workq, THREAD_UNINT );
1271 AIO_UNLOCK;
1272
1273 thread_block( (thread_continue_t)aio_work_thread );
1274 /* NOT REACHED */
1275 }
1276 else {
1277 int error;
1278 vm_map_t currentmap;
1279 vm_map_t oldmap = VM_MAP_NULL;
1280 task_t oldaiotask = TASK_NULL;
1281 struct uthread *uthreadp = NULL;
1282
1283 AIO_UNLOCK;
1284
1285 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START,
1286 (int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0 );
1287
1288 /*
1289 * Assume the target's address space identity for the duration
1290 * of the IO.
1291 */
1292 currentmap = get_task_map( (current_proc())->task );
1293 if ( currentmap != entryp->aio_map ) {
1294 uthreadp = (struct uthread *) get_bsdthread_info(current_thread());
1295 oldaiotask = uthreadp->uu_aio_task;
1296 uthreadp->uu_aio_task = entryp->procp->task;
1297 oldmap = vm_map_switch( entryp->aio_map );
1298 }
1299
1300 if ( (entryp->flags & AIO_READ) != 0 ) {
1301 error = do_aio_read( entryp );
1302 }
1303 else if ( (entryp->flags & AIO_WRITE) != 0 ) {
1304 error = do_aio_write( entryp );
1305 }
1306 else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1307 error = do_aio_fsync( entryp );
1308 }
1309 else {
1310 printf( "%s - unknown aio request - flags 0x%02X \n",
1311 __FUNCTION__, entryp->flags );
1312 error = EINVAL;
1313 }
1314 entryp->errorval = error;
1315 if ( currentmap != entryp->aio_map ) {
1316 (void) vm_map_switch( oldmap );
1317 uthreadp->uu_aio_task = oldaiotask;
1318 }
1319
1320 /* we're done with the IO request so pop it off the active queue and */
1321 /* push it on the done queue */
1322 AIO_LOCK;
1323 TAILQ_REMOVE( &entryp->procp->aio_activeq, entryp, aio_workq_link );
1324 aio_anchor.aio_active_count--;
1325 entryp->procp->aio_active_count--;
1326 TAILQ_INSERT_TAIL( &entryp->procp->aio_doneq, entryp, aio_workq_link );
1327 aio_anchor.aio_done_count++;
1328 entryp->procp->aio_done_count++;
1329 entryp->flags |= AIO_COMPLETION;
1330
1331 /* remove our reference to the user land map. */
1332 if ( VM_MAP_NULL != entryp->aio_map ) {
1333 vm_map_t my_map;
1334
1335 my_map = entryp->aio_map;
1336 entryp->aio_map = VM_MAP_NULL;
1337 AIO_UNLOCK; /* must unlock before calling vm_map_deallocate() */
1338 vm_map_deallocate( my_map );
1339 }
1340 else {
1341 AIO_UNLOCK;
1342 }
1343
1344 do_aio_completion( entryp );
1345
1346 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END,
1347 (int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval,
1348 entryp->returnval, 0 );
1349
1350 AIO_LOCK;
1351 entryp->flags &= ~AIO_COMPLETION;
1352 if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
1353 vm_map_t my_map;
1354
1355 my_map = entryp->aio_map;
1356 entryp->aio_map = VM_MAP_NULL;
1357 AIO_UNLOCK;
1358 aio_free_request( entryp, my_map );
1359 }
1360 else
1361 AIO_UNLOCK;
1362 }
1363 } /* for ( ;; ) */
1364
1365 /* NOT REACHED */
1366
1367 } /* aio_work_thread */
1368
1369
1370 /*
1371 * aio_get_some_work - get the next async IO request that is ready to be executed.
1372 * aio_fsync complicates matters a bit since we cannot do the fsync until all async
1373 * IO requests that were queued at the time the aio_fsync call came in have completed.
1374 * NOTE - AIO_LOCK must be held by caller
1375 */
1376
1377 static aio_workq_entry *
1378 aio_get_some_work( void )
1379 {
1380 aio_workq_entry *entryp;
1381
1382 /* pop some work off the work queue and add to our active queue */
1383 for ( entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
1384 entryp != NULL;
1385 entryp = TAILQ_NEXT( entryp, aio_workq_link ) ) {
1386
1387 if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1388 /* leave aio_fsync calls on the work queue if there are IO */
1389 /* requests on the active queue for the same file descriptor. */
1390 if ( aio_delay_fsync_request( entryp ) ) {
1391
1392 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) | DBG_FUNC_NONE,
1393 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1394 continue;
1395 }
1396 }
1397 break;
1398 }
1399
1400 if ( entryp != NULL ) {
1401 TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1402 aio_anchor.aio_async_workq_count--;
1403 TAILQ_INSERT_TAIL( &entryp->procp->aio_activeq, entryp, aio_workq_link );
1404 aio_anchor.aio_active_count++;
1405 entryp->procp->aio_active_count++;
1406 }
1407
1408 return( entryp );
1409
1410 } /* aio_get_some_work */
1411
1412
1413 /*
1414 * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed at
1415 * this time. Delay will happen when there are any active IOs for the same file
1416 * descriptor that were queued at the time the aio_fsync call was queued.
1417 * NOTE - AIO_LOCK must be held by caller
1418 */
1419 static boolean_t
1420 aio_delay_fsync_request( aio_workq_entry *entryp )
1421 {
1422 aio_workq_entry *my_entryp;
1423
1424 TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1425 if ( my_entryp->fsyncp != USER_ADDR_NULL &&
1426 entryp->uaiocbp == my_entryp->fsyncp &&
1427 entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1428 return( TRUE );
1429 }
1430 }
1431
1432 return( FALSE );
1433
1434 } /* aio_delay_fsync_request */
1435
1436
1437 /*
1438 * aio_queue_async_request - queue up an async IO request on our work queue then
1439 * wake up one of our worker threads to do the actual work. We get a reference
1440 * to our caller's user land map in order to keep it around while we are
1441 * processing the request.
1442 */
1443
1444 static int
1445 aio_queue_async_request( struct proc *procp, user_addr_t aiocbp, int kindOfIO )
1446 {
1447 aio_workq_entry *entryp;
1448 int result;
1449
1450 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1451 if ( entryp == NULL ) {
1452 result = EAGAIN;
1453 goto error_exit;
1454 }
1455 bzero( entryp, sizeof(*entryp) );
1456
1457 /* fill in the rest of the aio_workq_entry */
1458 entryp->procp = procp;
1459 entryp->uaiocbp = aiocbp;
1460 entryp->flags |= kindOfIO;
1461 entryp->aio_map = VM_MAP_NULL;
1462
1463 if ( !IS_64BIT_PROCESS(procp) ) {
1464 struct aiocb aiocb32;
1465
1466 result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
1467 if ( result == 0 )
1468 do_munge_aiocb( &aiocb32, &entryp->aiocb );
1469 } else
1470 result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1471
1472 if ( result != 0 ) {
1473 result = EAGAIN;
1474 goto error_exit;
1475 }
1476
1477 /* do some more validation on the aiocb and embedded file descriptor */
1478 result = aio_validate( entryp );
1479 if ( result != 0 )
1480 goto error_exit;
1481
1482 /* get a reference to the user land map in order to keep it around */
1483 entryp->aio_map = get_task_map( procp->task );
1484 vm_map_reference( entryp->aio_map );
1485
1486 AIO_LOCK;
1487
1488 if ( is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
1489 AIO_UNLOCK;
1490 result = EAGAIN;
1491 goto error_exit;
1492 }
1493
1494 /* check our aio limits to throttle bad or rude user land behavior */
1495 if ( aio_get_all_queues_count( ) >= aio_max_requests ||
1496 aio_get_process_count( procp ) >= aio_max_requests_per_process ) {
1497 AIO_UNLOCK;
1498 result = EAGAIN;
1499 goto error_exit;
1500 }
1501
1502 /*
1503 * aio_fsync calls sync up all async IO requests queued at the time
1504 * the aio_fsync call was made. So we mark each currently queued async
1505 * IO with a matching file descriptor as one that must complete before we do
1506 * the fsync. We set the fsyncp field of each matching async IO
1507 * request with the aiocb pointer passed in on the aio_fsync call to
1508 * know which IOs must complete before we process the aio_fsync call.
1509 */
1510 if ( (kindOfIO & AIO_FSYNC) != 0 )
1511 aio_mark_requests( entryp );
1512
1513 /* queue up on our aio asynchronous work queue */
1514 TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1515 aio_anchor.aio_async_workq_count++;
1516
1517 wakeup_one( (caddr_t) &aio_anchor.aio_async_workq );
1518 AIO_UNLOCK;
1519
1520 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
1521 (int)procp, (int)aiocbp, 0, 0, 0 );
1522
1523 return( 0 );
1524
1525 error_exit:
1526 if ( entryp != NULL ) {
1527 /* this entry has not been queued up so no worries about unlocked */
1528 /* state and aio_map */
1529 aio_free_request( entryp, entryp->aio_map );
1530 }
1531
1532 return( result );
1533
1534 } /* aio_queue_async_request */
1535
1536
1537 /*
1538 * lio_create_async_entry - allocate an aio_workq_entry and fill it in.
1539 * If all goes well return 0 and pass the aio_workq_entry pointer back to
1540 * our caller. We get a reference to our caller's user land map in order to keep
1541 * it around while we are processing the request.
1542 * lio_listio calls behave differently at completion: they do completion notification
1543 * only when all async IO requests in the group have completed. We use group_tag to
1544 * tag IO requests that behave in this delayed notification manner.
1545 */
1546
1547 static int
1548 lio_create_async_entry( struct proc *procp, user_addr_t aiocbp,
1549 user_addr_t sigp, long group_tag,
1550 aio_workq_entry **entrypp )
1551 {
1552 aio_workq_entry *entryp;
1553 int result;
1554
1555 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1556 if ( entryp == NULL ) {
1557 result = EAGAIN;
1558 goto error_exit;
1559 }
1560 bzero( entryp, sizeof(*entryp) );
1561
1562 /* fill in the rest of the aio_workq_entry */
1563 entryp->procp = procp;
1564 entryp->uaiocbp = aiocbp;
1565 entryp->flags |= AIO_LIO;
1566 entryp->group_tag = group_tag;
1567 entryp->aio_map = VM_MAP_NULL;
1568
1569 if ( !IS_64BIT_PROCESS(procp) ) {
1570 struct aiocb aiocb32;
1571
1572 result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
1573 if ( result == 0 )
1574 do_munge_aiocb( &aiocb32, &entryp->aiocb );
1575 } else
1576 result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1577
1578 if ( result != 0 ) {
1579 result = EAGAIN;
1580 goto error_exit;
1581 }
1582
1583 /* look for lio_listio LIO_NOP requests and ignore them. */
1584 /* Not really an error, but we need to free our aio_workq_entry. */
1585 if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
1586 result = 0;
1587 goto error_exit;
1588 }
1589
1590 /* use sigevent passed in to lio_listio for each of our calls, but only */
1591 /* do completion notification after the last request completes. */
1592 if ( sigp != USER_ADDR_NULL ) {
1593 if ( !IS_64BIT_PROCESS(procp) ) {
1594 struct sigevent sigevent32;
1595
1596 result = copyin( sigp, &sigevent32, sizeof(sigevent32) );
1597 if ( result == 0 ) {
1598 /* also need to munge aio_sigevent since it contains pointers */
1599 /* special case here. since we do not know if sigev_value is an */
1600 /* int or a ptr we do NOT cast the ptr to a user_addr_t. This */
1601 /* means if we send this info back to user space we need to remember */
1602 /* sigev_value was not expanded for the 32-bit case. */
1603 /* NOTE - this does NOT affect us since we don't support sigev_value */
1604 /* yet in the aio context. */
1605 //LP64
1606 entryp->aiocb.aio_sigevent.sigev_notify = sigevent32.sigev_notify;
1607 entryp->aiocb.aio_sigevent.sigev_signo = sigevent32.sigev_signo;
1608 entryp->aiocb.aio_sigevent.sigev_value.size_equivalent.sival_int =
1609 sigevent32.sigev_value.sival_int;
1610 entryp->aiocb.aio_sigevent.sigev_notify_function =
1611 CAST_USER_ADDR_T(sigevent32.sigev_notify_function);
1612 entryp->aiocb.aio_sigevent.sigev_notify_attributes =
1613 CAST_USER_ADDR_T(sigevent32.sigev_notify_attributes);
1614 }
1615 } else
1616 result = copyin( sigp, &entryp->aiocb.aio_sigevent, sizeof(entryp->aiocb.aio_sigevent) );
1617
1618 if ( result != 0 ) {
1619 result = EAGAIN;
1620 goto error_exit;
1621 }
1622 }
1623
1624 /* do some more validation on the aiocb and embedded file descriptor */
1625 result = aio_validate( entryp );
1626 if ( result != 0 )
1627 goto error_exit;
1628
1629 /* get a reference to the user land map in order to keep it around */
1630 entryp->aio_map = get_task_map( procp->task );
1631 vm_map_reference( entryp->aio_map );
1632
1633 *entrypp = entryp;
1634 return( 0 );
1635
1636 error_exit:
1637 if ( entryp != NULL )
1638 zfree( aio_workq_zonep, entryp );
1639
1640 return( result );
1641
1642 } /* lio_create_async_entry */
1643
1644
1645 /*
1646 * aio_mark_requests - aio_fsync calls synchronize file data for all queued async IO
1647 * requests at the moment the aio_fsync call is queued. We use aio_workq_entry.fsyncp
1648 * to mark each async IO that must complete before the fsync is done. We use the uaiocbp
1649 * field from the aio_fsync call as the aio_workq_entry.fsyncp in marked requests.
1650 * NOTE - AIO_LOCK must be held by caller
1651 */
1652
1653 static void
1654 aio_mark_requests( aio_workq_entry *entryp )
1655 {
1656 aio_workq_entry *my_entryp;
1657
1658 TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1659 if ( entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1660 my_entryp->fsyncp = entryp->uaiocbp;
1661 }
1662 }
1663
1664 TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
1665 if ( entryp->procp == my_entryp->procp &&
1666 entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1667 my_entryp->fsyncp = entryp->uaiocbp;
1668 }
1669 }
1670
1671 } /* aio_mark_requests */
1672
1673
1674 /*
1675 * lio_create_sync_entry - allocate an aio_workq_entry and fill it in.
1676 * If all goes well return 0 and pass the aio_workq_entry pointer back to
1677 * our caller.
1678 * lio_listio calls behave differently at completion: they do completion notification
1679 * only when all async IO requests in the group have completed. We use group_tag to
1680 * tag IO requests that behave in this delayed notification manner.
1681 */
1682
1683 static int
1684 lio_create_sync_entry( struct proc *procp, user_addr_t aiocbp,
1685 long group_tag, aio_workq_entry **entrypp )
1686 {
1687 aio_workq_entry *entryp;
1688 int result;
1689
1690 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1691 if ( entryp == NULL ) {
1692 result = EAGAIN;
1693 goto error_exit;
1694 }
1695 bzero( entryp, sizeof(*entryp) );
1696
1697 /* fill in the rest of the aio_workq_entry */
1698 entryp->procp = procp;
1699 entryp->uaiocbp = aiocbp;
1700 entryp->flags |= AIO_LIO;
1701 entryp->group_tag = group_tag;
1702 entryp->aio_map = VM_MAP_NULL;
1703
1704 if ( !IS_64BIT_PROCESS(procp) ) {
1705 struct aiocb aiocb32;
1706
1707 result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
1708 if ( result == 0 )
1709 do_munge_aiocb( &aiocb32, &entryp->aiocb );
1710 } else
1711 result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1712
1713 if ( result != 0 ) {
1714 result = EAGAIN;
1715 goto error_exit;
1716 }
1717
1718 /* look for lio_listio LIO_NOP requests and ignore them. */
1719 /* Not really an error, but we need to free our aio_workq_entry. */
1720 if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
1721 result = 0;
1722 goto error_exit;
1723 }
1724
1725 result = aio_validate( entryp );
1726 if ( result != 0 ) {
1727 goto error_exit;
1728 }
1729
1730 *entrypp = entryp;
1731 return( 0 );
1732
1733 error_exit:
1734 if ( entryp != NULL )
1735 zfree( aio_workq_zonep, entryp );
1736
1737 return( result );
1738
1739 } /* lio_create_sync_entry */
1740
1741
1742 /*
1743 * aio_free_request - remove our reference on the user land map and
1744 * free the work queue entry resources.
1745 * We are not holding the lock here, thus aio_map is passed in after having
1746 * been zeroed while we did hold the lock.
1747 */
1748
1749 static int
1750 aio_free_request( aio_workq_entry *entryp, vm_map_t the_map )
1751 {
1752 /* remove our reference to the user land map. */
1753 if ( VM_MAP_NULL != the_map ) {
1754 vm_map_deallocate( the_map );
1755 }
1756
1757 zfree( aio_workq_zonep, entryp );
1758
1759 return( 0 );
1760
1761 } /* aio_free_request */
1762
1763
1764 /* aio_validate - validate the aiocb passed in by one of the aio syscalls.
1765 */
1766
1767 static int
1768 aio_validate( aio_workq_entry *entryp )
1769 {
1770 struct fileproc *fp;
1771 int flag;
1772 int result;
1773
1774 result = 0;
1775
1776 if ( (entryp->flags & AIO_LIO) != 0 ) {
1777 if ( entryp->aiocb.aio_lio_opcode == LIO_READ )
1778 entryp->flags |= AIO_READ;
1779 else if ( entryp->aiocb.aio_lio_opcode == LIO_WRITE )
1780 entryp->flags |= AIO_WRITE;
1781 else if ( entryp->aiocb.aio_lio_opcode == LIO_NOP )
1782 return( 0 );
1783 else
1784 return( EINVAL );
1785 }
1786
1787 flag = FREAD;
1788 if ( (entryp->flags & (AIO_WRITE | AIO_FSYNC)) != 0 ) {
1789 flag = FWRITE;
1790 }
1791
1792 if ( (entryp->flags & (AIO_READ | AIO_WRITE)) != 0 ) {
1793 // LP64todo - does max value for aio_nbytes need to grow?
1794 if ( entryp->aiocb.aio_nbytes > INT_MAX ||
1795 entryp->aiocb.aio_buf == USER_ADDR_NULL ||
1796 entryp->aiocb.aio_offset < 0 )
1797 return( EINVAL );
1798 }
1799
1800 /* validate aiocb.aio_sigevent. at this point we only support sigev_notify
1801 * equal to SIGEV_SIGNAL or SIGEV_NONE. this means sigev_value,
1802 * sigev_notify_function, and sigev_notify_attributes are ignored.
1803 */
1804 if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ) {
1805 int signum;
1806 /* make sure we have a valid signal number */
1807 signum = entryp->aiocb.aio_sigevent.sigev_signo;
1808 if ( signum <= 0 || signum >= NSIG ||
1809 signum == SIGKILL || signum == SIGSTOP )
1810 return (EINVAL);
1811 }
1812 else if ( entryp->aiocb.aio_sigevent.sigev_notify != SIGEV_NONE )
1813 return (EINVAL);
1814
1815 /* validate the file descriptor and that the file was opened
1816 * for the appropriate read / write access.
1817 */
1818 proc_fdlock(entryp->procp);
1819
1820 result = fp_lookup( entryp->procp, entryp->aiocb.aio_fildes, &fp , 1);
1821 if ( result == 0 ) {
1822 if ( (fp->f_fglob->fg_flag & flag) == 0 ) {
1823 /* we don't have read or write access */
1824 result = EBADF;
1825 }
1826 else if ( fp->f_fglob->fg_type != DTYPE_VNODE ) {
1827 /* this is not a file */
1828 result = ESPIPE;
1829 } else
1830 fp->f_flags |= FP_AIOISSUED;
1831
1832 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp , 1);
1833 }
1834 else {
1835 result = EBADF;
1836 }
1837
1838 proc_fdunlock(entryp->procp);
1839
1840 return( result );
1841
1842 } /* aio_validate */
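
/*
 * Illustrative sketch (user-space, not built with the kernel): an aiocb filled
 * in so that it satisfies the checks in aio_validate() above -- descriptor
 * opened with matching access, aio_nbytes well under INT_MAX, a non-NULL
 * buffer, a non-negative offset, and SIGEV_SIGNAL with a catchable signal.
 * The helper name, buffer and signal choice are assumptions for illustration.
 */
#if 0
#include <aio.h>
#include <signal.h>
#include <string.h>

static char rdbuf[512];

static void
setup_valid_aiocb( struct aiocb *cbp, int fd )
{
	memset( cbp, 0, sizeof(*cbp) );
	cbp->aio_fildes = fd;			/* must be open for reading for aio_read */
	cbp->aio_buf = rdbuf;			/* non-NULL buffer */
	cbp->aio_nbytes = sizeof(rdbuf);	/* well under the INT_MAX limit */
	cbp->aio_offset = 0;			/* must not be negative */

	/* SIGEV_SIGNAL with a catchable signal; SIGKILL / SIGSTOP are rejected */
	cbp->aio_sigevent.sigev_notify = SIGEV_SIGNAL;
	cbp->aio_sigevent.sigev_signo = SIGUSR1;
}
#endif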
1843
1844
1845 /*
1846 * aio_get_process_count - runs through our queues that hold outstanding
1847 * async IO requests and totals up the number of requests for the given
1848 * process.
1849 * NOTE - caller must hold aio lock!
1850 */
1851
1852 static int
1853 aio_get_process_count( struct proc *procp )
1854 {
1855 aio_workq_entry *entryp;
1856 int count;
1857
1858 /* begin with count of completed async IO requests for this process */
1859 count = procp->aio_done_count;
1860
1861 /* add in count of active async IO requests for this process */
1862 count += procp->aio_active_count;
1863
1864 /* look for matches on our queue of asynchronous todo work */
1865 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
1866 if ( procp == entryp->procp ) {
1867 count++;
1868 }
1869 }
1870
1871 /* look for matches on our queue of synchronous todo work */
1872 TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
1873 if ( procp == entryp->procp ) {
1874 count++;
1875 }
1876 }
1877
1878 return( count );
1879
1880 } /* aio_get_process_count */
1881
1882
1883 /*
1884 * aio_get_all_queues_count - get total number of entries on all aio work queues.
1885 * NOTE - caller must hold aio lock!
1886 */
1887
1888 static int
1889 aio_get_all_queues_count( void )
1890 {
1891 int count;
1892
1893 count = aio_anchor.aio_async_workq_count;
1894 count += aio_anchor.lio_sync_workq_count;
1895 count += aio_anchor.aio_active_count;
1896 count += aio_anchor.aio_done_count;
1897
1898 return( count );
1899
1900 } /* aio_get_all_queues_count */
1901
1902
1903 /*
1904 * do_aio_completion. Handle async IO completion.
1905 */
1906
1907 static void
1908 do_aio_completion( aio_workq_entry *entryp )
1909 {
1910 /* signal user land process if appropriate */
1911 if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
1912 (entryp->flags & AIO_DISABLE) == 0 ) {
1913
1914 /*
1915 * if group_tag is non zero then make sure this is the last IO request
1916 * in the group before we signal.
1917 */
1918 if ( entryp->group_tag == 0 ||
1919 (entryp->group_tag != 0 && aio_last_group_io( entryp )) ) {
1920 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) | DBG_FUNC_NONE,
1921 (int)entryp->procp, (int)entryp->uaiocbp,
1922 entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 );
1923
1924 psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );
1925 return;
1926 }
1927 }
1928
1929 /*
1930 * need to handle case where a process is trying to exit, exec, or close
1931 * and is currently waiting for active aio requests to complete. If
1932 * AIO_WAITING is set then we need to look to see if there are any
1933 * other requests in the active queue for this process. If there are
1934 * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel. If
1935 * there are some still active then do nothing - we only want to wakeup
1936 * when all active aio requests for the process are complete.
1937 */
1938 if ( (entryp->flags & AIO_WAITING) != 0 ) {
1939 int active_requests;
1940
1941 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
1942 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1943
1944 AIO_LOCK;
1945 active_requests = aio_active_requests_for_process( entryp->procp );
1946 //AIO_UNLOCK;
1947 if ( active_requests < 1 ) {
1948 /* no active aio requests for this process, continue exiting */
1949 wakeup_one( (caddr_t) &entryp->procp->AIO_CLEANUP_SLEEP_CHAN );
1950
1951 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
1952 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1953 }
1954 AIO_UNLOCK;
1955 return;
1956 }
1957
1958 /*
1959 * aio_suspend case when a signal was not requested. In that scenario we
1960 * are sleeping on the AIO_SUSPEND_SLEEP_CHAN channel.
1961 * NOTE - the assumption here is that this wakeup call is inexpensive.
1962 * we really only need to do this when an aio_suspend call is pending.
1963 * If we find the wakeup call should be avoided we could mark the
1964 * async IO requests given in the list provided by aio_suspend and only
1965 * call wakeup for them. If we do mark them we should unmark them after
1966 * the aio_suspend wakes up.
1967 */
1968 AIO_LOCK;
1969 wakeup_one( (caddr_t) &entryp->procp->AIO_SUSPEND_SLEEP_CHAN );
1970 AIO_UNLOCK;
1971
1972 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE,
1973 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1974
1975 return;
1976
1977 } /* do_aio_completion */
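
/*
 * Illustrative sketch (user-space, not built with the kernel): the two
 * completion paths handled above -- SIGEV_SIGNAL delivery via psignal(), and
 * no signal plus aio_suspend(), which is what the wakeup on the suspend sleep
 * channel unblocks.  Descriptor, buffer and signal choices are assumptions.
 */
#if 0
#include <aio.h>
#include <errno.h>
#include <signal.h>
#include <string.h>

static char buf[256];

/* Path 1: request a signal, then wait for it with sigwait(). */
static ssize_t
wait_with_signal( int fd )
{
	struct aiocb	cb;
	sigset_t	set;
	int		sig;

	memset( &cb, 0, sizeof(cb) );
	cb.aio_fildes = fd;
	cb.aio_buf = buf;
	cb.aio_nbytes = sizeof(buf);
	cb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
	cb.aio_sigevent.sigev_signo = SIGUSR1;

	sigemptyset( &set );
	sigaddset( &set, SIGUSR1 );
	sigprocmask( SIG_BLOCK, &set, NULL );	/* block so sigwait() can collect it */

	if ( aio_read( &cb ) != 0 )
		return( -1 );
	sigwait( &set, &sig );			/* delivered when the request completes */
	return( aio_return( &cb ) );
}

/* Path 2: SIGEV_NONE and aio_suspend(); completion wakes the suspend channel. */
static ssize_t
wait_with_suspend( int fd )
{
	struct aiocb		cb;
	const struct aiocb	*list[1];

	memset( &cb, 0, sizeof(cb) );
	cb.aio_fildes = fd;
	cb.aio_buf = buf;
	cb.aio_nbytes = sizeof(buf);
	cb.aio_sigevent.sigev_notify = SIGEV_NONE;

	if ( aio_read( &cb ) != 0 )
		return( -1 );
	list[0] = &cb;
	while ( aio_error( &cb ) == EINPROGRESS )
		(void) aio_suspend( list, 1, NULL );
	return( aio_return( &cb ) );
}
#endif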
1978
1979
1980 /*
1981 * aio_last_group_io - checks to see if this is the last unfinished IO request
1982 * for the given group_tag. Returns TRUE if there are no other active IO
1983 * requests for this group or FALSE if there are other active IO requests.
1984 * NOTE - AIO_LOCK must be held by caller
1985 */
1986
1987 static boolean_t
1988 aio_last_group_io( aio_workq_entry *entryp )
1989 {
1990 aio_workq_entry *my_entryp;
1991
1992 /* look for matches on our queue of active async IO requests */
1993 TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1994 if ( my_entryp->group_tag == entryp->group_tag )
1995 return( FALSE );
1996 }
1997
1998 /* look for matches on our queue of asynchronous todo work */
1999 TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
2000 if ( my_entryp->group_tag == entryp->group_tag )
2001 return( FALSE );
2002 }
2003
2004 /* look for matches on our queue of synchronous todo work */
2005 TAILQ_FOREACH( my_entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
2006 if ( my_entryp->group_tag == entryp->group_tag )
2007 return( FALSE );
2008 }
2009
2010 return( TRUE );
2011
2012 } /* aio_last_group_io */
2013
2014
2015 /*
2016 * do_aio_read
2017 */
2018 static int
2019 do_aio_read( aio_workq_entry *entryp )
2020 {
2021 struct fileproc *fp;
2022 int error;
2023
2024 if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp , 0)) )
2025 return(error);
2026 if ( (fp->f_fglob->fg_flag & FREAD) == 0 ) {
2027 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2028 return(EBADF);
2029 }
2030 if ( fp != NULL ) {
2031 error = dofileread( entryp->procp, fp, entryp->aiocb.aio_fildes,
2032 entryp->aiocb.aio_buf,
2033 entryp->aiocb.aio_nbytes,
2034 entryp->aiocb.aio_offset, FOF_OFFSET,
2035 &entryp->returnval );
2036 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2037 }
2038 else {
2039 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2040 error = EBADF;
2041 }
2042
2043 return( error );
2044
2045 } /* do_aio_read */
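
/*
 * Illustrative sketch (user-space, not built with the kernel): how the aiocb
 * fields consumed by dofileread() above (aio_buf, aio_nbytes, and aio_offset
 * used pread()-style via FOF_OFFSET) are filled in from user space; aio_write
 * is symmetric.  The helper name and busy-wait loop are assumptions for
 * illustration only.
 */
#if 0
#include <aio.h>
#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static ssize_t
read_at_offset( const char *path, void *buf, size_t len, off_t off )
{
	struct aiocb	cb;
	int		fd;

	fd = open( path, O_RDONLY );
	if ( fd < 0 )
		return( -1 );

	memset( &cb, 0, sizeof(cb) );
	cb.aio_fildes = fd;		/* looked up via fp_lookup() in the kernel */
	cb.aio_buf = buf;		/* destination handed to dofileread() */
	cb.aio_nbytes = len;
	cb.aio_offset = off;		/* supplied offset is used for the transfer */

	if ( aio_read( &cb ) != 0 ) {
		close( fd );
		return( -1 );
	}
	while ( aio_error( &cb ) == EINPROGRESS )
		;			/* a real program would use aio_suspend() here */
	close( fd );
	return( aio_return( &cb ) );
}
#endif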
2046
2047
2048 /*
2049 * do_aio_write
2050 */
2051 static int
2052 do_aio_write( aio_workq_entry *entryp )
2053 {
2054 struct fileproc *fp;
2055 int error;
2056
2057 if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp , 0)) )
2058 return(error);
2059 if ( (fp->f_fglob->fg_flag & FWRITE) == 0 ) {
2060 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2061 return(EBADF);
2062 }
2063 if ( fp != NULL ) {
2064 error = dofilewrite( entryp->procp, fp, entryp->aiocb.aio_fildes,
2065 entryp->aiocb.aio_buf,
2066 entryp->aiocb.aio_nbytes,
2067 entryp->aiocb.aio_offset, FOF_OFFSET,
2068 &entryp->returnval );
2069
2070 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2071 }
2072 else {
2073 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2074 error = EBADF;
2075 }
2076
2077 return( error );
2078
2079 } /* do_aio_write */
2080
2081
2082 /*
2083 * aio_active_requests_for_process - return number of active async IO
2084 * requests for the given process.
2085 * NOTE - caller must hold aio lock!
2086 */
2087
2088 static int
2089 aio_active_requests_for_process( struct proc *procp )
2090 {
2091
2092 return( procp->aio_active_count );
2093
2094 } /* aio_active_requests_for_process */
2095
2096
2097 /*
2098 * do_aio_fsync
2099 */
2100 static int
2101 do_aio_fsync( aio_workq_entry *entryp )
2102 {
2103 struct vfs_context context;
2104 struct vnode *vp;
2105 struct fileproc *fp;
2106 int error;
2107
2108 /*
2109 * NOTE - we will not support AIO_DSYNC until fdatasync() is supported.
2110 * AIO_DSYNC is caught before we queue up a request and flagged as an error.
2111 * The following was shamelessly extracted from the fsync() implementation.
2112 */
2113
2114 error = fp_getfvp( entryp->procp, entryp->aiocb.aio_fildes, &fp, &vp);
2115 if ( error == 0 ) {
2116 if ( (error = vnode_getwithref(vp)) ) {
2117 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2118 entryp->returnval = -1;
2119 return(error);
2120 }
2121 context.vc_proc = entryp->procp;
2122 context.vc_ucred = fp->f_fglob->fg_cred;
2123
2124 error = VNOP_FSYNC( vp, MNT_WAIT, &context);
2125
2126 (void)vnode_put(vp);
2127
2128 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2129 }
2130 if ( error != 0 )
2131 entryp->returnval = -1;
2132
2133 return( error );
2134
2135 } /* do_aio_fsync */
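
/*
 * Illustrative sketch (user-space, not built with the kernel): the call that
 * reaches do_aio_fsync().  Per the note above, a dsync-style (AIO_DSYNC)
 * request is rejected before it is queued, so only the O_SYNC form is shown.
 * The helper name and polling loop are assumptions for illustration only.
 */
#if 0
#include <aio.h>
#include <errno.h>
#include <fcntl.h>
#include <string.h>

static int
flush_async( int fd )
{
	struct aiocb	cb;

	memset( &cb, 0, sizeof(cb) );
	cb.aio_fildes = fd;		/* descriptor must be open for writing */

	/* O_SYNC corresponds to the VNOP_FSYNC( vp, MNT_WAIT, ... ) call above */
	if ( aio_fsync( O_SYNC, &cb ) != 0 )
		return( errno );

	while ( aio_error( &cb ) == EINPROGRESS )
		;			/* poll or aio_suspend() in real code */
	return( (int) aio_return( &cb ) );
}
#endif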
2136
2137
2138 /*
2139 * is_already_queued - runs through our queues to see if the given
2140 * aiocbp / process is there. Returns TRUE if there is a match
2141 * on any of our aio queues.
2142 * NOTE - callers must hold aio lock!
2143 */
2144
2145 static boolean_t
2146 is_already_queued( struct proc *procp,
2147 user_addr_t aiocbp )
2148 {
2149 aio_workq_entry *entryp;
2150 boolean_t result;
2151
2152 result = FALSE;
2153
2154 /* look for matches on our queue of async IO requests that have completed */
2155 TAILQ_FOREACH( entryp, &procp->aio_doneq, aio_workq_link ) {
2156 if ( aiocbp == entryp->uaiocbp ) {
2157 result = TRUE;
2158 goto ExitThisRoutine;
2159 }
2160 }
2161
2162 /* look for matches on our queue of active async IO requests */
2163 TAILQ_FOREACH( entryp, &procp->aio_activeq, aio_workq_link ) {
2164 if ( aiocbp == entryp->uaiocbp ) {
2165 result = TRUE;
2166 goto ExitThisRoutine;
2167 }
2168 }
2169
2170 /* look for matches on our queue of asynchronous todo work */
2171 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
2172 if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
2173 result = TRUE;
2174 goto ExitThisRoutine;
2175 }
2176 }
2177
2178 /* look for matches on our queue of synchronous todo work */
2179 TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
2180 if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
2181 result = TRUE;
2182 goto ExitThisRoutine;
2183 }
2184 }
2185
2186 ExitThisRoutine:
2187 return( result );
2188
2189 } /* is_already_queued */
2190
2191
2192 /*
2193 * aio initialization
2194 */
2195 __private_extern__ void
2196 aio_init( void )
2197 {
2198 int i;
2199
2200 aio_lock_grp_attr = lck_grp_attr_alloc_init();
2201 lck_grp_attr_setstat(aio_lock_grp_attr);
2202 aio_lock_grp = lck_grp_alloc_init("aio", aio_lock_grp_attr);
2203 aio_lock_attr = lck_attr_alloc_init();
2204 //lck_attr_setdebug(aio_lock_attr);
2205
2206 aio_lock = lck_mtx_alloc_init(aio_lock_grp, aio_lock_attr);
2207
2208 AIO_LOCK;
2209 TAILQ_INIT( &aio_anchor.aio_async_workq );
2210 TAILQ_INIT( &aio_anchor.lio_sync_workq );
2211 aio_anchor.aio_async_workq_count = 0;
2212 aio_anchor.lio_sync_workq_count = 0;
2213 aio_anchor.aio_active_count = 0;
2214 aio_anchor.aio_done_count = 0;
2215 AIO_UNLOCK;
2216
2217 i = sizeof( aio_workq_entry );
2218 aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );
2219
2220 _aio_create_worker_threads( aio_worker_threads );
2221
2222 return;
2223
2224 } /* aio_init */
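
/*
 * Illustrative sketch (user-space, not built with the kernel): reading the
 * tunables that size the aio_workq zone and worker-thread pool set up in
 * aio_init() above.  The sysctl names kern.aiomax, kern.aioprocmax and
 * kern.aiothreads are assumed here to correspond to aio_max_requests,
 * aio_max_requests_per_process and aio_worker_threads; verify the names on
 * the target system.
 */
#if 0
#include <stdio.h>
#include <sys/types.h>
#include <sys/sysctl.h>

int
main( void )
{
	const char	*names[] = { "kern.aiomax", "kern.aioprocmax", "kern.aiothreads" };
	int		i;

	for ( i = 0; i < 3; i++ ) {
		int	value;
		size_t	len = sizeof(value);

		if ( sysctlbyname( names[i], &value, &len, NULL, 0 ) == 0 )
			printf( "%s = %d\n", names[i], value );
		else
			printf( "%s: not available\n", names[i] );
	}
	return( 0 );
}
#endif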
2225
2226
2227 /*
2228 * aio worker threads are created here.
2229 */
2230 __private_extern__ void
2231 _aio_create_worker_threads( int num )
2232 {
2233 int i;
2234
2235 /* create some worker threads to handle the async IO requests */
2236 for ( i = 0; i < num; i++ ) {
2237 thread_t myThread;
2238
2239 myThread = kernel_thread( kernel_task, aio_work_thread );
2240 if ( THREAD_NULL == myThread ) {
2241 printf( "%s - failed to create a work thread \n", __FUNCTION__ );
2242 }
2243 }
2244
2245 return;
2246
2247 } /* _aio_create_worker_threads */
2248
2249 /*
2250 * Return the current activation utask
2251 */
2252 task_t
2253 get_aiotask(void)
2254 {
2255 return ((struct uthread *)get_bsdthread_info(current_thread()))->uu_aio_task;
2256 }
2257
2258
2259 /*
2260 * do_munge_aiocb - in the case of an aiocb from a 32-bit process we need
2261 * to expand some longs and pointers to the correct sizes so that downstream
2262 * code can always work on the same type of aiocb (in our case that is a
2263 * user_aiocb).
2264 */
2265 static void
2266 do_munge_aiocb( struct aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
2267 {
2268 the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2269 the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2270 the_user_aiocbp->aio_buf = CAST_USER_ADDR_T(my_aiocbp->aio_buf);
2271 the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2272 the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2273 the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
2274
2275 /* special case here. since we do not know if sigev_value is an */
2276 /* int or a ptr we do NOT cast the ptr to a user_addr_t. This */
2277 /* means if we send this info back to user space we need to remember */
2278 /* sigev_value was not expanded for the 32-bit case. */
2279 /* NOTE - this does NOT affect us since we don't support sigev_value */
2280 /* yet in the aio context. */
2281 //LP64
2282 the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2283 the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
2284 the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2285 my_aiocbp->aio_sigevent.sigev_value.sival_int;
2286 the_user_aiocbp->aio_sigevent.sigev_notify_function =
2287 CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function);
2288 the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2289 CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_attributes);
2290 }
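
/*
 * Illustrative sketch (toy example, not part of the kernel build): the general
 * widening pattern do_munge_aiocb() applies, shown with made-up structures.
 * A 32-bit caller lays out pointers and longs as 4-byte fields; the kernel
 * copies that layout in and expands it into fixed 64-bit-wide fields, much as
 * CAST_USER_ADDR_T() widens the user pointers above.
 */
#if 0
#include <stdint.h>
#include <string.h>

/* what a 32-bit caller laid out: pointer and size fields are 4 bytes wide */
struct toy_req32 {
	int32_t		fd;
	uint32_t	buf;		/* a 32-bit user pointer */
	uint32_t	nbytes;		/* a 32-bit size */
};

/* what the rest of the (64-bit aware) code wants to operate on */
struct toy_req64 {
	int32_t		fd;
	uint64_t	buf;		/* user pointer widened to 64 bits */
	uint64_t	nbytes;
};

static void
munge_toy_req( const struct toy_req32 *in, struct toy_req64 *out )
{
	memset( out, 0, sizeof(*out) );
	out->fd = in->fd;
	out->buf = (uint64_t) in->buf;		/* zero-extend the 32-bit pointer */
	out->nbytes = (uint64_t) in->nbytes;
}
#endif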