1 /*
2 * Copyright (c) 2003-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the
10 * License may not be used to create, or enable the creation or
11 * redistribution of, unlawful or unlicensed copies of an Apple operating
12 * system, or to circumvent, violate, or enable the circumvention or
13 * violation of, any terms of an Apple operating system software license
14 * agreement.
15 *
16 * Please obtain a copy of the License at
17 * http://www.opensource.apple.com/apsl/ and read it before using this
18 * file.
19 *
20 * The Original Code and all software distributed under the License are
21 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
22 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
23 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
24 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
25 * Please see the License for the specific language governing rights and
26 * limitations under the License.
27 *
28 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
29 */
30
31
32 /*
33 * todo:
34 * 1) ramesh is looking into how to replace taking a reference on
35 * the user's map (vm_map_reference()) since it is believed that
36 * would not hold the process for us.
37 * 2) david is looking into a way for us to set the priority of the
38 * worker threads to match that of the user's thread when the
39 * async IO was queued.
40 */
41
42
43 /*
44 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
45 */
46
47 #include <sys/systm.h>
48 #include <sys/fcntl.h>
49 #include <sys/file_internal.h>
50 #include <sys/filedesc.h>
51 #include <sys/kernel.h>
52 #include <sys/vnode_internal.h>
53 #include <sys/malloc.h>
54 #include <sys/mount_internal.h>
55 #include <sys/param.h>
56 #include <sys/proc_internal.h>
57 #include <sys/sysctl.h>
58 #include <sys/unistd.h>
59 #include <sys/user.h>
60
61 #include <sys/aio_kern.h>
62 #include <sys/sysproto.h>
63
64 #include <machine/limits.h>
65
66 #include <mach/mach_types.h>
67 #include <kern/kern_types.h>
68 #include <kern/zalloc.h>
69 #include <kern/task.h>
70 #include <kern/sched_prim.h>
71
72 #include <vm/vm_map.h>
73
74 #include <sys/kdebug.h>
75 #define AIO_work_queued 1
76 #define AIO_worker_wake 2
77 #define AIO_completion_sig 3
78 #define AIO_completion_cleanup_wait 4
79 #define AIO_completion_cleanup_wake 5
80 #define AIO_completion_suspend_wake 6
81 #define AIO_fsync_delay 7
82 #define AIO_cancel 10
83 #define AIO_cancel_async_workq 11
84 #define AIO_cancel_sync_workq 12
85 #define AIO_cancel_activeq 13
86 #define AIO_cancel_doneq 14
87 #define AIO_fsync 20
88 #define AIO_read 30
89 #define AIO_write 40
90 #define AIO_listio 50
91 #define AIO_error 60
92 #define AIO_error_val 61
93 #define AIO_error_activeq 62
94 #define AIO_error_workq 63
95 #define AIO_return 70
96 #define AIO_return_val 71
97 #define AIO_return_activeq 72
98 #define AIO_return_workq 73
99 #define AIO_exec 80
100 #define AIO_exit 90
101 #define AIO_exit_sleep 91
102 #define AIO_close 100
103 #define AIO_close_sleep 101
104 #define AIO_suspend 110
105 #define AIO_suspend_sleep 111
106 #define AIO_worker_thread 120
107
108 #if 0
109 #undef KERNEL_DEBUG
110 #define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
111 #endif
112
113 /*
114 * aio requests queue up on the aio_async_workq or lio_sync_workq (for
115 * lio_listio LIO_WAIT). Requests then move to the per process aio_activeq
116  * (proc.aio_activeq) when one of our worker threads starts the IO.
117  * Finally, requests move to the per process aio_doneq (proc.aio_doneq)
118  * when the IO request completes.  The request remains on aio_doneq until the
119  * user process calls aio_return or the process exits; either way, that is our
120  * trigger to release aio resources.
121 */
122 struct aio_anchor_cb
123 {
124 int aio_async_workq_count; /* entries on aio_async_workq */
125 int lio_sync_workq_count; /* entries on lio_sync_workq */
126 int aio_active_count; /* entries on all active queues (proc.aio_activeq) */
127 int aio_done_count; /* entries on all done queues (proc.aio_doneq) */
128 TAILQ_HEAD( , aio_workq_entry ) aio_async_workq;
129 TAILQ_HEAD( , aio_workq_entry ) lio_sync_workq;
130 };
131 typedef struct aio_anchor_cb aio_anchor_cb;
132
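/*
 * Illustrative user-space sketch of the lifecycle described above (not part
 * of this file; fd and buf are hypothetical names): a request queued by
 * aio_read() is polled with aio_error() until it leaves EINPROGRESS, then
 * reaped with aio_return(), which is what finally releases the kernel
 * resources held on aio_doneq.
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *	#include <string.h>
 *
 *	struct aiocb cb;
 *	memset( &cb, 0, sizeof(cb) );
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = sizeof(buf);
 *	cb.aio_offset = 0;
 *	if ( aio_read( &cb ) == 0 ) {
 *		while ( aio_error( &cb ) == EINPROGRESS )
 *			;				/* request is on a work or active queue */
 *		ssize_t nread = aio_return( &cb );	/* request leaves aio_doneq */
 *	}
 *
 * A real caller would use aio_suspend() rather than spinning on aio_error().
 */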
133
134 /*
135 * Notes on aio sleep / wake channels.
136  * We currently pick a couple of fields within the proc structure to give us
137  * sleep channels that do not collide with any other kernel routines.
138 * At this time, for binary compatibility reasons, we cannot create new proc fields.
139 */
140 #define AIO_SUSPEND_SLEEP_CHAN p_estcpu
141 #define AIO_CLEANUP_SLEEP_CHAN p_pctcpu
142
143
144 /*
145  * async IO locking macros used to protect critical sections.
146 */
147 #define AIO_LOCK lck_mtx_lock(aio_lock)
148 #define AIO_UNLOCK lck_mtx_unlock(aio_lock)
149
150
151 /*
152 * LOCAL PROTOTYPES
153 */
154 static int aio_active_requests_for_process( struct proc *procp );
155 static boolean_t aio_delay_fsync_request( aio_workq_entry *entryp );
156 static int aio_free_request( aio_workq_entry *entryp, vm_map_t the_map );
157 static int aio_get_all_queues_count( void );
158 static int aio_get_process_count( struct proc *procp );
159 static aio_workq_entry * aio_get_some_work( void );
160 static boolean_t aio_last_group_io( aio_workq_entry *entryp );
161 static void aio_mark_requests( aio_workq_entry *entryp );
162 static int aio_queue_async_request( struct proc *procp,
163 user_addr_t aiocbp,
164 int kindOfIO );
165 static int aio_validate( aio_workq_entry *entryp );
166 static void aio_work_thread( void );
167 static int do_aio_cancel( struct proc *p,
168 int fd,
169 user_addr_t aiocbp,
170 boolean_t wait_for_completion,
171 boolean_t disable_notification );
172 static void do_aio_completion( aio_workq_entry *entryp );
173 static int do_aio_fsync( aio_workq_entry *entryp );
174 static int do_aio_read( aio_workq_entry *entryp );
175 static int do_aio_write( aio_workq_entry *entryp );
176 static void do_munge_aiocb( struct aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
177 static boolean_t is_already_queued( struct proc *procp,
178 user_addr_t aiocbp );
179 static int lio_create_async_entry( struct proc *procp,
180 user_addr_t aiocbp,
181 user_addr_t sigp,
182 long group_tag,
183 aio_workq_entry **entrypp );
184 static int lio_create_sync_entry( struct proc *procp,
185 user_addr_t aiocbp,
186 long group_tag,
187 aio_workq_entry **entrypp );
188
189
190 /*
191 * EXTERNAL PROTOTYPES
192 */
193
194 /* in ...bsd/kern/sys_generic.c */
195 extern int dofileread( struct proc *p, struct fileproc *fp, int fd,
196 user_addr_t bufp, user_size_t nbyte,
197 off_t offset, int flags, user_ssize_t *retval );
198 extern int dofilewrite( struct proc *p, struct fileproc *fp, int fd,
199 user_addr_t bufp, user_size_t nbyte, off_t offset,
200 int flags, user_ssize_t *retval );
201
202 /*
203 * aio external global variables.
204 */
205 extern int aio_max_requests; /* AIO_MAX - configurable */
206 extern int aio_max_requests_per_process; /* AIO_PROCESS_MAX - configurable */
207 extern int aio_worker_threads; /* AIO_THREAD_COUNT - configurable */
208
209
210 /*
211 * aio static variables.
212 */
213 static aio_anchor_cb aio_anchor;
214 static lck_mtx_t * aio_lock;
215 static lck_grp_t * aio_lock_grp;
216 static lck_attr_t * aio_lock_attr;
217 static lck_grp_attr_t * aio_lock_grp_attr;
218 static struct zone *aio_workq_zonep;
219
220
221
222
223 /*
224 * aio_cancel - attempt to cancel one or more async IO requests currently
225 * outstanding against file descriptor uap->fd. If uap->aiocbp is not
226 * NULL then only one specific IO is cancelled (if possible). If uap->aiocbp
227  * is NULL then all outstanding async IO requests for the given file
228 * descriptor are cancelled (if possible).
229 */
230
231 int
232 aio_cancel( struct proc *p, struct aio_cancel_args *uap, int *retval )
233 {
234 struct user_aiocb my_aiocb;
235 int result;
236
237 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START,
238 (int)p, (int)uap->aiocbp, 0, 0, 0 );
239
240 /* quick check to see if there are any async IO requests queued up */
241 AIO_LOCK;
242 result = aio_get_all_queues_count( );
243 AIO_UNLOCK;
244 if ( result < 1 ) {
245 result = EBADF;
246 goto ExitRoutine;
247 }
248
249 *retval = -1;
250 if ( uap->aiocbp != USER_ADDR_NULL ) {
251 if ( !IS_64BIT_PROCESS(p) ) {
252 struct aiocb aiocb32;
253
254 result = copyin( uap->aiocbp, &aiocb32, sizeof(aiocb32) );
255 if ( result == 0 )
256 do_munge_aiocb( &aiocb32, &my_aiocb );
257 } else
258 result = copyin( uap->aiocbp, &my_aiocb, sizeof(my_aiocb) );
259
260 if ( result != 0 ) {
261 result = EAGAIN;
262 goto ExitRoutine;
263 }
264
265 /* NOTE - POSIX standard says a mismatch between the file */
266 /* descriptor passed in and the file descriptor embedded in */
267 /* the aiocb causes unspecified results. We return EBADF in */
268 /* that situation. */
269 if ( uap->fd != my_aiocb.aio_fildes ) {
270 result = EBADF;
271 goto ExitRoutine;
272 }
273 }
274 result = do_aio_cancel( p, uap->fd, uap->aiocbp, FALSE, FALSE );
275
276 if ( result != -1 ) {
277 *retval = result;
278 result = 0;
279 goto ExitRoutine;
280 }
281
282 result = EBADF;
283
284 ExitRoutine:
285 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END,
286 (int)p, (int)uap->aiocbp, result, 0, 0 );
287
288 return( result );
289
290 } /* aio_cancel */
291
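/*
 * Minimal user-space sketch of the two cancellation forms handled above
 * (fd and cb are hypothetical).  Passing an aiocb targets one request;
 * passing NULL targets every outstanding request on the descriptor.
 *
 *	switch ( aio_cancel( fd, &cb ) ) {
 *	case AIO_CANCELED:		/* request was cancelled */
 *		break;
 *	case AIO_NOTCANCELED:		/* still active; poll aio_error() later */
 *		break;
 *	case AIO_ALLDONE:		/* already complete; reap with aio_return() */
 *		break;
 *	default:			/* -1: no match or error, check errno */
 *		break;
 *	}
 *
 *	(void) aio_cancel( fd, NULL );	/* cancel everything queued on fd */
 */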
292
293 /*
294 * _aio_close - internal function used to clean up async IO requests for
295 * a file descriptor that is closing.
296 * THIS MAY BLOCK.
297 */
298
299 __private_extern__ void
300 _aio_close( struct proc *p, int fd )
301 {
302 int error, count;
303
304 /* quick check to see if there are any async IO requests queued up */
305 AIO_LOCK;
306 count = aio_get_all_queues_count( );
307 AIO_UNLOCK;
308 if ( count < 1 )
309 return;
310
311 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_START,
312 (int)p, fd, 0, 0, 0 );
313
314 /* cancel all async IO requests on our todo queues for this file descriptor */
315 error = do_aio_cancel( p, fd, 0, TRUE, FALSE );
316 if ( error == AIO_NOTCANCELED ) {
317 /*
318 * AIO_NOTCANCELED is returned when we find an aio request for this process
319 * and file descriptor on the active async IO queue. Active requests cannot
320 * be cancelled so we must wait for them to complete. We will get a special
321 * wake up call on our channel used to sleep for ALL active requests to
322 * complete. This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
323 * when we must wait for all active aio requests.
324 */
325
326 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) | DBG_FUNC_NONE,
327 (int)p, fd, 0, 0, 0 );
328
329 tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_close", 0 );
330 }
331
332 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_END,
333 (int)p, fd, 0, 0, 0 );
334
335 return;
336
337 } /* _aio_close */
338
339
340 /*
341 * aio_error - return the error status associated with the async IO
342 * request referred to by uap->aiocbp. The error status is the errno
343  * value that would be set by the corresponding IO request (read, write,
344 * fdatasync, or sync).
345 */
346
347 int
348 aio_error( struct proc *p, struct aio_error_args *uap, int *retval )
349 {
350 aio_workq_entry *entryp;
351 int error;
352
353 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_START,
354 (int)p, (int)uap->aiocbp, 0, 0, 0 );
355
356 AIO_LOCK;
357
358 /* quick check to see if there are any async IO requests queued up */
359 if ( aio_get_all_queues_count( ) < 1 ) {
360 error = EINVAL;
361 goto ExitRoutine;
362 }
363
364 /* look for a match on our queue of async IO requests that have completed */
365 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
366 if ( entryp->uaiocbp == uap->aiocbp ) {
367 *retval = entryp->errorval;
368 error = 0;
369 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) | DBG_FUNC_NONE,
370 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
371 goto ExitRoutine;
372 }
373 }
374
375 /* look for a match on our queue of active async IO requests */
376 TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
377 if ( entryp->uaiocbp == uap->aiocbp ) {
378 *retval = EINPROGRESS;
379 error = 0;
380 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) | DBG_FUNC_NONE,
381 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
382 goto ExitRoutine;
383 }
384 }
385
386 /* look for a match on our queue of todo work */
387 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
388 if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
389 *retval = EINPROGRESS;
390 error = 0;
391 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_workq)) | DBG_FUNC_NONE,
392 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
393 goto ExitRoutine;
394 }
395 }
396 error = EINVAL;
397
398 ExitRoutine:
399 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_END,
400 (int)p, (int)uap->aiocbp, error, 0, 0 );
401 AIO_UNLOCK;
402
403 return( error );
404
405 } /* aio_error */
406
407
408 /*
409 * aio_fsync - asynchronously force all IO operations associated
410 * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
411  * queued at the time of the call, to the synchronized completion state.
412 * NOTE - we do not support op O_DSYNC at this point since we do not support the
413 * fdatasync() call.
414 */
415
416 int
417 aio_fsync( struct proc *p, struct aio_fsync_args *uap, int *retval )
418 {
419 int error;
420 int fsync_kind;
421
422 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_START,
423 (int)p, (int)uap->aiocbp, uap->op, 0, 0 );
424
425 *retval = 0;
426 /* 0 := O_SYNC for binary backward compatibility with Panther */
427 if (uap->op == O_SYNC || uap->op == 0)
428 fsync_kind = AIO_FSYNC;
429 #if 0 // we don't support fdatasync() call yet
430 else if ( uap->op == O_DSYNC )
431 fsync_kind = AIO_DSYNC;
432 #endif
433 else {
434 *retval = -1;
435 error = EINVAL;
436 goto ExitRoutine;
437 }
438
439 error = aio_queue_async_request( p, uap->aiocbp, fsync_kind );
440 if ( error != 0 )
441 *retval = -1;
442
443 ExitRoutine:
444 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_END,
445 (int)p, (int)uap->aiocbp, error, 0, 0 );
446
447 return( error );
448
449 } /* aio_fsync */
450
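/*
 * User-space sketch of the call handled above (cb and fd are hypothetical).
 * Only O_SYNC (or 0, for Panther binary compatibility) is accepted here,
 * since fdatasync()/O_DSYNC is not supported yet; any other op fails with
 * EINVAL.
 *
 *	struct aiocb cb;
 *	memset( &cb, 0, sizeof(cb) );
 *	cb.aio_fildes = fd;
 *	if ( aio_fsync( O_SYNC, &cb ) != 0 )
 *		/* EINVAL for a bad op, EAGAIN if it could not be queued */ ;
 */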
451
452 /* aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
453 * file descriptor (uap->aiocbp->aio_fildes) into the buffer
454 * (uap->aiocbp->aio_buf).
455 */
456
457 int
458 aio_read( struct proc *p, struct aio_read_args *uap, int *retval )
459 {
460 int error;
461
462 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_START,
463 (int)p, (int)uap->aiocbp, 0, 0, 0 );
464
465 *retval = 0;
466
467 error = aio_queue_async_request( p, uap->aiocbp, AIO_READ );
468 if ( error != 0 )
469 *retval = -1;
470
471 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_END,
472 (int)p, (int)uap->aiocbp, error, 0, 0 );
473
474 return( error );
475
476 } /* aio_read */
477
478
479 /*
480 * aio_return - return the return status associated with the async IO
481 * request referred to by uap->aiocbp. The return status is the value
482  * that would be returned by the corresponding IO request (read, write,
483 * fdatasync, or sync). This is where we release kernel resources
484 * held for async IO call associated with the given aiocb pointer.
485 */
486
487 int
488 aio_return( struct proc *p, struct aio_return_args *uap, user_ssize_t *retval )
489 {
490 aio_workq_entry *entryp;
491 int error;
492 boolean_t lock_held;
493
494 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_START,
495 (int)p, (int)uap->aiocbp, 0, 0, 0 );
496
497 AIO_LOCK;
498 lock_held = TRUE;
499 *retval = 0;
500
501 /* quick check to see if there are any async IO requests queued up */
502 if ( aio_get_all_queues_count( ) < 1 ) {
503 error = EINVAL;
504 goto ExitRoutine;
505 }
506
507 /* look for a match on our queue of async IO requests that have completed */
508 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
509 if ( entryp->uaiocbp == uap->aiocbp ) {
510 TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
511 aio_anchor.aio_done_count--;
512 p->aio_done_count--;
513
514 *retval = entryp->returnval;
515
516 /* we cannot free requests that are still completing */
517 if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
518 vm_map_t my_map;
519
520 my_map = entryp->aio_map;
521 entryp->aio_map = VM_MAP_NULL;
522 AIO_UNLOCK;
523 lock_held = FALSE;
524 aio_free_request( entryp, my_map );
525 }
526 else
527 /* tell completion code to free this request */
528 entryp->flags |= AIO_DO_FREE;
529 error = 0;
530 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) | DBG_FUNC_NONE,
531 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
532 goto ExitRoutine;
533 }
534 }
535
536 /* look for a match on our queue of active async IO requests */
537 TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
538 if ( entryp->uaiocbp == uap->aiocbp ) {
539 error = EINPROGRESS;
540 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) | DBG_FUNC_NONE,
541 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
542 goto ExitRoutine;
543 }
544 }
545
546 /* look for a match on our queue of todo work */
547 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
548 if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
549 error = EINPROGRESS;
550 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_workq)) | DBG_FUNC_NONE,
551 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
552 goto ExitRoutine;
553 }
554 }
555 error = EINVAL;
556
557 ExitRoutine:
558 if ( lock_held )
559 AIO_UNLOCK;
560 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_END,
561 (int)p, (int)uap->aiocbp, error, 0, 0 );
562
563 return( error );
564
565 } /* aio_return */
566
567
568 /*
569 * _aio_exec - internal function used to clean up async IO requests for
570 * a process that is going away due to exec(). We cancel any async IOs
571 * we can and wait for those already active. We also disable signaling
572 * for cancelled or active aio requests that complete.
573 * This routine MAY block!
574 */
575
576 __private_extern__ void
577 _aio_exec( struct proc *p )
578 {
579
580 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_START,
581 (int)p, 0, 0, 0, 0 );
582
583 _aio_exit( p );
584
585 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_END,
586 (int)p, 0, 0, 0, 0 );
587
588 return;
589
590 } /* _aio_exec */
591
592
593 /*
594 * _aio_exit - internal function used to clean up async IO requests for
595 * a process that is terminating (via exit() or exec() ). We cancel any async IOs
596 * we can and wait for those already active. We also disable signaling
597 * for cancelled or active aio requests that complete. This routine MAY block!
598 */
599
600 __private_extern__ void
601 _aio_exit( struct proc *p )
602 {
603 int error, count;
604 aio_workq_entry *entryp;
605
606 /* quick check to see if there are any async IO requests queued up */
607 AIO_LOCK;
608 count = aio_get_all_queues_count( );
609 AIO_UNLOCK;
610 if ( count < 1 ) {
611 return;
612 }
613
614 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_START,
615 (int)p, 0, 0, 0, 0 );
616
617 /*
618 * cancel async IO requests on the todo work queue and wait for those
619 * already active to complete.
620 */
621 error = do_aio_cancel( p, 0, 0, TRUE, TRUE );
622 if ( error == AIO_NOTCANCELED ) {
623 /*
624 * AIO_NOTCANCELED is returned when we find an aio request for this process
625 * on the active async IO queue. Active requests cannot be cancelled so we
626 * must wait for them to complete. We will get a special wake up call on
627 * our channel used to sleep for ALL active requests to complete. This sleep
628 * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
629 * active aio requests.
630 */
631
632 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) | DBG_FUNC_NONE,
633 (int)p, 0, 0, 0, 0 );
634
635 tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_exit", 0 );
636 }
637
638 /* release all aio resources used by this process */
639 AIO_LOCK;
640 entryp = TAILQ_FIRST( &p->aio_doneq );
641 while ( entryp != NULL ) {
642 aio_workq_entry *next_entryp;
643
644 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
645 TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
646 aio_anchor.aio_done_count--;
647 p->aio_done_count--;
648
649 /* we cannot free requests that are still completing */
650 if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
651 vm_map_t my_map;
652
653 my_map = entryp->aio_map;
654 entryp->aio_map = VM_MAP_NULL;
655 AIO_UNLOCK;
656 aio_free_request( entryp, my_map );
657
658 /* need to start over since aio_doneq may have been */
659 /* changed while we were away. */
660 AIO_LOCK;
661 entryp = TAILQ_FIRST( &p->aio_doneq );
662 continue;
663 }
664 else
665 /* tell completion code to free this request */
666 entryp->flags |= AIO_DO_FREE;
667 entryp = next_entryp;
668 }
669 AIO_UNLOCK;
670
671 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END,
672 (int)p, 0, 0, 0, 0 );
673
674 return;
675
676 } /* _aio_exit */
677
678
679 /*
680 * do_aio_cancel - cancel async IO requests (if possible). We get called by
681 * aio_cancel, close, and at exit.
682 * There are three modes of operation: 1) cancel all async IOs for a process -
683 * fd is 0 and aiocbp is NULL 2) cancel all async IOs for file descriptor - fd
684 * is > 0 and aiocbp is NULL 3) cancel one async IO associated with the given
685 * aiocbp.
686 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
687 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
688 * target async IO requests, and AIO_ALLDONE if all target async IO requests
689 * were already complete.
690  * WARNING - do not dereference aiocbp in this routine, it may point to user
691 * land data that has not been copied in (when called from aio_cancel() )
692 */
693
694 static int
695 do_aio_cancel( struct proc *p, int fd, user_addr_t aiocbp,
696 boolean_t wait_for_completion, boolean_t disable_notification )
697 {
698 aio_workq_entry *entryp;
699 int result;
700
701 result = -1;
702
703 /* look for a match on our queue of async todo work. */
704 AIO_LOCK;
705 entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
706 while ( entryp != NULL ) {
707 aio_workq_entry *next_entryp;
708
709 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
710 if ( p == entryp->procp ) {
711 if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
712 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
713 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
714 /* we found a match so we remove the entry from the */
715 /* todo work queue and place it on the done queue */
716 TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
717 aio_anchor.aio_async_workq_count--;
718 entryp->errorval = ECANCELED;
719 entryp->returnval = -1;
720 if ( disable_notification )
721 entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
722 result = AIO_CANCELED;
723
724 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) | DBG_FUNC_NONE,
725 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
726
727 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
728 aio_anchor.aio_done_count++;
729 p->aio_done_count++;
730 entryp->flags |= AIO_COMPLETION;
731 AIO_UNLOCK;
732
733 /* do completion processing for this request */
734 do_aio_completion( entryp );
735
736 AIO_LOCK;
737 entryp->flags &= ~AIO_COMPLETION;
738 if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
739 vm_map_t my_map;
740
741 my_map = entryp->aio_map;
742 entryp->aio_map = VM_MAP_NULL;
743 AIO_UNLOCK;
744 aio_free_request( entryp, my_map );
745 }
746 else
747 AIO_UNLOCK;
748
749 if ( aiocbp != USER_ADDR_NULL ) {
750 return( result );
751 }
752
753 /* need to start over since aio_async_workq may have been */
754 /* changed while we were away doing completion processing. */
755 AIO_LOCK;
756 entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
757 continue;
758 }
759 }
760 entryp = next_entryp;
761 } /* while... */
762
763 /*
764 * look for a match on our queue of synchronous todo work. This will
765 * be a rare occurrence but could happen if a process is terminated while
766 * processing a lio_listio call.
767 */
768 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
769 while ( entryp != NULL ) {
770 aio_workq_entry *next_entryp;
771
772 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
773 if ( p == entryp->procp ) {
774 if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
775 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
776 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
777 /* we found a match so we remove the entry from the */
778 /* todo work queue and place it on the done queue */
779 TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
780 aio_anchor.lio_sync_workq_count--;
781 entryp->errorval = ECANCELED;
782 entryp->returnval = -1;
783 if ( disable_notification )
784 entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
785 result = AIO_CANCELED;
786
787 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_sync_workq)) | DBG_FUNC_NONE,
788 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
789
790 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
791 aio_anchor.aio_done_count++;
792 p->aio_done_count++;
793 if ( aiocbp != USER_ADDR_NULL ) {
794 AIO_UNLOCK;
795 return( result );
796 }
797 }
798 }
799 entryp = next_entryp;
800 } /* while... */
801
802 /*
803 * look for a match on our queue of active async IO requests and
804 * return AIO_NOTCANCELED result.
805 */
806 TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
807 if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
808 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
809 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
810 result = AIO_NOTCANCELED;
811
812 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE,
813 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
814
815 if ( wait_for_completion )
816 entryp->flags |= AIO_WAITING; /* flag for special completion processing */
817 if ( disable_notification )
818 entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
819 if ( aiocbp != USER_ADDR_NULL ) {
820 AIO_UNLOCK;
821 return( result );
822 }
823 }
824 }
825
826 /*
827 * if we didn't find any matches on the todo or active queues then look for a
828 * match on our queue of async IO requests that have completed and if found
829 * return AIO_ALLDONE result.
830 */
831 if ( result == -1 ) {
832 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
833 if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
834 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
835 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
836 result = AIO_ALLDONE;
837
838 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE,
839 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
840
841 if ( aiocbp != USER_ADDR_NULL ) {
842 AIO_UNLOCK;
843 return( result );
844 }
845 }
846 }
847 }
848 AIO_UNLOCK;
849
850 return( result );
851
852 } /* do_aio_cancel */
853
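/*
 * The three modes above share one match test, repeated for each queue.  A
 * minimal sketch of that test as a standalone predicate (hypothetical helper,
 * not part of this file):
 *
 *	static boolean_t
 *	aio_entry_matches( aio_workq_entry *entryp, int fd, user_addr_t aiocbp )
 *	{
 *		if ( aiocbp == USER_ADDR_NULL && fd == 0 )
 *			return( TRUE );		/* mode 1 - all IOs for the process */
 *		if ( aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp )
 *			return( TRUE );		/* mode 3 - one specific aiocb */
 *		if ( aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes )
 *			return( TRUE );		/* mode 2 - all IOs for one fd */
 *		return( FALSE );
 *	}
 */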
854
855 /*
856 * aio_suspend - suspend the calling thread until at least one of the async
857 * IO operations referenced by uap->aiocblist has completed, until a signal
858 * interrupts the function, or uap->timeoutp time interval (optional) has
859 * passed.
860 * Returns 0 if one or more async IOs have completed else -1 and errno is
861 * set appropriately - EAGAIN if timeout elapses or EINTR if an interrupt
862 * woke us up.
863 */
864
865 int
866 aio_suspend( struct proc *p, struct aio_suspend_args *uap, int *retval )
867 {
868 int error;
869 int i, count;
870 uint64_t abstime;
871 struct user_timespec ts;
872 aio_workq_entry *entryp;
873 user_addr_t *aiocbpp;
874
875 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START,
876 (int)p, uap->nent, 0, 0, 0 );
877
878 *retval = -1;
879 abstime = 0;
880 aiocbpp = NULL;
881
882 /* quick check to see if there are any async IO requests queued up */
883 AIO_LOCK;
884 count = aio_get_all_queues_count( );
885 AIO_UNLOCK;
886 if ( count < 1 ) {
887 error = EINVAL;
888 goto ExitThisRoutine;
889 }
890
891 if ( uap->nent < 1 || uap->nent > aio_max_requests_per_process ) {
892 error = EINVAL;
893 goto ExitThisRoutine;
894 }
895
896 if ( uap->timeoutp != USER_ADDR_NULL ) {
897 if ( proc_is64bit(p) ) {
898 error = copyin( uap->timeoutp, &ts, sizeof(ts) );
899 }
900 else {
901 struct timespec temp;
902 error = copyin( uap->timeoutp, &temp, sizeof(temp) );
903 if ( error == 0 ) {
904 ts.tv_sec = temp.tv_sec;
905 ts.tv_nsec = temp.tv_nsec;
906 }
907 }
908 if ( error != 0 ) {
909 error = EAGAIN;
910 goto ExitThisRoutine;
911 }
912
913 if ( ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000 ) {
914 error = EINVAL;
915 goto ExitThisRoutine;
916 }
917
918 nanoseconds_to_absolutetime( (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
919 &abstime );
920 clock_absolutetime_interval_to_deadline( abstime, &abstime );
921 }
922
923 /* we reserve enough space for largest possible pointer size */
924 MALLOC( aiocbpp, user_addr_t *, (uap->nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK );
925 if ( aiocbpp == NULL ) {
926 error = EAGAIN;
927 goto ExitThisRoutine;
928 }
929
930 /* copyin our aiocb pointers from list */
931 error = copyin( uap->aiocblist, aiocbpp,
932 proc_is64bit(p) ? (uap->nent * sizeof(user_addr_t))
933 : (uap->nent * sizeof(uintptr_t)) );
934 if ( error != 0 ) {
935 error = EAGAIN;
936 goto ExitThisRoutine;
937 }
938
939 /* we depend on a list of user_addr_t's so we need to munge and expand */
940 /* when these pointers came from a 32-bit process */
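/* Walking backwards matters here: the copied-in 32-bit pointers and the */
/* widened user_addr_t's overlap in the same buffer.  For example, with */
/* nent == 2 the 32-bit values occupy bytes 0-7 while the 64-bit results */
/* occupy bytes 0-15, so expanding entry 0 first would overwrite entry 1 */
/* before it is read. */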
941 if ( !proc_is64bit(p) && sizeof(uintptr_t) < sizeof(user_addr_t) ) {
942 /* position to the last entry and work back from there */
943 uintptr_t *my_ptrp = ((uintptr_t *)aiocbpp) + (uap->nent - 1);
944 user_addr_t *my_addrp = aiocbpp + (uap->nent - 1);
945 for (i = 0; i < uap->nent; i++, my_ptrp--, my_addrp--) {
946 *my_addrp = (user_addr_t) (*my_ptrp);
947 }
948 }
949
950 /* check list of aio requests to see if any have completed */
951 AIO_LOCK;
952 for ( i = 0; i < uap->nent; i++ ) {
953 user_addr_t aiocbp;
954
955 /* NULL elements are legal so check for 'em */
956 aiocbp = *(aiocbpp + i);
957 if ( aiocbp == USER_ADDR_NULL )
958 continue;
959
960 /* return immediately if any aio request in the list is done */
961 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
962 if ( entryp->uaiocbp == aiocbp ) {
963 *retval = 0;
964 error = 0;
965 AIO_UNLOCK;
966 goto ExitThisRoutine;
967 }
968 }
969 } /* for ( ; i < uap->nent; ) */
970
971 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE,
972 (int)p, uap->nent, 0, 0, 0 );
973
974 /*
975 * wait for an async IO to complete or a signal fires or timeout expires.
976 * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
977 * interrupts us. If an async IO completes before a signal fires or our
978 * timeout expires, we get a wakeup call from aio_work_thread().
979 */
980 assert_wait_deadline( (event_t) &p->AIO_SUSPEND_SLEEP_CHAN, THREAD_ABORTSAFE, abstime );
981 AIO_UNLOCK;
982
983 error = thread_block( THREAD_CONTINUE_NULL );
984
985 if ( error == THREAD_AWAKENED ) {
986 /* got our wakeup call from aio_work_thread() */
987 *retval = 0;
988 error = 0;
989 }
990 else if ( error == THREAD_TIMED_OUT ) {
991 /* our timeout expired */
992 error = EAGAIN;
993 }
994 else {
995 /* we were interrupted */
996 error = EINTR;
997 }
998
999 ExitThisRoutine:
1000 if ( aiocbpp != NULL )
1001 FREE( aiocbpp, M_TEMP );
1002
1003 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_END,
1004 (int)p, uap->nent, error, 0, 0 );
1005
1006 return( error );
1007
1008 } /* aio_suspend */
1009
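/*
 * User-space sketch of a bounded wait using the path above (cb is a
 * hypothetical aiocb; NULL list entries are legal and skipped).  EAGAIN means
 * the timeout fired, EINTR means a signal woke us before any listed IO
 * finished.
 *
 *	const struct aiocb *list[1] = { &cb };
 *	struct timespec ts = { 2, 0 };		/* wait at most 2 seconds */
 *	if ( aio_suspend( list, 1, &ts ) == 0 ) {
 *		/* at least one listed request is done; reap it */
 *		ssize_t result = aio_return( &cb );
 *	}
 *	else if ( errno == EAGAIN ) {
 *		/* timed out with everything still in flight */
 *	}
 */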
1010
1011 /* aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
1012 * file descriptor (uap->aiocbp->aio_fildes) from the buffer
1013 * (uap->aiocbp->aio_buf).
1014 */
1015
1016 int
1017 aio_write( struct proc *p, struct aio_write_args *uap, int *retval )
1018 {
1019 int error;
1020
1021 *retval = 0;
1022
1023 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_START,
1024 (int)p, (int)uap->aiocbp, 0, 0, 0 );
1025
1026 error = aio_queue_async_request( p, uap->aiocbp, AIO_WRITE );
1027 if ( error != 0 )
1028 *retval = -1;
1029
1030 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_END,
1031 (int)p, (int)uap->aiocbp, error, 0, 0 );
1032
1033 return( error );
1034
1035 } /* aio_write */
1036
1037
1038 /*
1039 * lio_listio - initiate a list of IO requests. We process the list of aiocbs
1040 * either synchronously (mode == LIO_WAIT) or asynchronously (mode == LIO_NOWAIT).
1041 * The caller gets error and return status for each aiocb in the list via aio_error
1042 * and aio_return. We must keep completed requests until released by the
1043 * aio_return call.
1044 */
1045
1046 int
1047 lio_listio( struct proc *p, struct lio_listio_args *uap, int *retval )
1048 {
1049 int i;
1050 int call_result;
1051 int result;
1052 long group_tag;
1053 aio_workq_entry * *entryp_listp;
1054 user_addr_t *aiocbpp;
1055
1056 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START,
1057 (int)p, uap->nent, uap->mode, 0, 0 );
1058
1059 entryp_listp = NULL;
1060 aiocbpp = NULL;
1061 call_result = -1;
1062 *retval = -1;
1063 if ( !(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT) ) {
1064 call_result = EINVAL;
1065 goto ExitRoutine;
1066 }
1067
1068 if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
1069 call_result = EINVAL;
1070 goto ExitRoutine;
1071 }
1072
1073 /*
1074 * we use group_tag to mark IO requests for delayed completion processing
1075 * which means we wait until all IO requests in the group have completed
1076 * before we either return to the caller when mode is LIO_WAIT or signal
1077 * user when mode is LIO_NOWAIT.
1078 */
1079 group_tag = random();
1080
1081 /*
1082 * allocate a list of aio_workq_entry pointers that we will use to queue
1083 * up all our requests at once while holding our lock.
1084 */
1085 MALLOC( entryp_listp, void *, (uap->nent * sizeof(aio_workq_entry *)), M_TEMP, M_WAITOK );
1086 if ( entryp_listp == NULL ) {
1087 call_result = EAGAIN;
1088 goto ExitRoutine;
1089 }
1090
1091 /* we reserve enough space for largest possible pointer size */
1092 MALLOC( aiocbpp, user_addr_t *, (uap->nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK );
1093 if ( aiocbpp == NULL ) {
1094 call_result = EAGAIN;
1095 goto ExitRoutine;
1096 }
1097
1098 /* copyin our aiocb pointers from list */
1099 result = copyin( uap->aiocblist, aiocbpp,
1100 IS_64BIT_PROCESS(p) ? (uap->nent * sizeof(user_addr_t))
1101 : (uap->nent * sizeof(uintptr_t)) );
1102 if ( result != 0 ) {
1103 call_result = EAGAIN;
1104 goto ExitRoutine;
1105 }
1106
1107 /* we depend on a list of user_addr_t's so we need to munge and expand */
1108 /* when these pointers came from a 32-bit process */
1109 if ( !IS_64BIT_PROCESS(p) && sizeof(uintptr_t) < sizeof(user_addr_t) ) {
1110 /* position to the last entry and work back from there */
1111 uintptr_t *my_ptrp = ((uintptr_t *)aiocbpp) + (uap->nent - 1);
1112 user_addr_t *my_addrp = aiocbpp + (uap->nent - 1);
1113 for (i = 0; i < uap->nent; i++, my_ptrp--, my_addrp--) {
1114 *my_addrp = (user_addr_t) (*my_ptrp);
1115 }
1116 }
1117
1118 /* process list of aio requests */
1119 for ( i = 0; i < uap->nent; i++ ) {
1120 user_addr_t my_aiocbp;
1121
1122 *(entryp_listp + i) = NULL;
1123 my_aiocbp = *(aiocbpp + i);
1124
1125 /* NULL elements are legal so check for 'em */
1126 if ( my_aiocbp == USER_ADDR_NULL )
1127 continue;
1128
1129 if ( uap->mode == LIO_NOWAIT )
1130 result = lio_create_async_entry( p, my_aiocbp, uap->sigp,
1131 group_tag, (entryp_listp + i) );
1132 else
1133 result = lio_create_sync_entry( p, my_aiocbp, group_tag,
1134 (entryp_listp + i) );
1135
1136 if ( result != 0 && call_result == -1 )
1137 call_result = result;
1138 }
1139
1140 /*
1141 * we need to protect this section since we do not want any of these grouped
1142 * IO requests to begin until we have them all on the queue.
1143 */
1144 AIO_LOCK;
1145 for ( i = 0; i < uap->nent; i++ ) {
1146 aio_workq_entry *entryp;
1147
1148 /* NULL elements are legal so check for 'em */
1149 entryp = *(entryp_listp + i);
1150 if ( entryp == NULL )
1151 continue;
1152
1153 /* check our aio limits to throttle bad or rude user land behavior */
1154 if ( aio_get_all_queues_count( ) >= aio_max_requests ||
1155 aio_get_process_count( entryp->procp ) >= aio_max_requests_per_process ||
1156 is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
1157 vm_map_t my_map;
1158
1159 my_map = entryp->aio_map;
1160 entryp->aio_map = VM_MAP_NULL;
1161 if ( call_result == -1 )
1162 call_result = EAGAIN;
1163 AIO_UNLOCK;
1164 aio_free_request( entryp, my_map );
1165 AIO_LOCK;
1166 continue;
1167 }
1168
1169 /* place the request on the appropriate queue */
1170 if ( uap->mode == LIO_NOWAIT ) {
1171 TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1172 aio_anchor.aio_async_workq_count++;
1173
1174 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
1175 (int)p, (int)entryp->uaiocbp, 0, 0, 0 );
1176 }
1177 else {
1178 TAILQ_INSERT_TAIL( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
1179 aio_anchor.lio_sync_workq_count++;
1180 }
1181 }
1182
1183 if ( uap->mode == LIO_NOWAIT ) {
1184 /* caller does not want to wait so we'll fire off a worker thread and return */
1185 wakeup_one( (caddr_t) &aio_anchor.aio_async_workq );
1186 }
1187 else {
1188 aio_workq_entry *entryp;
1189 int error;
1190
1191 /*
1192 * mode is LIO_WAIT - handle the IO requests now.
1193 */
1194 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
1195 while ( entryp != NULL ) {
1196 if ( p == entryp->procp && group_tag == entryp->group_tag ) {
1197
1198 TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
1199 aio_anchor.lio_sync_workq_count--;
1200 AIO_UNLOCK;
1201
1202 if ( (entryp->flags & AIO_READ) != 0 ) {
1203 error = do_aio_read( entryp );
1204 }
1205 else if ( (entryp->flags & AIO_WRITE) != 0 ) {
1206 error = do_aio_write( entryp );
1207 }
1208 else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1209 error = do_aio_fsync( entryp );
1210 }
1211 else {
1212 printf( "%s - unknown aio request - flags 0x%02X \n",
1213 __FUNCTION__, entryp->flags );
1214 error = EINVAL;
1215 }
1216 entryp->errorval = error;
1217 if ( error != 0 && call_result == -1 )
1218 call_result = EIO;
1219
1220 AIO_LOCK;
1221 /* we're done with the IO request so move it on the done queue */
1222 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
1223 aio_anchor.aio_done_count++;
1224 p->aio_done_count++;
1225
1226 /* need to start over since lio_sync_workq may have been changed while we */
1227 /* were away doing the IO. */
1228 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
1229 continue;
1230 } /* p == entryp->procp */
1231
1232 entryp = TAILQ_NEXT( entryp, aio_workq_link );
1233 } /* while ( entryp != NULL ) */
1234 } /* uap->mode == LIO_WAIT */
1235 AIO_UNLOCK;
1236
1237 /* call_result == -1 means we had no trouble queueing up requests */
1238 if ( call_result == -1 ) {
1239 call_result = 0;
1240 *retval = 0;
1241 }
1242
1243 ExitRoutine:
1244 if ( entryp_listp != NULL )
1245 FREE( entryp_listp, M_TEMP );
1246 if ( aiocbpp != NULL )
1247 FREE( aiocbpp, M_TEMP );
1248
1249 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END,
1250 (int)p, call_result, 0, 0, 0 );
1251
1252 return( call_result );
1253
1254 } /* lio_listio */
1255
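/*
 * User-space sketch of the two modes handled above (fd, inbuf, outbuf are
 * hypothetical).  LIO_WAIT runs the list through lio_sync_workq and returns
 * when every entry is done; LIO_NOWAIT just queues the entries, and an
 * optional sigevent fires after the whole group (tracked by group_tag)
 * completes.
 *
 *	struct aiocb rd, wr;
 *	struct aiocb *list[2] = { &rd, &wr };
 *
 *	memset( &rd, 0, sizeof(rd) );
 *	rd.aio_fildes = fd;
 *	rd.aio_buf = inbuf;
 *	rd.aio_nbytes = sizeof(inbuf);
 *	rd.aio_lio_opcode = LIO_READ;
 *
 *	memset( &wr, 0, sizeof(wr) );
 *	wr.aio_fildes = fd;
 *	wr.aio_buf = outbuf;
 *	wr.aio_nbytes = sizeof(outbuf);
 *	wr.aio_offset = 4096;
 *	wr.aio_lio_opcode = LIO_WRITE;
 *
 *	if ( lio_listio( LIO_WAIT, list, 2, NULL ) != 0 )
 *		/* check aio_error()/aio_return() on each entry for details */ ;
 */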
1256
1257 /*
1258 * aio worker thread. this is where all the real work gets done.
1259 * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
1260 * after new work is queued up.
1261 */
1262
1263 static void
1264 aio_work_thread( void )
1265 {
1266 aio_workq_entry *entryp;
1267
1268 for( ;; ) {
1269 AIO_LOCK;
1270 entryp = aio_get_some_work();
1271 if ( entryp == NULL ) {
1272 /*
1273 * aio worker threads wait for some work to get queued up
1274 * by aio_queue_async_request. Once some work gets queued
1275 * it will wake up one of these worker threads just before
1276 * returning to our caller in user land.
1277 */
1278 assert_wait( (event_t) &aio_anchor.aio_async_workq, THREAD_UNINT );
1279 AIO_UNLOCK;
1280
1281 thread_block( (thread_continue_t)aio_work_thread );
1282 /* NOT REACHED */
1283 }
1284 else {
1285 int error;
1286 vm_map_t currentmap;
1287 vm_map_t oldmap = VM_MAP_NULL;
1288 task_t oldaiotask = TASK_NULL;
1289 struct uthread *uthreadp = NULL;
1290
1291 AIO_UNLOCK;
1292
1293 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START,
1294 (int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0 );
1295
1296 /*
1297 * Assume the target's address space identity for the duration
1298 * of the IO.
1299 */
1300 currentmap = get_task_map( (current_proc())->task );
1301 if ( currentmap != entryp->aio_map ) {
1302 uthreadp = (struct uthread *) get_bsdthread_info(current_thread());
1303 oldaiotask = uthreadp->uu_aio_task;
1304 uthreadp->uu_aio_task = entryp->procp->task;
1305 oldmap = vm_map_switch( entryp->aio_map );
1306 }
1307
1308 if ( (entryp->flags & AIO_READ) != 0 ) {
1309 error = do_aio_read( entryp );
1310 }
1311 else if ( (entryp->flags & AIO_WRITE) != 0 ) {
1312 error = do_aio_write( entryp );
1313 }
1314 else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1315 error = do_aio_fsync( entryp );
1316 }
1317 else {
1318 printf( "%s - unknown aio request - flags 0x%02X \n",
1319 __FUNCTION__, entryp->flags );
1320 error = EINVAL;
1321 }
1322 entryp->errorval = error;
1323 if ( currentmap != entryp->aio_map ) {
1324 (void) vm_map_switch( oldmap );
1325 uthreadp->uu_aio_task = oldaiotask;
1326 }
1327
1328 /* we're done with the IO request so pop it off the active queue and */
1329 /* push it on the done queue */
1330 AIO_LOCK;
1331 TAILQ_REMOVE( &entryp->procp->aio_activeq, entryp, aio_workq_link );
1332 aio_anchor.aio_active_count--;
1333 entryp->procp->aio_active_count--;
1334 TAILQ_INSERT_TAIL( &entryp->procp->aio_doneq, entryp, aio_workq_link );
1335 aio_anchor.aio_done_count++;
1336 entryp->procp->aio_done_count++;
1337 entryp->flags |= AIO_COMPLETION;
1338
1339 /* remove our reference to the user land map. */
1340 if ( VM_MAP_NULL != entryp->aio_map ) {
1341 vm_map_t my_map;
1342
1343 my_map = entryp->aio_map;
1344 entryp->aio_map = VM_MAP_NULL;
1345 AIO_UNLOCK; /* must unlock before calling vm_map_deallocate() */
1346 vm_map_deallocate( my_map );
1347 }
1348 else {
1349 AIO_UNLOCK;
1350 }
1351
1352 do_aio_completion( entryp );
1353
1354 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END,
1355 (int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval,
1356 entryp->returnval, 0 );
1357
1358 AIO_LOCK;
1359 entryp->flags &= ~AIO_COMPLETION;
1360 if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
1361 vm_map_t my_map;
1362
1363 my_map = entryp->aio_map;
1364 entryp->aio_map = VM_MAP_NULL;
1365 AIO_UNLOCK;
1366 aio_free_request( entryp, my_map );
1367 }
1368 else
1369 AIO_UNLOCK;
1370 }
1371 } /* for ( ;; ) */
1372
1373 /* NOT REACHED */
1374
1375 } /* aio_work_thread */
1376
1377
1378 /*
1379 * aio_get_some_work - get the next async IO request that is ready to be executed.
1380 * aio_fsync complicates matters a bit since we cannot do the fsync until all async
1381  * IO requests queued at the time the aio_fsync call came in have completed.
1382 * NOTE - AIO_LOCK must be held by caller
1383 */
1384
1385 static aio_workq_entry *
1386 aio_get_some_work( void )
1387 {
1388 aio_workq_entry *entryp;
1389
1390 /* pop some work off the work queue and add to our active queue */
1391 for ( entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
1392 entryp != NULL;
1393 entryp = TAILQ_NEXT( entryp, aio_workq_link ) ) {
1394
1395 if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1396 /* leave aio_fsync calls on the work queue if there are IO */
1397 /* requests on the active queue for the same file descriptor. */
1398 if ( aio_delay_fsync_request( entryp ) ) {
1399
1400 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) | DBG_FUNC_NONE,
1401 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1402 continue;
1403 }
1404 }
1405 break;
1406 }
1407
1408 if ( entryp != NULL ) {
1409 TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1410 aio_anchor.aio_async_workq_count--;
1411 TAILQ_INSERT_TAIL( &entryp->procp->aio_activeq, entryp, aio_workq_link );
1412 aio_anchor.aio_active_count++;
1413 entryp->procp->aio_active_count++;
1414 }
1415
1416 return( entryp );
1417
1418 } /* aio_get_some_work */
1419
1420
1421 /*
1422 * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed at
1423 * this time. Delay will happen when there are any active IOs for the same file
1424  * descriptor that were queued at the time the aio_fsync call was queued.
1425 * NOTE - AIO_LOCK must be held by caller
1426 */
1427 static boolean_t
1428 aio_delay_fsync_request( aio_workq_entry *entryp )
1429 {
1430 aio_workq_entry *my_entryp;
1431
1432 TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1433 if ( my_entryp->fsyncp != USER_ADDR_NULL &&
1434 entryp->uaiocbp == my_entryp->fsyncp &&
1435 entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1436 return( TRUE );
1437 }
1438 }
1439
1440 return( FALSE );
1441
1442 } /* aio_delay_fsync_request */
1443
1444
1445 /*
1446 * aio_queue_async_request - queue up an async IO request on our work queue then
1447 * wake up one of our worker threads to do the actual work. We get a reference
1448 * to our caller's user land map in order to keep it around while we are
1449 * processing the request.
1450 */
1451
1452 static int
1453 aio_queue_async_request( struct proc *procp, user_addr_t aiocbp, int kindOfIO )
1454 {
1455 aio_workq_entry *entryp;
1456 int result;
1457
1458 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1459 if ( entryp == NULL ) {
1460 result = EAGAIN;
1461 goto error_exit;
1462 }
1463 bzero( entryp, sizeof(*entryp) );
1464
1465 /* fill in the rest of the aio_workq_entry */
1466 entryp->procp = procp;
1467 entryp->uaiocbp = aiocbp;
1468 entryp->flags |= kindOfIO;
1469 entryp->aio_map = VM_MAP_NULL;
1470
1471 if ( !IS_64BIT_PROCESS(procp) ) {
1472 struct aiocb aiocb32;
1473
1474 result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
1475 if ( result == 0 )
1476 do_munge_aiocb( &aiocb32, &entryp->aiocb );
1477 } else
1478 result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1479
1480 if ( result != 0 ) {
1481 result = EAGAIN;
1482 goto error_exit;
1483 }
1484
1485 /* do some more validation on the aiocb and embedded file descriptor */
1486 result = aio_validate( entryp );
1487 if ( result != 0 )
1488 goto error_exit;
1489
1490 /* get a reference to the user land map in order to keep it around */
1491 entryp->aio_map = get_task_map( procp->task );
1492 vm_map_reference( entryp->aio_map );
1493
1494 AIO_LOCK;
1495
1496 if ( is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
1497 AIO_UNLOCK;
1498 result = EAGAIN;
1499 goto error_exit;
1500 }
1501
1502 /* check our aio limits to throttle bad or rude user land behavior */
1503 if ( aio_get_all_queues_count( ) >= aio_max_requests ||
1504 aio_get_process_count( procp ) >= aio_max_requests_per_process ) {
1505 AIO_UNLOCK;
1506 result = EAGAIN;
1507 goto error_exit;
1508 }
1509
1510 /*
1511 * aio_fsync calls sync up all async IO requests queued at the time
1512 * the aio_fsync call was made. So we mark each currently queued async
1513  * IO with a matching file descriptor as one that must complete before we do the
1514 * fsync. We set the fsyncp field of each matching async IO
1515 * request with the aiocb pointer passed in on the aio_fsync call to
1516 * know which IOs must complete before we process the aio_fsync call.
1517 */
1518 if ( (kindOfIO & AIO_FSYNC) != 0 )
1519 aio_mark_requests( entryp );
1520
1521 /* queue up on our aio asynchronous work queue */
1522 TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1523 aio_anchor.aio_async_workq_count++;
1524
1525 wakeup_one( (caddr_t) &aio_anchor.aio_async_workq );
1526 AIO_UNLOCK;
1527
1528 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
1529 (int)procp, (int)aiocbp, 0, 0, 0 );
1530
1531 return( 0 );
1532
1533 error_exit:
1534 if ( entryp != NULL ) {
1535 /* this entry has not been queued up so no worries about unlocked */
1536 /* state and aio_map */
1537 aio_free_request( entryp, entryp->aio_map );
1538 }
1539
1540 return( result );
1541
1542 } /* aio_queue_async_request */
1543
1544
1545 /*
1546 * lio_create_async_entry - allocate an aio_workq_entry and fill it in.
1547 * If all goes well return 0 and pass the aio_workq_entry pointer back to
1548 * our caller. We get a reference to our caller's user land map in order to keep
1549 * it around while we are processing the request.
1550  * lio_listio calls behave differently at completion: they do completion notification
1551  * when all async IO requests have completed.  We use group_tag to tag IO requests
1552  * that behave in this delayed notification manner.
1553 */
1554
1555 static int
1556 lio_create_async_entry( struct proc *procp, user_addr_t aiocbp,
1557 user_addr_t sigp, long group_tag,
1558 aio_workq_entry **entrypp )
1559 {
1560 aio_workq_entry *entryp;
1561 int result;
1562
1563 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1564 if ( entryp == NULL ) {
1565 result = EAGAIN;
1566 goto error_exit;
1567 }
1568 bzero( entryp, sizeof(*entryp) );
1569
1570 /* fill in the rest of the aio_workq_entry */
1571 entryp->procp = procp;
1572 entryp->uaiocbp = aiocbp;
1573 entryp->flags |= AIO_LIO;
1574 entryp->group_tag = group_tag;
1575 entryp->aio_map = VM_MAP_NULL;
1576
1577 if ( !IS_64BIT_PROCESS(procp) ) {
1578 struct aiocb aiocb32;
1579
1580 result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
1581 if ( result == 0 )
1582 do_munge_aiocb( &aiocb32, &entryp->aiocb );
1583 } else
1584 result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1585
1586 if ( result != 0 ) {
1587 result = EAGAIN;
1588 goto error_exit;
1589 }
1590
1591 /* look for lio_listio LIO_NOP requests and ignore them. */
1592 /* Not really an error, but we need to free our aio_workq_entry. */
1593 if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
1594 result = 0;
1595 goto error_exit;
1596 }
1597
1598 /* use sigevent passed in to lio_listio for each of our calls, but only */
1599 /* do completion notification after the last request completes. */
1600 if ( sigp != USER_ADDR_NULL ) {
1601 if ( !IS_64BIT_PROCESS(procp) ) {
1602 struct sigevent sigevent32;
1603
1604 result = copyin( sigp, &sigevent32, sizeof(sigevent32) );
1605 if ( result == 0 ) {
1606 /* also need to munge aio_sigevent since it contains pointers */
1607 /* special case here. since we do not know if sigev_value is an */
1608 /* int or a ptr we do NOT cast the ptr to a user_addr_t. This */
1609 /* means if we send this info back to user space we need to remember */
1610 /* sigev_value was not expanded for the 32-bit case. */
1611 /* NOTE - this does NOT affect us since we don't support sigev_value */
1612 /* yet in the aio context. */
1613 //LP64
1614 entryp->aiocb.aio_sigevent.sigev_notify = sigevent32.sigev_notify;
1615 entryp->aiocb.aio_sigevent.sigev_signo = sigevent32.sigev_signo;
1616 entryp->aiocb.aio_sigevent.sigev_value.size_equivalent.sival_int =
1617 sigevent32.sigev_value.sival_int;
1618 entryp->aiocb.aio_sigevent.sigev_notify_function =
1619 CAST_USER_ADDR_T(sigevent32.sigev_notify_function);
1620 entryp->aiocb.aio_sigevent.sigev_notify_attributes =
1621 CAST_USER_ADDR_T(sigevent32.sigev_notify_attributes);
1622 }
1623 } else
1624 result = copyin( sigp, &entryp->aiocb.aio_sigevent, sizeof(entryp->aiocb.aio_sigevent) );
1625
1626 if ( result != 0 ) {
1627 result = EAGAIN;
1628 goto error_exit;
1629 }
1630 }
1631
1632 /* do some more validation on the aiocb and embedded file descriptor */
1633 result = aio_validate( entryp );
1634 if ( result != 0 )
1635 goto error_exit;
1636
1637 /* get a reference to the user land map in order to keep it around */
1638 entryp->aio_map = get_task_map( procp->task );
1639 vm_map_reference( entryp->aio_map );
1640
1641 *entrypp = entryp;
1642 return( 0 );
1643
1644 error_exit:
1645 if ( entryp != NULL )
1646 zfree( aio_workq_zonep, entryp );
1647
1648 return( result );
1649
1650 } /* lio_create_async_entry */
1651
1652
1653 /*
1654 * aio_mark_requests - aio_fsync calls synchronize file data for all queued async IO
1655 * requests at the moment the aio_fsync call is queued. We use aio_workq_entry.fsyncp
1656 * to mark each async IO that must complete before the fsync is done. We use the uaiocbp
1657 * field from the aio_fsync call as the aio_workq_entry.fsyncp in marked requests.
1658 * NOTE - AIO_LOCK must be held by caller
1659 */
1660
1661 static void
1662 aio_mark_requests( aio_workq_entry *entryp )
1663 {
1664 aio_workq_entry *my_entryp;
1665
1666 TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1667 if ( entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1668 my_entryp->fsyncp = entryp->uaiocbp;
1669 }
1670 }
1671
1672 TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
1673 if ( entryp->procp == my_entryp->procp &&
1674 entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1675 my_entryp->fsyncp = entryp->uaiocbp;
1676 }
1677 }
1678
1679 } /* aio_mark_requests */
1680
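/*
 * Effect of the marking above, seen from user space (fd, wr, sync_cb are
 * hypothetical): a write queued before an aio_fsync on the same descriptor
 * gets its fsyncp set, so the fsync stays on the work queue (see
 * aio_delay_fsync_request) until that write has completed.
 *
 *	aio_write( &wr );			/* wr.aio_fildes == fd, queued first */
 *	aio_fsync( O_SYNC, &sync_cb );		/* sync_cb.aio_fildes == fd; will not */
 *						/* run until wr is done */
 */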
1681
1682 /*
1683 * lio_create_sync_entry - allocate an aio_workq_entry and fill it in.
1684 * If all goes well return 0 and pass the aio_workq_entry pointer back to
1685 * our caller.
1686  * lio_listio calls behave differently at completion: they do completion notification
1687  * when all async IO requests have completed.  We use group_tag to tag IO requests
1688  * that behave in this delayed notification manner.
1689 */
1690
1691 static int
1692 lio_create_sync_entry( struct proc *procp, user_addr_t aiocbp,
1693 long group_tag, aio_workq_entry **entrypp )
1694 {
1695 aio_workq_entry *entryp;
1696 int result;
1697
1698 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1699 if ( entryp == NULL ) {
1700 result = EAGAIN;
1701 goto error_exit;
1702 }
1703 bzero( entryp, sizeof(*entryp) );
1704
1705 /* fill in the rest of the aio_workq_entry */
1706 entryp->procp = procp;
1707 entryp->uaiocbp = aiocbp;
1708 entryp->flags |= AIO_LIO;
1709 entryp->group_tag = group_tag;
1710 entryp->aio_map = VM_MAP_NULL;
1711
1712 if ( !IS_64BIT_PROCESS(procp) ) {
1713 struct aiocb aiocb32;
1714
1715 result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
1716 if ( result == 0 )
1717 do_munge_aiocb( &aiocb32, &entryp->aiocb );
1718 } else
1719 result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1720
1721 if ( result != 0 ) {
1722 result = EAGAIN;
1723 goto error_exit;
1724 }
1725
1726 /* look for lio_listio LIO_NOP requests and ignore them. */
1727 /* Not really an error, but we need to free our aio_workq_entry. */
1728 if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
1729 result = 0;
1730 goto error_exit;
1731 }
1732
1733 result = aio_validate( entryp );
1734 if ( result != 0 ) {
1735 goto error_exit;
1736 }
1737
1738 *entrypp = entryp;
1739 return( 0 );
1740
1741 error_exit:
1742 if ( entryp != NULL )
1743 zfree( aio_workq_zonep, entryp );
1744
1745 return( result );
1746
1747 } /* lio_create_sync_entry */
1748
1749
1750 /*
1751 * aio_free_request - remove our reference on the user land map and
1752 * free the work queue entry resources.
1753  * We are not holding the lock here, so aio_map is passed in separately;
1754  * it was zeroed in the entry while we did hold the lock.
1755 */
1756
1757 static int
1758 aio_free_request( aio_workq_entry *entryp, vm_map_t the_map )
1759 {
1760 /* remove our reference to the user land map. */
1761 if ( VM_MAP_NULL != the_map ) {
1762 vm_map_deallocate( the_map );
1763 }
1764
1765 zfree( aio_workq_zonep, entryp );
1766
1767 return( 0 );
1768
1769 } /* aio_free_request */
1770
1771
1772 /* aio_validate - validate the aiocb passed in by one of the aio syscalls.
1773 */
1774
1775 static int
1776 aio_validate( aio_workq_entry *entryp )
1777 {
1778 struct fileproc *fp;
1779 int flag;
1780 int result;
1781
1782 result = 0;
1783
1784 if ( (entryp->flags & AIO_LIO) != 0 ) {
1785 if ( entryp->aiocb.aio_lio_opcode == LIO_READ )
1786 entryp->flags |= AIO_READ;
1787 else if ( entryp->aiocb.aio_lio_opcode == LIO_WRITE )
1788 entryp->flags |= AIO_WRITE;
1789 else if ( entryp->aiocb.aio_lio_opcode == LIO_NOP )
1790 return( 0 );
1791 else
1792 return( EINVAL );
1793 }
1794
1795 flag = FREAD;
1796 if ( (entryp->flags & (AIO_WRITE | AIO_FSYNC)) != 0 ) {
1797 flag = FWRITE;
1798 }
1799
1800 if ( (entryp->flags & (AIO_READ | AIO_WRITE)) != 0 ) {
1801 // LP64todo - does max value for aio_nbytes need to grow?
1802 if ( entryp->aiocb.aio_nbytes > INT_MAX ||
1803 entryp->aiocb.aio_buf == USER_ADDR_NULL ||
1804 entryp->aiocb.aio_offset < 0 )
1805 return( EINVAL );
1806 }
1807
1808 /* validate aiocb.aio_sigevent. at this point we only support sigev_notify
1809 * equal to SIGEV_SIGNAL or SIGEV_NONE. this means sigev_value,
1810 * sigev_notify_function, and sigev_notify_attributes are ignored.
1811 */
1812 if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ) {
1813 int signum;
1814 /* make sure we have a valid signal number */
1815 signum = entryp->aiocb.aio_sigevent.sigev_signo;
1816 if ( signum <= 0 || signum >= NSIG ||
1817 signum == SIGKILL || signum == SIGSTOP )
1818 return (EINVAL);
1819 }
1820 else if ( entryp->aiocb.aio_sigevent.sigev_notify != SIGEV_NONE )
1821 return (EINVAL);
1822
1823 /* validate the file descriptor and that the file was opened
1824 * for the appropriate read / write access.
1825 */
1826 proc_fdlock(entryp->procp);
1827
1828 result = fp_lookup( entryp->procp, entryp->aiocb.aio_fildes, &fp , 1);
1829 if ( result == 0 ) {
1830 if ( (fp->f_fglob->fg_flag & flag) == 0 ) {
1831 /* we don't have read or write access */
1832 result = EBADF;
1833 }
1834 else if ( fp->f_fglob->fg_type != DTYPE_VNODE ) {
1835 /* this is not a file */
1836 result = ESPIPE;
1837 } else
1838 fp->f_flags |= FP_AIOISSUED;
1839
1840 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp , 1);
1841 }
1842 else {
1843 result = EBADF;
1844 }
1845
1846 proc_fdunlock(entryp->procp);
1847
1848 return( result );
1849
1850 } /* aio_validate */
1851
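/*
 * Illustration only (editor's sketch, not part of the original file):
 * the constraints aio_validate() enforces, shown as the shape of an
 * aiocb that would pass for an async read.  This is user-space style
 * setup; "my_fd", "my_buffer", and "my_buffer_size" are hypothetical,
 * and the descriptor must also refer to a vnode (ESPIPE otherwise).
 */
#if 0	/* sketch - not compiled */
	struct aiocb	my_cb;

	my_cb.aio_fildes = my_fd;			/* open with FREAD for reads            */
	my_cb.aio_buf = my_buffer;			/* must not be NULL                     */
	my_cb.aio_nbytes = my_buffer_size;		/* must not exceed INT_MAX              */
	my_cb.aio_offset = 0;				/* must not be negative                 */
	my_cb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;	/* or SIGEV_NONE                        */
	my_cb.aio_sigevent.sigev_signo = SIGUSR1;	/* valid signal, not SIGKILL or SIGSTOP */
#endif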
1852
1853 /*
1854 * aio_get_process_count - runs through our queues that hold outstanding
1855 * async IO requests and totals up the number of requests for the given
1856 * process.
1857 * NOTE - caller must hold aio lock!
1858 */
1859
1860 static int
1861 aio_get_process_count( struct proc *procp )
1862 {
1863 aio_workq_entry *entryp;
1864 int count;
1865
1866 /* begin with count of completed async IO requests for this process */
1867 count = procp->aio_done_count;
1868
1869 /* add in count of active async IO requests for this process */
1870 count += procp->aio_active_count;
1871
1872 /* look for matches on our queue of asynchronous todo work */
1873 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
1874 if ( procp == entryp->procp ) {
1875 count++;
1876 }
1877 }
1878
1879 /* look for matches on our queue of synchronous todo work */
1880 TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
1881 if ( procp == entryp->procp ) {
1882 count++;
1883 }
1884 }
1885
1886 return( count );
1887
1888 } /* aio_get_process_count */
1889
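/*
 * Illustration only (editor's sketch, not part of the original file):
 * the kind of admission check this count supports when a new request is
 * queued.  The limit name below is an assumption based on the sysctl
 * naming used by this subsystem (aio_max_requests_per_process); the
 * real check lives in the request-queueing paths earlier in the file.
 */
#if 0	/* sketch - not compiled */
	if ( aio_get_process_count( procp ) >= aio_max_requests_per_process ) {
		/* too many outstanding requests for this process */
		return( EAGAIN );
	}
#endif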
1890
1891 /*
1892 * aio_get_all_queues_count - get total number of entries on all aio work queues.
1893 * NOTE - caller must hold aio lock!
1894 */
1895
1896 static int
1897 aio_get_all_queues_count( void )
1898 {
1899 int count;
1900
1901 count = aio_anchor.aio_async_workq_count;
1902 count += aio_anchor.lio_sync_workq_count;
1903 count += aio_anchor.aio_active_count;
1904 count += aio_anchor.aio_done_count;
1905
1906 return( count );
1907
1908 } /* aio_get_all_queues_count */
1909
1910
1911 /*
1912 * do_aio_completion. Handle async IO completion.
1913 */
1914
1915 static void
1916 do_aio_completion( aio_workq_entry *entryp )
1917 {
1918 /* signal user land process if appropriate */
1919 if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
1920 (entryp->flags & AIO_DISABLE) == 0 ) {
1921
1922 /*
1923 * if group_tag is non zero then make sure this is the last IO request
1924 * in the group before we signal.
1925 */
1926 if ( entryp->group_tag == 0 ||
1927 (entryp->group_tag != 0 && aio_last_group_io( entryp )) ) {
1928 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) | DBG_FUNC_NONE,
1929 (int)entryp->procp, (int)entryp->uaiocbp,
1930 entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 );
1931
1932 psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );
1933 return;
1934 }
1935 }
1936
1937 /*
1938 * need to handle case where a process is trying to exit, exec, or close
1939 * and is currently waiting for active aio requests to complete. If
1940 * AIO_WAITING is set then we need to look to see if there are any
1941 * other requests in the active queue for this process. If there are
1942 * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel. If
1943 * there are some still active then do nothing - we only want to wakeup
1944 * when all active aio requests for the process are complete.
1945 */
1946 if ( (entryp->flags & AIO_WAITING) != 0 ) {
1947 int active_requests;
1948
1949 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
1950 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1951
1952 AIO_LOCK;
1953 active_requests = aio_active_requests_for_process( entryp->procp );
1954 //AIO_UNLOCK;
1955 if ( active_requests < 1 ) {
1956 /* no active aio requests for this process, continue exiting */
1957 wakeup_one( (caddr_t) &entryp->procp->AIO_CLEANUP_SLEEP_CHAN );
1958
1959 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
1960 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1961 }
1962 AIO_UNLOCK;
1963 return;
1964 }
1965
1966 /*
1967 * aio_suspend case when a signal was not requested. In that scenario we
1968 * are sleeping on the AIO_SUSPEND_SLEEP_CHAN channel.
1969 * NOTE - the assumption here is that this wakeup call is inexpensive.
1970 * we really only need to do this when an aio_suspend call is pending.
1971 * If we find the wakeup call should be avoided we could mark the
1972 * async IO requests given in the list provided by aio_suspend and only
1973 * call wakeup for them. If we do mark them we should unmark them after
1974 * the aio_suspend wakes up.
1975 */
1976 AIO_LOCK;
1977 wakeup_one( (caddr_t) &entryp->procp->AIO_SUSPEND_SLEEP_CHAN );
1978 AIO_UNLOCK;
1979
1980 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE,
1981 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1982
1983 return;
1984
1985 } /* do_aio_completion */
1986
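/*
 * Illustration only (editor's sketch, not part of the original file):
 * the other half of the AIO_WAITING handshake above.  An exit / exec /
 * close path marks its entries AIO_WAITING and then blocks on the
 * process' AIO_CLEANUP_SLEEP_CHAN until the wakeup_one() in
 * do_aio_completion() fires.  A minimal sketch assuming an
 * assert_wait / thread_block style wait; the real cleanup code is
 * earlier in this file.
 */
#if 0	/* sketch - not compiled */
	AIO_LOCK;
	while ( aio_active_requests_for_process( p ) > 0 ) {
		assert_wait( (event_t) &p->AIO_CLEANUP_SLEEP_CHAN, THREAD_UNINT );
		AIO_UNLOCK;
		thread_block( THREAD_CONTINUE_NULL );
		AIO_LOCK;
	}
	AIO_UNLOCK;
#endif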
1987
1988 /*
1989 * aio_last_group_io - checks to see if this is the last unfinished IO request
1990 * for the given group_tag. Returns TRUE if there are no other active IO
1991 * requests for this group or FALSE if there are active IO requests.
1992 * NOTE - AIO_LOCK must be held by caller
1993 */
1994
1995 static boolean_t
1996 aio_last_group_io( aio_workq_entry *entryp )
1997 {
1998 aio_workq_entry *my_entryp;
1999
2000 /* look for matches on our queue of active async IO requests */
2001 TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
2002 if ( my_entryp->group_tag == entryp->group_tag )
2003 return( FALSE );
2004 }
2005
2006 /* look for matches on our queue of asynchronous todo work */
2007 TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
2008 if ( my_entryp->group_tag == entryp->group_tag )
2009 return( FALSE );
2010 }
2011
2012 /* look for matches on our queue of synchronous todo work */
2013 TAILQ_FOREACH( my_entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
2014 if ( my_entryp->group_tag == entryp->group_tag )
2015 return( FALSE );
2016 }
2017
2018 return( TRUE );
2019
2020 } /* aio_last_group_io */
2021
2022
2023 /*
2024 * do_aio_read
2025 */
2026 static int
2027 do_aio_read( aio_workq_entry *entryp )
2028 {
2029 struct fileproc *fp;
2030 int error;
2031
2032 if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp , 0)) )
2033 return(error);
2034 if ( (fp->f_fglob->fg_flag & FREAD) == 0 ) {
2035 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2036 return(EBADF);
2037 }
2038 if ( fp != NULL ) {
2039 error = dofileread( entryp->procp, fp, entryp->aiocb.aio_fildes,
2040 entryp->aiocb.aio_buf,
2041 entryp->aiocb.aio_nbytes,
2042 entryp->aiocb.aio_offset, FOF_OFFSET,
2043 &entryp->returnval );
2044 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2045 }
2046 else {
2047 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2048 error = EBADF;
2049 }
2050
2051 return( error );
2052
2053 } /* do_aio_read */
2054
2055
2056 /*
2057 * do_aio_write
2058 */
2059 static int
2060 do_aio_write( aio_workq_entry *entryp )
2061 {
2062 struct fileproc *fp;
2063 int error;
2064
2065 if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp , 0)) )
2066 return(error);
2067 if ( (fp->f_fglob->fg_flag & FWRITE) == 0 ) {
2068 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2069 return(EBADF);
2070 }
2071 if ( fp != NULL ) {
2072 /* NB: tell dofilewrite the offset, and to use the proc cred */
2073 error = dofilewrite( entryp->procp,
2074 fp,
2075 entryp->aiocb.aio_fildes,
2076 entryp->aiocb.aio_buf,
2077 entryp->aiocb.aio_nbytes,
2078 entryp->aiocb.aio_offset,
2079 FOF_OFFSET | FOF_PCRED,
2080 &entryp->returnval);
2081
2082 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2083 }
2084 else {
2085 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2086 error = EBADF;
2087 }
2088
2089 return( error );
2090
2091 } /* do_aio_write */
2092
2093
2094 /*
2095 * aio_active_requests_for_process - return number of active async IO
2096 * requests for the given process.
2097 * NOTE - caller must hold aio lock!
2098 */
2099
2100 static int
2101 aio_active_requests_for_process( struct proc *procp )
2102 {
2103
2104 return( procp->aio_active_count );
2105
2106 } /* aio_active_requests_for_process */
2107
2108
2109 /*
2110 * do_aio_fsync
2111 */
2112 static int
2113 do_aio_fsync( aio_workq_entry *entryp )
2114 {
2115 struct vfs_context context;
2116 struct vnode *vp;
2117 struct fileproc *fp;
2118 int error;
2119
2120 /*
2121 * NOTE - we will not support AIO_DSYNC until fdatasync() is supported.
2122 * AIO_DSYNC is caught before we queue up a request and flagged as an error.
2123 * The following was shamelessly extracted from fsync() implementation.
2124 */
2125
2126 error = fp_getfvp( entryp->procp, entryp->aiocb.aio_fildes, &fp, &vp);
2127 if ( error == 0 ) {
2128 if ( (error = vnode_getwithref(vp)) ) {
2129 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2130 entryp->returnval = -1;
2131 return(error);
2132 }
2133 context.vc_proc = entryp->procp;
2134 context.vc_ucred = fp->f_fglob->fg_cred;
2135
2136 error = VNOP_FSYNC( vp, MNT_WAIT, &context);
2137
2138 (void)vnode_put(vp);
2139
2140 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2141 }
2142 if ( error != 0 )
2143 entryp->returnval = -1;
2144
2145 return( error );
2146
2147 } /* do_aio_fsync */
2148
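/*
 * Illustration only (editor's sketch, not part of the original file):
 * the user-visible consequence of the AIO_DSYNC note above.  Of the two
 * POSIX aio_fsync() operation codes, only O_SYNC is serviced by this
 * version; O_DSYNC is rejected before a request is ever queued.
 * "my_cb" is a hypothetical user-space aiocb.
 */
#if 0	/* sketch - not compiled, user-space style */
	aio_fsync( O_SYNC, &my_cb );	/* accepted; ends up in do_aio_fsync() above   */
	aio_fsync( O_DSYNC, &my_cb );	/* rejected; fdatasync() semantics unsupported */
#endif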
2149
2150 /*
2151 * is_already_queued - runs through our queues to see if the given
2152 * aiocbp / process is there. Returns TRUE if there is a match
2153 * on any of our aio queues.
2154 * NOTE - callers must hold aio lock!
2155 */
2156
2157 static boolean_t
2158 is_already_queued( struct proc *procp,
2159 user_addr_t aiocbp )
2160 {
2161 aio_workq_entry *entryp;
2162 boolean_t result;
2163
2164 result = FALSE;
2165
2166 /* look for matches on our queue of async IO requests that have completed */
2167 TAILQ_FOREACH( entryp, &procp->aio_doneq, aio_workq_link ) {
2168 if ( aiocbp == entryp->uaiocbp ) {
2169 result = TRUE;
2170 goto ExitThisRoutine;
2171 }
2172 }
2173
2174 /* look for matches on our queue of active async IO requests */
2175 TAILQ_FOREACH( entryp, &procp->aio_activeq, aio_workq_link ) {
2176 if ( aiocbp == entryp->uaiocbp ) {
2177 result = TRUE;
2178 goto ExitThisRoutine;
2179 }
2180 }
2181
2182 /* look for matches on our queue of asynchronous todo work */
2183 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
2184 if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
2185 result = TRUE;
2186 goto ExitThisRoutine;
2187 }
2188 }
2189
2190 /* look for matches on our queue of synchronous todo work */
2191 TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
2192 if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
2193 result = TRUE;
2194 goto ExitThisRoutine;
2195 }
2196 }
2197
2198 ExitThisRoutine:
2199 return( result );
2200
2201 } /* is_already_queued */
2202
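/*
 * Illustration only (editor's sketch, not part of the original file):
 * how the submission paths are expected to use this check - a second
 * submission of an aiocbp that is already on one of our queues is
 * refused rather than queued twice.  Treating the duplicate as EAGAIN
 * here is an assumption made for the sketch.
 */
#if 0	/* sketch - not compiled */
	if ( is_already_queued( procp, aiocbp ) == TRUE ) {
		return( EAGAIN );	/* this aiocbp is already in flight for this process */
	}
#endif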
2203
2204 /*
2205 * aio initialization
2206 */
2207 __private_extern__ void
2208 aio_init( void )
2209 {
2210 int i;
2211
2212 aio_lock_grp_attr = lck_grp_attr_alloc_init();
2213 aio_lock_grp = lck_grp_alloc_init("aio", aio_lock_grp_attr);
2214 aio_lock_attr = lck_attr_alloc_init();
2215
2216 aio_lock = lck_mtx_alloc_init(aio_lock_grp, aio_lock_attr);
2217
2218 AIO_LOCK;
2219 TAILQ_INIT( &aio_anchor.aio_async_workq );
2220 TAILQ_INIT( &aio_anchor.lio_sync_workq );
2221 aio_anchor.aio_async_workq_count = 0;
2222 aio_anchor.lio_sync_workq_count = 0;
2223 aio_anchor.aio_active_count = 0;
2224 aio_anchor.aio_done_count = 0;
2225 AIO_UNLOCK;
2226
2227 i = sizeof( aio_workq_entry );
2228 aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );
2229
2230 _aio_create_worker_threads( aio_worker_threads );
2231
2232 return;
2233
2234 } /* aio_init */
2235
2236
2237 /*
2238 * aio worker threads created here.
2239 */
2240 __private_extern__ void
2241 _aio_create_worker_threads( int num )
2242 {
2243 int i;
2244
2245 /* create some worker threads to handle the async IO requests */
2246 for ( i = 0; i < num; i++ ) {
2247 thread_t myThread;
2248
2249 myThread = kernel_thread( kernel_task, aio_work_thread );
2250 if ( THREAD_NULL == myThread ) {
2251 printf( "%s - failed to create a work thread \n", __FUNCTION__ );
2252 }
2253 }
2254
2255 return;
2256
2257 } /* _aio_create_worker_threads */
2258
2259 /*
2260 * Return the current activation utask
2261 */
2262 task_t
2263 get_aiotask(void)
2264 {
2265 return ((struct uthread *)get_bsdthread_info(current_thread()))->uu_aio_task;
2266 }
2267
2268
2269 /*
2270 * In the case of an aiocb from a
2271 * 32-bit process we need to expand some longs and pointers to the correct
2272 * sizes in order to let downstream code always work on the same type of
2273 * aiocb (in our case that is a user_aiocb)
2274 */
2275 static void
2276 do_munge_aiocb( struct aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
2277 {
2278 the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2279 the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2280 the_user_aiocbp->aio_buf = CAST_USER_ADDR_T(my_aiocbp->aio_buf);
2281 the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2282 the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2283 the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
2284
2285 /* special case here. since we do not know if sigev_value is an */
2286 /* int or a ptr we do NOT cast the ptr to a user_addr_t. This */
2287 /* means if we send this info back to user space we need to remember */
2288 /* sigev_value was not expanded for the 32-bit case. */
2289 /* NOTE - this does NOT affect us since we don't support sigev_value */
2290 /* yet in the aio context. */
2291 //LP64
2292 the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2293 the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
2294 the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2295 my_aiocbp->aio_sigevent.sigev_value.sival_int;
2296 the_user_aiocbp->aio_sigevent.sigev_notify_function =
2297 CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function);
2298 the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2299 CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_attributes);
2300 }
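
/*
 * Illustration only (editor's sketch, not part of the original file):
 * the call pattern used by the entry-creation code above for 32-bit
 * callers - copy in the narrow aiocb first, then widen it into the
 * user_aiocb the rest of this file works with (compare
 * lio_create_sync_entry()).  Locals are hypothetical.
 */
#if 0	/* sketch - not compiled */
	if ( !IS_64BIT_PROCESS(procp) ) {
		struct aiocb	aiocb32;

		result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
		if ( result == 0 )
			do_munge_aiocb( &aiocb32, &entryp->aiocb );
	} else
		result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
#endif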