1 /*
2 * Copyright (c) 2003-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29
30 /*
31 * todo:
32 * 1) ramesh is looking into how to replace taking a reference on
33 * the user's map (vm_map_reference()) since it is believed that
34 * would not hold the process for us.
35 * 2) david is looking into a way for us to set the priority of the
36 * worker threads to match that of the user's thread when the
37 * async IO was queued.
38 */
39
40
41 /*
42 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
43 */
44
45 #include <sys/systm.h>
46 #include <sys/fcntl.h>
47 #include <sys/file_internal.h>
48 #include <sys/filedesc.h>
49 #include <sys/kernel.h>
50 #include <sys/vnode_internal.h>
51 #include <sys/malloc.h>
52 #include <sys/mount_internal.h>
53 #include <sys/param.h>
54 #include <sys/proc_internal.h>
55 #include <sys/sysctl.h>
56 #include <sys/unistd.h>
57 #include <sys/user.h>
58
59 #include <sys/aio_kern.h>
60 #include <sys/sysproto.h>
61
62 #include <machine/limits.h>
63
64 #include <mach/mach_types.h>
65 #include <kern/kern_types.h>
66 #include <kern/zalloc.h>
67 #include <kern/task.h>
68 #include <kern/sched_prim.h>
69
70 #include <vm/vm_map.h>
71
72 #include <sys/kdebug.h>
73 #define AIO_work_queued 1
74 #define AIO_worker_wake 2
75 #define AIO_completion_sig 3
76 #define AIO_completion_cleanup_wait 4
77 #define AIO_completion_cleanup_wake 5
78 #define AIO_completion_suspend_wake 6
79 #define AIO_fsync_delay 7
80 #define AIO_cancel 10
81 #define AIO_cancel_async_workq 11
82 #define AIO_cancel_sync_workq 12
83 #define AIO_cancel_activeq 13
84 #define AIO_cancel_doneq 14
85 #define AIO_fsync 20
86 #define AIO_read 30
87 #define AIO_write 40
88 #define AIO_listio 50
89 #define AIO_error 60
90 #define AIO_error_val 61
91 #define AIO_error_activeq 62
92 #define AIO_error_workq 63
93 #define AIO_return 70
94 #define AIO_return_val 71
95 #define AIO_return_activeq 72
96 #define AIO_return_workq 73
97 #define AIO_exec 80
98 #define AIO_exit 90
99 #define AIO_exit_sleep 91
100 #define AIO_close 100
101 #define AIO_close_sleep 101
102 #define AIO_suspend 110
103 #define AIO_suspend_sleep 111
104 #define AIO_worker_thread 120
105
106 #if 0
107 #undef KERNEL_DEBUG
108 #define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
109 #endif
110
111 /*
112 * aio requests queue up on the aio_async_workq or lio_sync_workq (for
113 * lio_listio LIO_WAIT). Requests then move to the per process aio_activeq
114 * (proc.aio_activeq) when one of our worker threads starts the IO.
115 * And finally, requests move to the per process aio_doneq (proc.aio_doneq)
116 * when the IO request completes. The request remains on aio_doneq until
117 * the user process calls aio_return or the process exits; either way, that is our
118 * trigger to release aio resources.
119 */
120 struct aio_anchor_cb
121 {
122 int aio_async_workq_count; /* entries on aio_async_workq */
123 int lio_sync_workq_count; /* entries on lio_sync_workq */
124 int aio_active_count; /* entries on all active queues (proc.aio_activeq) */
125 int aio_done_count; /* entries on all done queues (proc.aio_doneq) */
126 TAILQ_HEAD( , aio_workq_entry ) aio_async_workq;
127 TAILQ_HEAD( , aio_workq_entry ) lio_sync_workq;
128 };
129 typedef struct aio_anchor_cb aio_anchor_cb;
130
131
132 /*
133 * Notes on aio sleep / wake channels.
134 * We currently pick a couple of fields within the proc structure to use as
135 * sleep channels that do not collide with any other kernel routines.
136 * At this time, for binary compatibility reasons, we cannot create new proc fields.
137 */
138 #define AIO_SUSPEND_SLEEP_CHAN p_estcpu
139 #define AIO_CLEANUP_SLEEP_CHAN p_pctcpu
140
141
142 /*
143 * async IO locking macros used to protect critical sections.
144 */
145 #define AIO_LOCK lck_mtx_lock(aio_lock)
146 #define AIO_UNLOCK lck_mtx_unlock(aio_lock)
147
148
149 /*
150 * LOCAL PROTOTYPES
151 */
152 static int aio_active_requests_for_process( struct proc *procp );
153 static boolean_t aio_delay_fsync_request( aio_workq_entry *entryp );
154 static int aio_free_request( aio_workq_entry *entryp, vm_map_t the_map );
155 static int aio_get_all_queues_count( void );
156 static int aio_get_process_count( struct proc *procp );
157 static aio_workq_entry * aio_get_some_work( void );
158 static boolean_t aio_last_group_io( aio_workq_entry *entryp );
159 static void aio_mark_requests( aio_workq_entry *entryp );
160 static int aio_queue_async_request( struct proc *procp,
161 user_addr_t aiocbp,
162 int kindOfIO );
163 static int aio_validate( aio_workq_entry *entryp );
164 static void aio_work_thread( void );
165 static int do_aio_cancel( struct proc *p,
166 int fd,
167 user_addr_t aiocbp,
168 boolean_t wait_for_completion,
169 boolean_t disable_notification );
170 static void do_aio_completion( aio_workq_entry *entryp );
171 static int do_aio_fsync( aio_workq_entry *entryp );
172 static int do_aio_read( aio_workq_entry *entryp );
173 static int do_aio_write( aio_workq_entry *entryp );
174 static void do_munge_aiocb( struct aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
175 static boolean_t is_already_queued( struct proc *procp,
176 user_addr_t aiocbp );
177 static int lio_create_async_entry( struct proc *procp,
178 user_addr_t aiocbp,
179 user_addr_t sigp,
180 long group_tag,
181 aio_workq_entry **entrypp );
182 static int lio_create_sync_entry( struct proc *procp,
183 user_addr_t aiocbp,
184 long group_tag,
185 aio_workq_entry **entrypp );
186
187
188 /*
189 * EXTERNAL PROTOTYPES
190 */
191
192 /* in ...bsd/kern/sys_generic.c */
193 extern int dofileread( struct proc *p, struct fileproc *fp, int fd,
194 user_addr_t bufp, user_size_t nbyte,
195 off_t offset, int flags, user_ssize_t *retval );
196 extern int dofilewrite( struct proc *p, struct fileproc *fp, int fd,
197 user_addr_t bufp, user_size_t nbyte, off_t offset,
198 int flags, user_ssize_t *retval );
199
200 /*
201 * aio external global variables.
202 */
203 extern int aio_max_requests; /* AIO_MAX - configurable */
204 extern int aio_max_requests_per_process; /* AIO_PROCESS_MAX - configurable */
205 extern int aio_worker_threads; /* AIO_THREAD_COUNT - configurable */
206
207
208 /*
209 * aio static variables.
210 */
211 static aio_anchor_cb aio_anchor;
212 static lck_mtx_t * aio_lock;
213 static lck_grp_t * aio_lock_grp;
214 static lck_attr_t * aio_lock_attr;
215 static lck_grp_attr_t * aio_lock_grp_attr;
216 static struct zone *aio_workq_zonep;
217
218
219
220
221 /*
222 * aio_cancel - attempt to cancel one or more async IO requests currently
223 * outstanding against file descriptor uap->fd. If uap->aiocbp is not
224 * NULL then only one specific IO is cancelled (if possible). If uap->aiocbp
225 * is NULL then all outstanding async IO requests for the given file
226 * descriptor are cancelled (if possible).
227 */
228
229 int
230 aio_cancel( struct proc *p, struct aio_cancel_args *uap, int *retval )
231 {
232 struct user_aiocb my_aiocb;
233 int result;
234
235 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START,
236 (int)p, (int)uap->aiocbp, 0, 0, 0 );
237
238 /* quick check to see if there are any async IO requests queued up */
239 AIO_LOCK;
240 result = aio_get_all_queues_count( );
241 AIO_UNLOCK;
242 if ( result < 1 ) {
243 result = EBADF;
244 goto ExitRoutine;
245 }
246
247 *retval = -1;
248 if ( uap->aiocbp != USER_ADDR_NULL ) {
249 if ( !IS_64BIT_PROCESS(p) ) {
250 struct aiocb aiocb32;
251
252 result = copyin( uap->aiocbp, &aiocb32, sizeof(aiocb32) );
253 if ( result == 0 )
254 do_munge_aiocb( &aiocb32, &my_aiocb );
255 } else
256 result = copyin( uap->aiocbp, &my_aiocb, sizeof(my_aiocb) );
257
258 if ( result != 0 ) {
259 result = EAGAIN;
260 goto ExitRoutine;
261 }
262
263 /* NOTE - POSIX standard says a mismatch between the file */
264 /* descriptor passed in and the file descriptor embedded in */
265 /* the aiocb causes unspecified results. We return EBADF in */
266 /* that situation. */
267 if ( uap->fd != my_aiocb.aio_fildes ) {
268 result = EBADF;
269 goto ExitRoutine;
270 }
271 }
272 result = do_aio_cancel( p, uap->fd, uap->aiocbp, FALSE, FALSE );
273
274 if ( result != -1 ) {
275 *retval = result;
276 result = 0;
277 goto ExitRoutine;
278 }
279
280 result = EBADF;
281
282 ExitRoutine:
283 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END,
284 (int)p, (int)uap->aiocbp, result, 0, 0 );
285
286 return( result );
287
288 } /* aio_cancel */
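
/*
 * Illustrative user-space sketch (assumption: an open descriptor "fd" with
 * async IO already queued against it; not part of this file). It shows the
 * three result values aio_cancel() can hand back to the caller.
 *
 *     #include <aio.h>
 *     #include <stdio.h>
 *
 *     static void
 *     cancel_all_for_fd( int fd )
 *     {
 *         switch ( aio_cancel( fd, NULL ) ) {      // NULL aiocb: cancel everything on fd
 *         case AIO_CANCELED:
 *             printf( "all queued requests were cancelled\n" );
 *             break;
 *         case AIO_NOTCANCELED:
 *             printf( "some requests are active and could not be cancelled\n" );
 *             break;
 *         case AIO_ALLDONE:
 *             printf( "all requests had already completed\n" );
 *             break;
 *         default:
 *             perror( "aio_cancel" );              // -1 with errno set (e.g. EBADF)
 *         }
 *     }
 */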
289
290
291 /*
292 * _aio_close - internal function used to clean up async IO requests for
293 * a file descriptor that is closing.
294 * THIS MAY BLOCK.
295 */
296
297 __private_extern__ void
298 _aio_close( struct proc *p, int fd )
299 {
300 int error, count;
301
302 /* quick check to see if there are any async IO requests queued up */
303 AIO_LOCK;
304 count = aio_get_all_queues_count( );
305 AIO_UNLOCK;
306 if ( count < 1 )
307 return;
308
309 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_START,
310 (int)p, fd, 0, 0, 0 );
311
312 /* cancel all async IO requests on our todo queues for this file descriptor */
313 error = do_aio_cancel( p, fd, 0, TRUE, FALSE );
314 if ( error == AIO_NOTCANCELED ) {
315 /*
316 * AIO_NOTCANCELED is returned when we find an aio request for this process
317 * and file descriptor on the active async IO queue. Active requests cannot
318 * be cancelled so we must wait for them to complete. We will get a special
319 * wake up call on our channel used to sleep for ALL active requests to
320 * complete. This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
321 * when we must wait for all active aio requests.
322 */
323
324 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) | DBG_FUNC_NONE,
325 (int)p, fd, 0, 0, 0 );
326
327 tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_close", 0 );
328 }
329
330 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_END,
331 (int)p, fd, 0, 0, 0 );
332
333 return;
334
335 } /* _aio_close */
336
337
338 /*
339 * aio_error - return the error status associated with the async IO
340 * request referred to by uap->aiocbp. The error status is the errno
341 * value that would be set by the corresponding IO request (read, write,
342 * fdatasync, or fsync).
343 */
344
345 int
346 aio_error( struct proc *p, struct aio_error_args *uap, int *retval )
347 {
348 aio_workq_entry *entryp;
349 int error;
350
351 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_START,
352 (int)p, (int)uap->aiocbp, 0, 0, 0 );
353
354 AIO_LOCK;
355
356 /* quick check to see if there are any async IO requests queued up */
357 if ( aio_get_all_queues_count( ) < 1 ) {
358 error = EINVAL;
359 goto ExitRoutine;
360 }
361
362 /* look for a match on our queue of async IO requests that have completed */
363 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
364 if ( entryp->uaiocbp == uap->aiocbp ) {
365 *retval = entryp->errorval;
366 error = 0;
367 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) | DBG_FUNC_NONE,
368 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
369 goto ExitRoutine;
370 }
371 }
372
373 /* look for a match on our queue of active async IO requests */
374 TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
375 if ( entryp->uaiocbp == uap->aiocbp ) {
376 *retval = EINPROGRESS;
377 error = 0;
378 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) | DBG_FUNC_NONE,
379 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
380 goto ExitRoutine;
381 }
382 }
383
384 /* look for a match on our queue of todo work */
385 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
386 if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
387 *retval = EINPROGRESS;
388 error = 0;
389 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_workq)) | DBG_FUNC_NONE,
390 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
391 goto ExitRoutine;
392 }
393 }
394 error = EINVAL;
395
396 ExitRoutine:
397 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_END,
398 (int)p, (int)uap->aiocbp, error, 0, 0 );
399 AIO_UNLOCK;
400
401 return( error );
402
403 } /* aio_error */
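
/*
 * Illustrative user-space sketch (assumption: "cbp" refers to a previously
 * queued request; not part of this file). It shows the three kinds of status
 * aio_error() reports: still in progress, completed OK, or a real errno value.
 *
 *     #include <aio.h>
 *     #include <errno.h>
 *     #include <stdio.h>
 *     #include <string.h>
 *
 *     static int
 *     poll_request( struct aiocb *cbp )
 *     {
 *         int status = aio_error( cbp );
 *         if ( status == EINPROGRESS )
 *             return 0;                            // not done yet, poll again later
 *         if ( status == 0 )
 *             printf( "request completed, %zd bytes\n", aio_return( cbp ) );
 *         else
 *             printf( "request failed: %s\n", strerror( status ) );
 *         return 1;                                // done (successfully or not)
 *     }
 */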
404
405
406 /*
407 * aio_fsync - asynchronously force all IO operations associated
408 * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
409 * queued at the time of the call to the synchronized completion state.
410 * NOTE - we do not support op O_DSYNC at this point since we do not support the
411 * fdatasync() call.
412 */
413
414 int
415 aio_fsync( struct proc *p, struct aio_fsync_args *uap, int *retval )
416 {
417 int error;
418 int fsync_kind;
419
420 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_START,
421 (int)p, (int)uap->aiocbp, uap->op, 0, 0 );
422
423 *retval = 0;
424 /* 0 := O_SYNC for binary backward compatibility with Panther */
425 if (uap->op == O_SYNC || uap->op == 0)
426 fsync_kind = AIO_FSYNC;
427 #if 0 // we don't support fdatasync() call yet
428 else if ( uap->op == O_DSYNC )
429 fsync_kind = AIO_DSYNC;
430 #endif
431 else {
432 *retval = -1;
433 error = EINVAL;
434 goto ExitRoutine;
435 }
436
437 error = aio_queue_async_request( p, uap->aiocbp, fsync_kind );
438 if ( error != 0 )
439 *retval = -1;
440
441 ExitRoutine:
442 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_END,
443 (int)p, (int)uap->aiocbp, error, 0, 0 );
444
445 return( error );
446
447 } /* aio_fsync */
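
/*
 * Illustrative user-space sketch (assumption: "fd" is an open file that has
 * had async writes queued; not part of this file). Only O_SYNC is accepted
 * by this kernel, matching the O_DSYNC note above.
 *
 *     #include <aio.h>
 *     #include <fcntl.h>
 *     #include <string.h>
 *
 *     static int
 *     queue_fsync( int fd, struct aiocb *cbp )
 *     {
 *         memset( cbp, 0, sizeof(*cbp) );
 *         cbp->aio_fildes = fd;
 *         return aio_fsync( O_SYNC, cbp );         // 0 if queued, -1 + errno on failure
 *     }
 */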
448
449
450 /* aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
451 * file descriptor (uap->aiocbp->aio_fildes) into the buffer
452 * (uap->aiocbp->aio_buf).
453 */
454
455 int
456 aio_read( struct proc *p, struct aio_read_args *uap, int *retval )
457 {
458 int error;
459
460 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_START,
461 (int)p, (int)uap->aiocbp, 0, 0, 0 );
462
463 *retval = 0;
464
465 error = aio_queue_async_request( p, uap->aiocbp, AIO_READ );
466 if ( error != 0 )
467 *retval = -1;
468
469 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_END,
470 (int)p, (int)uap->aiocbp, error, 0, 0 );
471
472 return( error );
473
474 } /* aio_read */
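
/*
 * Illustrative user-space sketch (assumptions: an open descriptor "fd" and a
 * caller-owned buffer; not part of this file). The aiocb and the buffer must
 * stay valid until aio_error()/aio_return() say the request is finished.
 *
 *     #include <aio.h>
 *     #include <string.h>
 *
 *     static int
 *     queue_read( int fd, struct aiocb *cbp, void *buf, size_t len, off_t offset )
 *     {
 *         memset( cbp, 0, sizeof(*cbp) );
 *         cbp->aio_fildes = fd;
 *         cbp->aio_buf = buf;
 *         cbp->aio_nbytes = len;
 *         cbp->aio_offset = offset;
 *         return aio_read( cbp );                  // 0 if queued, -1 + errno otherwise
 *     }
 */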
475
476
477 /*
478 * aio_return - return the return status associated with the async IO
479 * request referred to by uap->aiocbp. The return status is the value
480 * that would be returned by the corresponding IO request (read, write,
481 * fdatasync, or fsync). This is where we release kernel resources
482 * held for async IO call associated with the given aiocb pointer.
483 */
484
485 int
486 aio_return( struct proc *p, struct aio_return_args *uap, user_ssize_t *retval )
487 {
488 aio_workq_entry *entryp;
489 int error;
490 boolean_t lock_held;
491
492 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_START,
493 (int)p, (int)uap->aiocbp, 0, 0, 0 );
494
495 AIO_LOCK;
496 lock_held = TRUE;
497 *retval = 0;
498
499 /* quick check to see if there are any async IO requests queued up */
500 if ( aio_get_all_queues_count( ) < 1 ) {
501 error = EINVAL;
502 goto ExitRoutine;
503 }
504
505 /* look for a match on our queue of async IO requests that have completed */
506 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
507 if ( entryp->uaiocbp == uap->aiocbp ) {
508 TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
509 aio_anchor.aio_done_count--;
510 p->aio_done_count--;
511
512 *retval = entryp->returnval;
513
514 /* we cannot free requests that are still completing */
515 if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
516 vm_map_t my_map;
517
518 my_map = entryp->aio_map;
519 entryp->aio_map = VM_MAP_NULL;
520 AIO_UNLOCK;
521 lock_held = FALSE;
522 aio_free_request( entryp, my_map );
523 }
524 else
525 /* tell completion code to free this request */
526 entryp->flags |= AIO_DO_FREE;
527 error = 0;
528 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) | DBG_FUNC_NONE,
529 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
530 goto ExitRoutine;
531 }
532 }
533
534 /* look for a match on our queue of active async IO requests */
535 TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
536 if ( entryp->uaiocbp == uap->aiocbp ) {
537 error = EINPROGRESS;
538 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) | DBG_FUNC_NONE,
539 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
540 goto ExitRoutine;
541 }
542 }
543
544 /* look for a match on our queue of todo work */
545 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
546 if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
547 error = EINPROGRESS;
548 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_workq)) | DBG_FUNC_NONE,
549 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
550 goto ExitRoutine;
551 }
552 }
553 error = EINVAL;
554
555 ExitRoutine:
556 if ( lock_held )
557 AIO_UNLOCK;
558 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_END,
559 (int)p, (int)uap->aiocbp, error, 0, 0 );
560
561 return( error );
562
563 } /* aio_return */
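
/*
 * Illustrative user-space sketch (assumption: aio_error() has already
 * reported the request as no longer EINPROGRESS; not part of this file).
 * aio_return() must be called exactly once per request - it is what lets the
 * kernel release the entry held on the done queue above.
 *
 *     #include <aio.h>
 *     #include <stdio.h>
 *
 *     static ssize_t
 *     reap_request( struct aiocb *cbp )
 *     {
 *         ssize_t nbytes = aio_return( cbp );      // releases the done-queue entry
 *         printf( "IO returned %zd\n", nbytes );   // bytes transferred, or -1 on failure
 *         return nbytes;
 *     }
 */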
564
565
566 /*
567 * _aio_exec - internal function used to clean up async IO requests for
568 * a process that is going away due to exec(). We cancel any async IOs
569 * we can and wait for those already active. We also disable signaling
570 * for cancelled or active aio requests that complete.
571 * This routine MAY block!
572 */
573
574 __private_extern__ void
575 _aio_exec( struct proc *p )
576 {
577
578 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_START,
579 (int)p, 0, 0, 0, 0 );
580
581 _aio_exit( p );
582
583 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_END,
584 (int)p, 0, 0, 0, 0 );
585
586 return;
587
588 } /* _aio_exec */
589
590
591 /*
592 * _aio_exit - internal function used to clean up async IO requests for
593 * a process that is terminating (via exit() or exec() ). We cancel any async IOs
594 * we can and wait for those already active. We also disable signaling
595 * for cancelled or active aio requests that complete. This routine MAY block!
596 */
597
598 __private_extern__ void
599 _aio_exit( struct proc *p )
600 {
601 int error, count;
602 aio_workq_entry *entryp;
603
604 /* quick check to see if there are any async IO requests queued up */
605 AIO_LOCK;
606 count = aio_get_all_queues_count( );
607 AIO_UNLOCK;
608 if ( count < 1 ) {
609 return;
610 }
611
612 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_START,
613 (int)p, 0, 0, 0, 0 );
614
615 /*
616 * cancel async IO requests on the todo work queue and wait for those
617 * already active to complete.
618 */
619 error = do_aio_cancel( p, 0, 0, TRUE, TRUE );
620 if ( error == AIO_NOTCANCELED ) {
621 /*
622 * AIO_NOTCANCELED is returned when we find an aio request for this process
623 * on the active async IO queue. Active requests cannot be cancelled so we
624 * must wait for them to complete. We will get a special wake up call on
625 * our channel used to sleep for ALL active requests to complete. This sleep
626 * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
627 * active aio requests.
628 */
629
630 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) | DBG_FUNC_NONE,
631 (int)p, 0, 0, 0, 0 );
632
633 tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_exit", 0 );
634 }
635
636 /* release all aio resources used by this process */
637 AIO_LOCK;
638 entryp = TAILQ_FIRST( &p->aio_doneq );
639 while ( entryp != NULL ) {
640 aio_workq_entry *next_entryp;
641
642 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
643 TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
644 aio_anchor.aio_done_count--;
645 p->aio_done_count--;
646
647 /* we cannot free requests that are still completing */
648 if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
649 vm_map_t my_map;
650
651 my_map = entryp->aio_map;
652 entryp->aio_map = VM_MAP_NULL;
653 AIO_UNLOCK;
654 aio_free_request( entryp, my_map );
655
656 /* need to start over since aio_doneq may have been */
657 /* changed while we were away. */
658 AIO_LOCK;
659 entryp = TAILQ_FIRST( &p->aio_doneq );
660 continue;
661 }
662 else
663 /* tell completion code to free this request */
664 entryp->flags |= AIO_DO_FREE;
665 entryp = next_entryp;
666 }
667 AIO_UNLOCK;
668
669 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END,
670 (int)p, 0, 0, 0, 0 );
671
672 return;
673
674 } /* _aio_exit */
675
676
677 /*
678 * do_aio_cancel - cancel async IO requests (if possible). We get called by
679 * aio_cancel, close, and at exit.
680 * There are three modes of operation: 1) cancel all async IOs for a process
681 * (fd is 0 and aiocbp is NULL); 2) cancel all async IOs for a file descriptor
682 * (fd is > 0 and aiocbp is NULL); 3) cancel the one async IO associated with
683 * the given aiocbp.
684 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
685 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
686 * target async IO requests, and AIO_ALLDONE if all target async IO requests
687 * were already complete.
688 * WARNING - do not dereference aiocbp in this routine; it may point to user
689 * land data that has not been copied in (when called from aio_cancel() )
690 */
691
692 static int
693 do_aio_cancel( struct proc *p, int fd, user_addr_t aiocbp,
694 boolean_t wait_for_completion, boolean_t disable_notification )
695 {
696 aio_workq_entry *entryp;
697 int result;
698
699 result = -1;
700
701 /* look for a match on our queue of async todo work. */
702 AIO_LOCK;
703 entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
704 while ( entryp != NULL ) {
705 aio_workq_entry *next_entryp;
706
707 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
708 if ( p == entryp->procp ) {
709 if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
710 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
711 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
712 /* we found a match so we remove the entry from the */
713 /* todo work queue and place it on the done queue */
714 TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
715 aio_anchor.aio_async_workq_count--;
716 entryp->errorval = ECANCELED;
717 entryp->returnval = -1;
718 if ( disable_notification )
719 entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
720 result = AIO_CANCELED;
721
722 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) | DBG_FUNC_NONE,
723 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
724
725 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
726 aio_anchor.aio_done_count++;
727 p->aio_done_count++;
728 entryp->flags |= AIO_COMPLETION;
729 AIO_UNLOCK;
730
731 /* do completion processing for this request */
732 do_aio_completion( entryp );
733
734 AIO_LOCK;
735 entryp->flags &= ~AIO_COMPLETION;
736 if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
737 vm_map_t my_map;
738
739 my_map = entryp->aio_map;
740 entryp->aio_map = VM_MAP_NULL;
741 AIO_UNLOCK;
742 aio_free_request( entryp, my_map );
743 }
744 else
745 AIO_UNLOCK;
746
747 if ( aiocbp != USER_ADDR_NULL ) {
748 return( result );
749 }
750
751 /* need to start over since aio_async_workq may have been */
752 /* changed while we were away doing completion processing. */
753 AIO_LOCK;
754 entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
755 continue;
756 }
757 }
758 entryp = next_entryp;
759 } /* while... */
760
761 /*
762 * look for a match on our queue of synchronous todo work. This will
763 * be a rare occurrence but could happen if a process is terminated while
764 * processing a lio_listio call.
765 */
766 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
767 while ( entryp != NULL ) {
768 aio_workq_entry *next_entryp;
769
770 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
771 if ( p == entryp->procp ) {
772 if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
773 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
774 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
775 /* we found a match so we remove the entry from the */
776 /* todo work queue and place it on the done queue */
777 TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
778 aio_anchor.lio_sync_workq_count--;
779 entryp->errorval = ECANCELED;
780 entryp->returnval = -1;
781 if ( disable_notification )
782 entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
783 result = AIO_CANCELED;
784
785 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_sync_workq)) | DBG_FUNC_NONE,
786 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
787
788 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
789 aio_anchor.aio_done_count++;
790 p->aio_done_count++;
791 if ( aiocbp != USER_ADDR_NULL ) {
792 AIO_UNLOCK;
793 return( result );
794 }
795 }
796 }
797 entryp = next_entryp;
798 } /* while... */
799
800 /*
801 * look for a match on our queue of active async IO requests and
802 * return AIO_NOTCANCELED result.
803 */
804 TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
805 if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
806 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
807 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
808 result = AIO_NOTCANCELED;
809
810 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE,
811 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
812
813 if ( wait_for_completion )
814 entryp->flags |= AIO_WAITING; /* flag for special completion processing */
815 if ( disable_notification )
816 entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
817 if ( aiocbp != USER_ADDR_NULL ) {
818 AIO_UNLOCK;
819 return( result );
820 }
821 }
822 }
823
824 /*
825 * if we didn't find any matches on the todo or active queues then look for a
826 * match on our queue of async IO requests that have completed and if found
827 * return AIO_ALLDONE result.
828 */
829 if ( result == -1 ) {
830 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
831 if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
832 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
833 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
834 result = AIO_ALLDONE;
835
836 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE,
837 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
838
839 if ( aiocbp != USER_ADDR_NULL ) {
840 AIO_UNLOCK;
841 return( result );
842 }
843 }
844 }
845 }
846 AIO_UNLOCK;
847
848 return( result );
849
850 } /* do_aio_cancel */
851
852
853 /*
854 * aio_suspend - suspend the calling thread until at least one of the async
855 * IO operations referenced by uap->aiocblist has completed, until a signal
856 * interrupts the function, or the optional uap->timeoutp time interval has
857 * passed.
858 * Returns 0 if one or more async IOs have completed, else -1 with errno
859 * set appropriately: EAGAIN if the timeout elapses or EINTR if a signal
860 * wakes us up.
861 */
862
863 int
864 aio_suspend( struct proc *p, struct aio_suspend_args *uap, int *retval )
865 {
866 int error;
867 int i, count;
868 uint64_t abstime;
869 struct user_timespec ts;
870 aio_workq_entry *entryp;
871 user_addr_t *aiocbpp;
872
873 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START,
874 (int)p, uap->nent, 0, 0, 0 );
875
876 *retval = -1;
877 abstime = 0;
878 aiocbpp = NULL;
879
880 /* quick check to see if there are any async IO requests queued up */
881 AIO_LOCK;
882 count = aio_get_all_queues_count( );
883 AIO_UNLOCK;
884 if ( count < 1 ) {
885 error = EINVAL;
886 goto ExitThisRoutine;
887 }
888
889 if ( uap->nent < 1 || uap->nent > aio_max_requests_per_process ) {
890 error = EINVAL;
891 goto ExitThisRoutine;
892 }
893
894 if ( uap->timeoutp != USER_ADDR_NULL ) {
895 if ( proc_is64bit(p) ) {
896 error = copyin( uap->timeoutp, &ts, sizeof(ts) );
897 }
898 else {
899 struct timespec temp;
900 error = copyin( uap->timeoutp, &temp, sizeof(temp) );
901 if ( error == 0 ) {
902 ts.tv_sec = temp.tv_sec;
903 ts.tv_nsec = temp.tv_nsec;
904 }
905 }
906 if ( error != 0 ) {
907 error = EAGAIN;
908 goto ExitThisRoutine;
909 }
910
911 if ( ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000 ) {
912 error = EINVAL;
913 goto ExitThisRoutine;
914 }
915
916 nanoseconds_to_absolutetime( (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
917 &abstime );
918 clock_absolutetime_interval_to_deadline( abstime, &abstime );
919 }
920
921 /* we reserve enough space for largest possible pointer size */
922 MALLOC( aiocbpp, user_addr_t *, (uap->nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK );
923 if ( aiocbpp == NULL ) {
924 error = EAGAIN;
925 goto ExitThisRoutine;
926 }
927
928 /* copyin our aiocb pointers from list */
929 error = copyin( uap->aiocblist, aiocbpp,
930 proc_is64bit(p) ? (uap->nent * sizeof(user_addr_t))
931 : (uap->nent * sizeof(uintptr_t)) );
932 if ( error != 0 ) {
933 error = EAGAIN;
934 goto ExitThisRoutine;
935 }
936
937 /* we depend on a list of user_addr_t's so we need to munge and expand */
938 /* when these pointers came from a 32-bit process */
939 if ( !proc_is64bit(p) && sizeof(uintptr_t) < sizeof(user_addr_t) ) {
940 /* position to the last entry and work back from there */
941 uintptr_t *my_ptrp = ((uintptr_t *)aiocbpp) + (uap->nent - 1);
942 user_addr_t *my_addrp = aiocbpp + (uap->nent - 1);
943 for (i = 0; i < uap->nent; i++, my_ptrp--, my_addrp--) {
944 *my_addrp = (user_addr_t) (*my_ptrp);
945 }
946 }
947
948 /* check list of aio requests to see if any have completed */
949 AIO_LOCK;
950 for ( i = 0; i < uap->nent; i++ ) {
951 user_addr_t aiocbp;
952
953 /* NULL elements are legal so check for 'em */
954 aiocbp = *(aiocbpp + i);
955 if ( aiocbp == USER_ADDR_NULL )
956 continue;
957
958 /* return immediately if any aio request in the list is done */
959 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
960 if ( entryp->uaiocbp == aiocbp ) {
961 *retval = 0;
962 error = 0;
963 AIO_UNLOCK;
964 goto ExitThisRoutine;
965 }
966 }
967 } /* for ( ; i < uap->nent; ) */
968
969 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE,
970 (int)p, uap->nent, 0, 0, 0 );
971
972 /*
973 * wait for an async IO to complete or a signal fires or timeout expires.
974 * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
975 * interrupts us. If an async IO completes before a signal fires or our
976 * timeout expires, we get a wakeup call from aio_work_thread().
977 */
978 assert_wait_deadline( (event_t) &p->AIO_SUSPEND_SLEEP_CHAN, THREAD_ABORTSAFE, abstime );
979 AIO_UNLOCK;
980
981 error = thread_block( THREAD_CONTINUE_NULL );
982
983 if ( error == THREAD_AWAKENED ) {
984 /* got our wakeup call from aio_work_thread() */
985 *retval = 0;
986 error = 0;
987 }
988 else if ( error == THREAD_TIMED_OUT ) {
989 /* our timeout expired */
990 error = EAGAIN;
991 }
992 else {
993 /* we were interrupted */
994 error = EINTR;
995 }
996
997 ExitThisRoutine:
998 if ( aiocbpp != NULL )
999 FREE( aiocbpp, M_TEMP );
1000
1001 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_END,
1002 (int)p, uap->nent, error, 0, 0 );
1003
1004 return( error );
1005
1006 } /* aio_suspend */
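
/*
 * Illustrative user-space sketch (assumption: "cbp" refers to a request that
 * has already been queued; not part of this file). A NULL timeout blocks
 * until completion or a signal; a short timespec turns the call into a
 * bounded wait that fails with EAGAIN, matching the kernel code above.
 *
 *     #include <aio.h>
 *     #include <errno.h>
 *     #include <time.h>
 *
 *     static int
 *     wait_up_to_one_second( struct aiocb *cbp )
 *     {
 *         const struct aiocb *list[1] = { cbp };
 *         struct timespec ts = { 1, 0 };           // 1 second, 0 nanoseconds
 *
 *         if ( aio_suspend( list, 1, &ts ) == 0 )
 *             return 0;                            // at least one request is done
 *         return errno;                            // EAGAIN (timeout) or EINTR (signal)
 *     }
 */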
1007
1008
1009 /* aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
1010 * file descriptor (uap->aiocbp->aio_fildes) from the buffer
1011 * (uap->aiocbp->aio_buf).
1012 */
1013
1014 int
1015 aio_write( struct proc *p, struct aio_write_args *uap, int *retval )
1016 {
1017 int error;
1018
1019 *retval = 0;
1020
1021 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_START,
1022 (int)p, (int)uap->aiocbp, 0, 0, 0 );
1023
1024 error = aio_queue_async_request( p, uap->aiocbp, AIO_WRITE );
1025 if ( error != 0 )
1026 *retval = -1;
1027
1028 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_END,
1029 (int)p, (int)uap->aiocbp, error, 0, 0 );
1030
1031 return( error );
1032
1033 } /* aio_write */
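
/*
 * Illustrative user-space sketch (assumptions: an open descriptor "fd" and a
 * caller-owned buffer that stays valid until the IO is reaped; not part of
 * this file). Mirror image of the aio_read() sketch above.
 *
 *     #include <aio.h>
 *     #include <string.h>
 *
 *     static int
 *     queue_write( int fd, struct aiocb *cbp, const void *buf, size_t len, off_t offset )
 *     {
 *         memset( cbp, 0, sizeof(*cbp) );
 *         cbp->aio_fildes = fd;
 *         cbp->aio_buf = (volatile void *) buf;    // aio_buf is volatile void * in the aiocb
 *         cbp->aio_nbytes = len;
 *         cbp->aio_offset = offset;
 *         return aio_write( cbp );                 // 0 if queued, -1 + errno otherwise
 *     }
 */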
1034
1035
1036 /*
1037 * lio_listio - initiate a list of IO requests. We process the list of aiocbs
1038 * either synchronously (mode == LIO_WAIT) or asynchronously (mode == LIO_NOWAIT).
1039 * The caller gets error and return status for each aiocb in the list via aio_error
1040 * and aio_return. We must keep completed requests until released by the
1041 * aio_return call.
1042 */
1043
1044 int
1045 lio_listio( struct proc *p, struct lio_listio_args *uap, int *retval )
1046 {
1047 int i;
1048 int call_result;
1049 int result;
1050 long group_tag;
1051 aio_workq_entry * *entryp_listp;
1052 user_addr_t *aiocbpp;
1053
1054 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START,
1055 (int)p, uap->nent, uap->mode, 0, 0 );
1056
1057 entryp_listp = NULL;
1058 aiocbpp = NULL;
1059 call_result = -1;
1060 *retval = -1;
1061 if ( !(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT) ) {
1062 call_result = EINVAL;
1063 goto ExitRoutine;
1064 }
1065
1066 if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
1067 call_result = EINVAL;
1068 goto ExitRoutine;
1069 }
1070
1071 /*
1072 * we use group_tag to mark IO requests for delayed completion processing
1073 * which means we wait until all IO requests in the group have completed
1074 * before we either return to the caller when mode is LIO_WAIT or signal
1075 * user when mode is LIO_NOWAIT.
1076 */
1077 group_tag = random();
1078
1079 /*
1080 * allocate a list of aio_workq_entry pointers that we will use to queue
1081 * up all our requests at once while holding our lock.
1082 */
1083 MALLOC( entryp_listp, void *, (uap->nent * sizeof(aio_workq_entry *)), M_TEMP, M_WAITOK );
1084 if ( entryp_listp == NULL ) {
1085 call_result = EAGAIN;
1086 goto ExitRoutine;
1087 }
1088
1089 /* we reserve enough space for largest possible pointer size */
1090 MALLOC( aiocbpp, user_addr_t *, (uap->nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK );
1091 if ( aiocbpp == NULL ) {
1092 call_result = EAGAIN;
1093 goto ExitRoutine;
1094 }
1095
1096 /* copyin our aiocb pointers from list */
1097 result = copyin( uap->aiocblist, aiocbpp,
1098 IS_64BIT_PROCESS(p) ? (uap->nent * sizeof(user_addr_t))
1099 : (uap->nent * sizeof(uintptr_t)) );
1100 if ( result != 0 ) {
1101 call_result = EAGAIN;
1102 goto ExitRoutine;
1103 }
1104
1105 /* we depend on a list of user_addr_t's so we need to munge and expand */
1106 /* when these pointers came from a 32-bit process */
1107 if ( !IS_64BIT_PROCESS(p) && sizeof(uintptr_t) < sizeof(user_addr_t) ) {
1108 /* position to the last entry and work back from there */
1109 uintptr_t *my_ptrp = ((uintptr_t *)aiocbpp) + (uap->nent - 1);
1110 user_addr_t *my_addrp = aiocbpp + (uap->nent - 1);
1111 for (i = 0; i < uap->nent; i++, my_ptrp--, my_addrp--) {
1112 *my_addrp = (user_addr_t) (*my_ptrp);
1113 }
1114 }
1115
1116 /* process list of aio requests */
1117 for ( i = 0; i < uap->nent; i++ ) {
1118 user_addr_t my_aiocbp;
1119
1120 *(entryp_listp + i) = NULL;
1121 my_aiocbp = *(aiocbpp + i);
1122
1123 /* NULL elements are legal so check for 'em */
1124 if ( my_aiocbp == USER_ADDR_NULL )
1125 continue;
1126
1127 if ( uap->mode == LIO_NOWAIT )
1128 result = lio_create_async_entry( p, my_aiocbp, uap->sigp,
1129 group_tag, (entryp_listp + i) );
1130 else
1131 result = lio_create_sync_entry( p, my_aiocbp, group_tag,
1132 (entryp_listp + i) );
1133
1134 if ( result != 0 && call_result == -1 )
1135 call_result = result;
1136 }
1137
1138 /*
1139 * we need to protect this section since we do not want any of these grouped
1140 * IO requests to begin until we have them all on the queue.
1141 */
1142 AIO_LOCK;
1143 for ( i = 0; i < uap->nent; i++ ) {
1144 aio_workq_entry *entryp;
1145
1146 /* NULL elements are legal so check for 'em */
1147 entryp = *(entryp_listp + i);
1148 if ( entryp == NULL )
1149 continue;
1150
1151 /* check our aio limits to throttle bad or rude user land behavior */
1152 if ( aio_get_all_queues_count( ) >= aio_max_requests ||
1153 aio_get_process_count( entryp->procp ) >= aio_max_requests_per_process ||
1154 is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
1155 vm_map_t my_map;
1156
1157 my_map = entryp->aio_map;
1158 entryp->aio_map = VM_MAP_NULL;
1159 if ( call_result == -1 )
1160 call_result = EAGAIN;
1161 AIO_UNLOCK;
1162 aio_free_request( entryp, my_map );
1163 AIO_LOCK;
1164 continue;
1165 }
1166
1167 /* place the request on the appropriate queue */
1168 if ( uap->mode == LIO_NOWAIT ) {
1169 TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1170 aio_anchor.aio_async_workq_count++;
1171
1172 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
1173 (int)p, (int)entryp->uaiocbp, 0, 0, 0 );
1174 }
1175 else {
1176 TAILQ_INSERT_TAIL( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
1177 aio_anchor.lio_sync_workq_count++;
1178 }
1179 }
1180
1181 if ( uap->mode == LIO_NOWAIT ) {
1182 /* caller does not want to wait so we'll fire off a worker thread and return */
1183 wakeup_one( (caddr_t) &aio_anchor.aio_async_workq );
1184 }
1185 else {
1186 aio_workq_entry *entryp;
1187 int error;
1188
1189 /*
1190 * mode is LIO_WAIT - handle the IO requests now.
1191 */
1192 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
1193 while ( entryp != NULL ) {
1194 if ( p == entryp->procp && group_tag == entryp->group_tag ) {
1195
1196 TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
1197 aio_anchor.lio_sync_workq_count--;
1198 AIO_UNLOCK;
1199
1200 if ( (entryp->flags & AIO_READ) != 0 ) {
1201 error = do_aio_read( entryp );
1202 }
1203 else if ( (entryp->flags & AIO_WRITE) != 0 ) {
1204 error = do_aio_write( entryp );
1205 }
1206 else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1207 error = do_aio_fsync( entryp );
1208 }
1209 else {
1210 printf( "%s - unknown aio request - flags 0x%02X \n",
1211 __FUNCTION__, entryp->flags );
1212 error = EINVAL;
1213 }
1214 entryp->errorval = error;
1215 if ( error != 0 && call_result == -1 )
1216 call_result = EIO;
1217
1218 AIO_LOCK;
1219 /* we're done with the IO request so move it on the done queue */
1220 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
1221 aio_anchor.aio_done_count++;
1222 p->aio_done_count++;
1223
1224 /* need to start over since lio_sync_workq may have been changed while we */
1225 /* were away doing the IO. */
1226 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
1227 continue;
1228 } /* p == entryp->procp */
1229
1230 entryp = TAILQ_NEXT( entryp, aio_workq_link );
1231 } /* while ( entryp != NULL ) */
1232 } /* uap->mode == LIO_WAIT */
1233 AIO_UNLOCK;
1234
1235 /* call_result == -1 means we had no trouble queueing up requests */
1236 if ( call_result == -1 ) {
1237 call_result = 0;
1238 *retval = 0;
1239 }
1240
1241 ExitRoutine:
1242 if ( entryp_listp != NULL )
1243 FREE( entryp_listp, M_TEMP );
1244 if ( aiocbpp != NULL )
1245 FREE( aiocbpp, M_TEMP );
1246
1247 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END,
1248 (int)p, call_result, 0, 0, 0 );
1249
1250 return( call_result );
1251
1252 } /* lio_listio */
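
/*
 * Illustrative user-space sketch (assumptions: two prepared aiocbs "rd" and
 * "wr" with aio_lio_opcode set to LIO_READ / LIO_WRITE; not part of this
 * file). LIO_WAIT makes the call synchronous, which is the lio_sync_workq
 * path above; LIO_NOWAIT would queue both and return immediately.
 *
 *     #include <aio.h>
 *     #include <stdio.h>
 *
 *     static int
 *     submit_pair( struct aiocb *rd, struct aiocb *wr )
 *     {
 *         struct aiocb *list[2] = { rd, wr };
 *
 *         if ( lio_listio( LIO_WAIT, list, 2, NULL ) != 0 ) {
 *             perror( "lio_listio" );              // EIO means at least one request failed
 *             return -1;
 *         }
 *         // with LIO_WAIT both requests are done; pick up each result
 *         printf( "read %zd, wrote %zd\n", aio_return( rd ), aio_return( wr ) );
 *         return 0;
 *     }
 */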
1253
1254
1255 /*
1256 * aio worker thread. this is where all the real work gets done.
1257 * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
1258 * after new work is queued up.
1259 */
1260
1261 static void
1262 aio_work_thread( void )
1263 {
1264 aio_workq_entry *entryp;
1265
1266 for( ;; ) {
1267 AIO_LOCK;
1268 entryp = aio_get_some_work();
1269 if ( entryp == NULL ) {
1270 /*
1271 * aio worker threads wait for some work to get queued up
1272 * by aio_queue_async_request. Once some work gets queued
1273 * it will wake up one of these worker threads just before
1274 * returning to our caller in user land.
1275 */
1276 assert_wait( (event_t) &aio_anchor.aio_async_workq, THREAD_UNINT );
1277 AIO_UNLOCK;
1278
1279 thread_block( (thread_continue_t)aio_work_thread );
1280 /* NOT REACHED */
1281 }
1282 else {
1283 int error;
1284 vm_map_t currentmap;
1285 vm_map_t oldmap = VM_MAP_NULL;
1286 task_t oldaiotask = TASK_NULL;
1287 struct uthread *uthreadp = NULL;
1288
1289 AIO_UNLOCK;
1290
1291 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START,
1292 (int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0 );
1293
1294 /*
1295 * Assume the target's address space identity for the duration
1296 * of the IO.
1297 */
1298 currentmap = get_task_map( (current_proc())->task );
1299 if ( currentmap != entryp->aio_map ) {
1300 uthreadp = (struct uthread *) get_bsdthread_info(current_thread());
1301 oldaiotask = uthreadp->uu_aio_task;
1302 uthreadp->uu_aio_task = entryp->procp->task;
1303 oldmap = vm_map_switch( entryp->aio_map );
1304 }
1305
1306 if ( (entryp->flags & AIO_READ) != 0 ) {
1307 error = do_aio_read( entryp );
1308 }
1309 else if ( (entryp->flags & AIO_WRITE) != 0 ) {
1310 error = do_aio_write( entryp );
1311 }
1312 else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1313 error = do_aio_fsync( entryp );
1314 }
1315 else {
1316 printf( "%s - unknown aio request - flags 0x%02X \n",
1317 __FUNCTION__, entryp->flags );
1318 error = EINVAL;
1319 }
1320 entryp->errorval = error;
1321 if ( currentmap != entryp->aio_map ) {
1322 (void) vm_map_switch( oldmap );
1323 uthreadp->uu_aio_task = oldaiotask;
1324 }
1325
1326 /* we're done with the IO request so pop it off the active queue and */
1327 /* push it on the done queue */
1328 AIO_LOCK;
1329 TAILQ_REMOVE( &entryp->procp->aio_activeq, entryp, aio_workq_link );
1330 aio_anchor.aio_active_count--;
1331 entryp->procp->aio_active_count--;
1332 TAILQ_INSERT_TAIL( &entryp->procp->aio_doneq, entryp, aio_workq_link );
1333 aio_anchor.aio_done_count++;
1334 entryp->procp->aio_done_count++;
1335 entryp->flags |= AIO_COMPLETION;
1336
1337 /* remove our reference to the user land map. */
1338 if ( VM_MAP_NULL != entryp->aio_map ) {
1339 vm_map_t my_map;
1340
1341 my_map = entryp->aio_map;
1342 entryp->aio_map = VM_MAP_NULL;
1343 AIO_UNLOCK; /* must unlock before calling vm_map_deallocate() */
1344 vm_map_deallocate( my_map );
1345 }
1346 else {
1347 AIO_UNLOCK;
1348 }
1349
1350 do_aio_completion( entryp );
1351
1352 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END,
1353 (int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval,
1354 entryp->returnval, 0 );
1355
1356 AIO_LOCK;
1357 entryp->flags &= ~AIO_COMPLETION;
1358 if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
1359 vm_map_t my_map;
1360
1361 my_map = entryp->aio_map;
1362 entryp->aio_map = VM_MAP_NULL;
1363 AIO_UNLOCK;
1364 aio_free_request( entryp, my_map );
1365 }
1366 else
1367 AIO_UNLOCK;
1368 }
1369 } /* for ( ;; ) */
1370
1371 /* NOT REACHED */
1372
1373 } /* aio_work_thread */
1374
1375
1376 /*
1377 * aio_get_some_work - get the next async IO request that is ready to be executed.
1378 * aio_fsync complicates matters a bit since we cannot do the fsync until all async
1379 * IO requests that were queued at the time the aio_fsync call came in have completed.
1380 * NOTE - AIO_LOCK must be held by caller
1381 */
1382
1383 static aio_workq_entry *
1384 aio_get_some_work( void )
1385 {
1386 aio_workq_entry *entryp;
1387
1388 /* pop some work off the work queue and add to our active queue */
1389 for ( entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
1390 entryp != NULL;
1391 entryp = TAILQ_NEXT( entryp, aio_workq_link ) ) {
1392
1393 if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1394 /* leave aio_fsync calls on the work queue if there are IO */
1395 /* requests on the active queue for the same file descriptor. */
1396 if ( aio_delay_fsync_request( entryp ) ) {
1397
1398 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) | DBG_FUNC_NONE,
1399 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1400 continue;
1401 }
1402 }
1403 break;
1404 }
1405
1406 if ( entryp != NULL ) {
1407 TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1408 aio_anchor.aio_async_workq_count--;
1409 TAILQ_INSERT_TAIL( &entryp->procp->aio_activeq, entryp, aio_workq_link );
1410 aio_anchor.aio_active_count++;
1411 entryp->procp->aio_active_count++;
1412 }
1413
1414 return( entryp );
1415
1416 } /* aio_get_some_work */
1417
1418
1419 /*
1420 * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed at
1421 * this time. Delay will happen when there are any active IOs for the same file
1422 * descriptor that were queued at the time the aio_fsync call was queued.
1423 * NOTE - AIO_LOCK must be held by caller
1424 */
1425 static boolean_t
1426 aio_delay_fsync_request( aio_workq_entry *entryp )
1427 {
1428 aio_workq_entry *my_entryp;
1429
1430 TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1431 if ( my_entryp->fsyncp != USER_ADDR_NULL &&
1432 entryp->uaiocbp == my_entryp->fsyncp &&
1433 entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1434 return( TRUE );
1435 }
1436 }
1437
1438 return( FALSE );
1439
1440 } /* aio_delay_fsync_request */
1441
1442
1443 /*
1444 * aio_queue_async_request - queue up an async IO request on our work queue then
1445 * wake up one of our worker threads to do the actual work. We get a reference
1446 * to our caller's user land map in order to keep it around while we are
1447 * processing the request.
1448 */
1449
1450 static int
1451 aio_queue_async_request( struct proc *procp, user_addr_t aiocbp, int kindOfIO )
1452 {
1453 aio_workq_entry *entryp;
1454 int result;
1455
1456 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1457 if ( entryp == NULL ) {
1458 result = EAGAIN;
1459 goto error_exit;
1460 }
1461 bzero( entryp, sizeof(*entryp) );
1462
1463 /* fill in the rest of the aio_workq_entry */
1464 entryp->procp = procp;
1465 entryp->uaiocbp = aiocbp;
1466 entryp->flags |= kindOfIO;
1467 entryp->aio_map = VM_MAP_NULL;
1468
1469 if ( !IS_64BIT_PROCESS(procp) ) {
1470 struct aiocb aiocb32;
1471
1472 result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
1473 if ( result == 0 )
1474 do_munge_aiocb( &aiocb32, &entryp->aiocb );
1475 } else
1476 result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1477
1478 if ( result != 0 ) {
1479 result = EAGAIN;
1480 goto error_exit;
1481 }
1482
1483 /* do some more validation on the aiocb and embedded file descriptor */
1484 result = aio_validate( entryp );
1485 if ( result != 0 )
1486 goto error_exit;
1487
1488 /* get a reference to the user land map in order to keep it around */
1489 entryp->aio_map = get_task_map( procp->task );
1490 vm_map_reference( entryp->aio_map );
1491
1492 AIO_LOCK;
1493
1494 if ( is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
1495 AIO_UNLOCK;
1496 result = EAGAIN;
1497 goto error_exit;
1498 }
1499
1500 /* check our aio limits to throttle bad or rude user land behavior */
1501 if ( aio_get_all_queues_count( ) >= aio_max_requests ||
1502 aio_get_process_count( procp ) >= aio_max_requests_per_process ) {
1503 AIO_UNLOCK;
1504 result = EAGAIN;
1505 goto error_exit;
1506 }
1507
1508 /*
1509 * aio_fsync calls sync up all async IO requests queued at the time
1510 * the aio_fsync call was made. So we mark each currently queued async
1511 * IO with a matching file descriptor as one that must complete before we do the
1512 * fsync. We set the fsyncp field of each matching async IO
1513 * request with the aiocb pointer passed in on the aio_fsync call to
1514 * know which IOs must complete before we process the aio_fsync call.
1515 */
1516 if ( (kindOfIO & AIO_FSYNC) != 0 )
1517 aio_mark_requests( entryp );
1518
1519 /* queue up on our aio asynchronous work queue */
1520 TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1521 aio_anchor.aio_async_workq_count++;
1522
1523 wakeup_one( (caddr_t) &aio_anchor.aio_async_workq );
1524 AIO_UNLOCK;
1525
1526 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
1527 (int)procp, (int)aiocbp, 0, 0, 0 );
1528
1529 return( 0 );
1530
1531 error_exit:
1532 if ( entryp != NULL ) {
1533 /* this entry has not been queued up so no worries about unlocked */
1534 /* state and aio_map */
1535 aio_free_request( entryp, entryp->aio_map );
1536 }
1537
1538 return( result );
1539
1540 } /* aio_queue_async_request */
1541
1542
1543 /*
1544 * lio_create_async_entry - allocate an aio_workq_entry and fill it in.
1545 * If all goes well return 0 and pass the aio_workq_entry pointer back to
1546 * our caller. We get a reference to our caller's user land map in order to keep
1547 * it around while we are processing the request.
1548 * lio_listio calls behave differently at completion: they do completion notification
1549 * when all async IO requests have completed. We use group_tag to tag IO requests
1550 * that behave in this delayed notification manner.
1551 */
1552
1553 static int
1554 lio_create_async_entry( struct proc *procp, user_addr_t aiocbp,
1555 user_addr_t sigp, long group_tag,
1556 aio_workq_entry **entrypp )
1557 {
1558 aio_workq_entry *entryp;
1559 int result;
1560
1561 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1562 if ( entryp == NULL ) {
1563 result = EAGAIN;
1564 goto error_exit;
1565 }
1566 bzero( entryp, sizeof(*entryp) );
1567
1568 /* fill in the rest of the aio_workq_entry */
1569 entryp->procp = procp;
1570 entryp->uaiocbp = aiocbp;
1571 entryp->flags |= AIO_LIO;
1572 entryp->group_tag = group_tag;
1573 entryp->aio_map = VM_MAP_NULL;
1574
1575 if ( !IS_64BIT_PROCESS(procp) ) {
1576 struct aiocb aiocb32;
1577
1578 result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
1579 if ( result == 0 )
1580 do_munge_aiocb( &aiocb32, &entryp->aiocb );
1581 } else
1582 result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1583
1584 if ( result != 0 ) {
1585 result = EAGAIN;
1586 goto error_exit;
1587 }
1588
1589 /* look for lio_listio LIO_NOP requests and ignore them. */
1590 /* Not really an error, but we need to free our aio_workq_entry. */
1591 if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
1592 result = 0;
1593 goto error_exit;
1594 }
1595
1596 /* use sigevent passed in to lio_listio for each of our calls, but only */
1597 /* do completion notification after the last request completes. */
1598 if ( sigp != USER_ADDR_NULL ) {
1599 if ( !IS_64BIT_PROCESS(procp) ) {
1600 struct sigevent sigevent32;
1601
1602 result = copyin( sigp, &sigevent32, sizeof(sigevent32) );
1603 if ( result == 0 ) {
1604 /* also need to munge aio_sigevent since it contains pointers */
1605 /* special case here. since we do not know if sigev_value is an */
1606 /* int or a ptr we do NOT cast the ptr to a user_addr_t. This */
1607 /* means if we send this info back to user space we need to remember */
1608 /* sigev_value was not expanded for the 32-bit case. */
1609 /* NOTE - this does NOT affect us since we don't support sigev_value */
1610 /* yet in the aio context. */
1611 //LP64
1612 entryp->aiocb.aio_sigevent.sigev_notify = sigevent32.sigev_notify;
1613 entryp->aiocb.aio_sigevent.sigev_signo = sigevent32.sigev_signo;
1614 entryp->aiocb.aio_sigevent.sigev_value.size_equivalent.sival_int =
1615 sigevent32.sigev_value.sival_int;
1616 entryp->aiocb.aio_sigevent.sigev_notify_function =
1617 CAST_USER_ADDR_T(sigevent32.sigev_notify_function);
1618 entryp->aiocb.aio_sigevent.sigev_notify_attributes =
1619 CAST_USER_ADDR_T(sigevent32.sigev_notify_attributes);
1620 }
1621 } else
1622 result = copyin( sigp, &entryp->aiocb.aio_sigevent, sizeof(entryp->aiocb.aio_sigevent) );
1623
1624 if ( result != 0 ) {
1625 result = EAGAIN;
1626 goto error_exit;
1627 }
1628 }
1629
1630 /* do some more validation on the aiocb and embedded file descriptor */
1631 result = aio_validate( entryp );
1632 if ( result != 0 )
1633 goto error_exit;
1634
1635 /* get a reference to the user land map in order to keep it around */
1636 entryp->aio_map = get_task_map( procp->task );
1637 vm_map_reference( entryp->aio_map );
1638
1639 *entrypp = entryp;
1640 return( 0 );
1641
1642 error_exit:
1643 if ( entryp != NULL )
1644 zfree( aio_workq_zonep, entryp );
1645
1646 return( result );
1647
1648 } /* lio_create_async_entry */
1649
1650
1651 /*
1652 * aio_mark_requests - aio_fsync calls synchronize file data for all queued async IO
1653 * requests at the moment the aio_fsync call is queued. We use aio_workq_entry.fsyncp
1654 * to mark each async IO that must complete before the fsync is done. We use the uaiocbp
1655 * field from the aio_fsync call as the aio_workq_entry.fsyncp in marked requests.
1656 * NOTE - AIO_LOCK must be held by caller
1657 */
1658
1659 static void
1660 aio_mark_requests( aio_workq_entry *entryp )
1661 {
1662 aio_workq_entry *my_entryp;
1663
1664 TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1665 if ( entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1666 my_entryp->fsyncp = entryp->uaiocbp;
1667 }
1668 }
1669
1670 TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
1671 if ( entryp->procp == my_entryp->procp &&
1672 entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1673 my_entryp->fsyncp = entryp->uaiocbp;
1674 }
1675 }
1676
1677 } /* aio_mark_requests */
1678
1679
1680 /*
1681 * lio_create_sync_entry - allocate an aio_workq_entry and fill it in.
1682 * If all goes well return 0 and pass the aio_workq_entry pointer back to
1683 * our caller.
1684 * lio_listio calls behave differently at completion: they do completion notification
1685 * when all async IO requests have completed. We use group_tag to tag IO requests
1686 * that behave in this delayed notification manner.
1687 */
1688
1689 static int
1690 lio_create_sync_entry( struct proc *procp, user_addr_t aiocbp,
1691 long group_tag, aio_workq_entry **entrypp )
1692 {
1693 aio_workq_entry *entryp;
1694 int result;
1695
1696 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1697 if ( entryp == NULL ) {
1698 result = EAGAIN;
1699 goto error_exit;
1700 }
1701 bzero( entryp, sizeof(*entryp) );
1702
1703 /* fill in the rest of the aio_workq_entry */
1704 entryp->procp = procp;
1705 entryp->uaiocbp = aiocbp;
1706 entryp->flags |= AIO_LIO;
1707 entryp->group_tag = group_tag;
1708 entryp->aio_map = VM_MAP_NULL;
1709
1710 if ( !IS_64BIT_PROCESS(procp) ) {
1711 struct aiocb aiocb32;
1712
1713 result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
1714 if ( result == 0 )
1715 do_munge_aiocb( &aiocb32, &entryp->aiocb );
1716 } else
1717 result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1718
1719 if ( result != 0 ) {
1720 result = EAGAIN;
1721 goto error_exit;
1722 }
1723
1724 /* look for lio_listio LIO_NOP requests and ignore them. */
1725 /* Not really an error, but we need to free our aio_workq_entry. */
1726 if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
1727 result = 0;
1728 goto error_exit;
1729 }
1730
1731 result = aio_validate( entryp );
1732 if ( result != 0 ) {
1733 goto error_exit;
1734 }
1735
1736 *entrypp = entryp;
1737 return( 0 );
1738
1739 error_exit:
1740 if ( entryp != NULL )
1741 zfree( aio_workq_zonep, entryp );
1742
1743 return( result );
1744
1745 } /* lio_create_sync_entry */
1746
1747
1748 /*
1749 * aio_free_request - remove our reference on the user land map and
1750 * free the work queue entry resources.
1751 * We are not holding the lock here, thus aio_map is passed in, having been
1752 * zeroed while we did hold the lock.
1753 */
1754
1755 static int
1756 aio_free_request( aio_workq_entry *entryp, vm_map_t the_map )
1757 {
1758 /* remove our reference to the user land map. */
1759 if ( VM_MAP_NULL != the_map ) {
1760 vm_map_deallocate( the_map );
1761 }
1762
1763 zfree( aio_workq_zonep, entryp );
1764
1765 return( 0 );
1766
1767 } /* aio_free_request */
1768
1769
1770 /* aio_validate - validate the aiocb passed in by one of the aio syscalls.
1771 */
1772
1773 static int
1774 aio_validate( aio_workq_entry *entryp )
1775 {
1776 struct fileproc *fp;
1777 int flag;
1778 int result;
1779
1780 result = 0;
1781
1782 if ( (entryp->flags & AIO_LIO) != 0 ) {
1783 if ( entryp->aiocb.aio_lio_opcode == LIO_READ )
1784 entryp->flags |= AIO_READ;
1785 else if ( entryp->aiocb.aio_lio_opcode == LIO_WRITE )
1786 entryp->flags |= AIO_WRITE;
1787 else if ( entryp->aiocb.aio_lio_opcode == LIO_NOP )
1788 return( 0 );
1789 else
1790 return( EINVAL );
1791 }
1792
1793 flag = FREAD;
1794 if ( (entryp->flags & (AIO_WRITE | AIO_FSYNC)) != 0 ) {
1795 flag = FWRITE;
1796 }
1797
1798 if ( (entryp->flags & (AIO_READ | AIO_WRITE)) != 0 ) {
1799 // LP64todo - does max value for aio_nbytes need to grow?
1800 if ( entryp->aiocb.aio_nbytes > INT_MAX ||
1801 entryp->aiocb.aio_buf == USER_ADDR_NULL ||
1802 entryp->aiocb.aio_offset < 0 )
1803 return( EINVAL );
1804 }
1805
1806 /* validate aiocb.aio_sigevent. at this point we only support sigev_notify
1807 * equal to SIGEV_SIGNAL or SIGEV_NONE. this means sigev_value,
1808 * sigev_notify_function, and sigev_notify_attributes are ignored.
1809 */
1810 if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ) {
1811 int signum;
1812 /* make sure we have a valid signal number */
1813 signum = entryp->aiocb.aio_sigevent.sigev_signo;
1814 if ( signum <= 0 || signum >= NSIG ||
1815 signum == SIGKILL || signum == SIGSTOP )
1816 return (EINVAL);
1817 }
1818 else if ( entryp->aiocb.aio_sigevent.sigev_notify != SIGEV_NONE )
1819 return (EINVAL);
1820
1821 /* validate the file descriptor and that the file was opened
1822 * for the appropriate read / write access.
1823 */
1824 proc_fdlock(entryp->procp);
1825
1826 result = fp_lookup( entryp->procp, entryp->aiocb.aio_fildes, &fp , 1);
1827 if ( result == 0 ) {
1828 if ( (fp->f_fglob->fg_flag & flag) == 0 ) {
1829 /* we don't have read or write access */
1830 result = EBADF;
1831 }
1832 else if ( fp->f_fglob->fg_type != DTYPE_VNODE ) {
1833 /* this is not a file */
1834 result = ESPIPE;
1835 } else
1836 fp->f_flags |= FP_AIOISSUED;
1837
1838 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp , 1);
1839 }
1840 else {
1841 result = EBADF;
1842 }
1843
1844 proc_fdunlock(entryp->procp);
1845
1846 return( result );
1847
1848 } /* aio_validate */
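/*
 * A user-space sketch of the sigev_notify combinations that aio_validate()
 * accepts: SIGEV_SIGNAL with an ordinary, catchable signal, or SIGEV_NONE.
 * The signal choice and helper name are illustrative assumptions.
 *
 *     #include <aio.h>
 *     #include <signal.h>
 *     #include <string.h>
 *
 *     int queue_read_with_signal( int fd, char *buf, size_t len )
 *     {
 *         static struct aiocb cb;        // must stay valid until the IO completes
 *
 *         memset( &cb, 0, sizeof(cb) );
 *         cb.aio_fildes = fd;            // must be open with read access (FREAD)
 *         cb.aio_buf = buf;
 *         cb.aio_nbytes = len;
 *         cb.aio_offset = 0;
 *
 *         // SIGUSR1 passes the checks above; SIGKILL, SIGSTOP, or an
 *         // out-of-range signal number would be rejected with EINVAL.
 *         cb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
 *         cb.aio_sigevent.sigev_signo = SIGUSR1;
 *
 *         return aio_read( &cb );
 *     }
 */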
1849
1850
1851 /*
1852 * aio_get_process_count - runs through our queues that hold outstanding
1853 * async IO requests and totals up the number of requests for the given
1854 * process.
1855 * NOTE - caller must hold aio lock!
1856 */
1857
1858 static int
1859 aio_get_process_count( struct proc *procp )
1860 {
1861 aio_workq_entry *entryp;
1862 int count;
1863
1864 /* begin with count of completed async IO requests for this process */
1865 count = procp->aio_done_count;
1866
1867 /* add in count of active async IO requests for this process */
1868 count += procp->aio_active_count;
1869
1870 /* look for matches on our queue of asynchronous todo work */
1871 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
1872 if ( procp == entryp->procp ) {
1873 count++;
1874 }
1875 }
1876
1877 /* look for matches on our queue of synchronous todo work */
1878 TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
1879 if ( procp == entryp->procp ) {
1880 count++;
1881 }
1882 }
1883
1884 return( count );
1885
1886 } /* aio_get_process_count */
1887
1888
1889 /*
1890 * aio_get_all_queues_count - get total number of entries on all aio work queues.
1891 * NOTE - caller must hold aio lock!
1892 */
1893
1894 static int
1895 aio_get_all_queues_count( void )
1896 {
1897 int count;
1898
1899 count = aio_anchor.aio_async_workq_count;
1900 count += aio_anchor.lio_sync_workq_count;
1901 count += aio_anchor.aio_active_count;
1902 count += aio_anchor.aio_done_count;
1903
1904 return( count );
1905
1906 } /* aio_get_all_queues_count */
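/*
 * These totals are what the queueing code compares against the system-wide and
 * per-process AIO limits (aio_max_requests shows up again in aio_init() below).
 * The sketch assumes those limits are exported through sysctl as kern.aiomax
 * and kern.aioprocmax; the names and helper are assumptions, not taken from
 * this file.
 *
 *     #include <stdio.h>
 *     #include <sys/sysctl.h>
 *
 *     void show_aio_limits( void )
 *     {
 *         int aiomax = 0, aioprocmax = 0;
 *         size_t len = sizeof(int);
 *
 *         if ( sysctlbyname( "kern.aiomax", &aiomax, &len, NULL, 0 ) == 0 )
 *             printf( "system-wide aio limit: %d\n", aiomax );
 *
 *         len = sizeof(int);
 *         if ( sysctlbyname( "kern.aioprocmax", &aioprocmax, &len, NULL, 0 ) == 0 )
 *             printf( "per-process aio limit: %d\n", aioprocmax );
 *     }
 */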
1907
1908
1909 /*
1910 * do_aio_completion - handle async IO completion.
1911 */
1912
1913 static void
1914 do_aio_completion( aio_workq_entry *entryp )
1915 {
1916 /* signal user land process if appropriate */
1917 if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
1918 (entryp->flags & AIO_DISABLE) == 0 ) {
1919
1920 /*
1921 * if group_tag is non zero then make sure this is the last IO request
1922 * in the group before we signal.
1923 */
1924 if ( entryp->group_tag == 0 ||
1925 (entryp->group_tag != 0 && aio_last_group_io( entryp )) ) {
1926 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) | DBG_FUNC_NONE,
1927 (int)entryp->procp, (int)entryp->uaiocbp,
1928 entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 );
1929
1930 psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );
1931 return;
1932 }
1933 }
1934
1935 /*
1936 * need to handle case where a process is trying to exit, exec, or close
1937 * and is currently waiting for active aio requests to complete. If
1938 * AIO_WAITING is set then we need to look to see if there are any
1939 * other requests in the active queue for this process. If there are
1940 * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel. If
1941 * there are some still active then do nothing - we only want to wakeup
1942 * when all active aio requests for the process are complete.
1943 */
1944 if ( (entryp->flags & AIO_WAITING) != 0 ) {
1945 int active_requests;
1946
1947 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
1948 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1949
1950 AIO_LOCK;
1951 active_requests = aio_active_requests_for_process( entryp->procp );
1952 //AIO_UNLOCK;
1953 if ( active_requests < 1 ) {
1954 /* no active aio requests for this process, continue exiting */
1955 wakeup_one( (caddr_t) &entryp->procp->AIO_CLEANUP_SLEEP_CHAN );
1956
1957 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
1958 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1959 }
1960 AIO_UNLOCK;
1961 return;
1962 }
1963
1964 /*
1965 * aio_suspend case when a signal was not requested. In that scenario we
1966 * are sleeping on the AIO_SUSPEND_SLEEP_CHAN channel.
1967 * NOTE - the assumption here is that this wakeup call is inexpensive.
1968 * we really only need to do this when an aio_suspend call is pending.
1969 * If we find the wakeup call should be avoided we could mark the
1970 * async IO requests given in the list provided by aio_suspend and only
1971 * call wakeup for them. If we do mark them we should unmark them after
1972 * the aio_suspend wakes up.
1973 */
1974 AIO_LOCK;
1975 wakeup_one( (caddr_t) &entryp->procp->AIO_SUSPEND_SLEEP_CHAN );
1976 AIO_UNLOCK;
1977
1978 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE,
1979 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1980
1981 return;
1982
1983 } /* do_aio_completion */
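/*
 * A user-space sketch of the group-completion signaling implemented above: with
 * lio_listio( LIO_NOWAIT, ... ) and a SIGEV_SIGNAL sigevent, the signal fires
 * once, after the last request in the group finishes (the group_tag /
 * aio_last_group_io() check).  The signal number and helper name are
 * illustrative assumptions; error handling is trimmed.
 *
 *     #include <aio.h>
 *     #include <signal.h>
 *     #include <string.h>
 *
 *     // Each list[i] already has aio_fildes, aio_buf, aio_nbytes, aio_offset
 *     // filled in and aio_lio_opcode set to LIO_READ or LIO_WRITE.
 *     int run_batch_async( struct aiocb *list[], int nent )
 *     {
 *         struct sigevent sev;
 *         sigset_t set;
 *         int signo;
 *
 *         sigemptyset( &set );
 *         sigaddset( &set, SIGUSR1 );
 *         sigprocmask( SIG_BLOCK, &set, NULL );   // collect it with sigwait()
 *
 *         memset( &sev, 0, sizeof(sev) );
 *         sev.sigev_notify = SIGEV_SIGNAL;
 *         sev.sigev_signo = SIGUSR1;
 *
 *         if ( lio_listio( LIO_NOWAIT, list, nent, &sev ) != 0 )
 *             return -1;
 *
 *         sigwait( &set, &signo );    // delivered once, by psignal() above
 *         return 0;                   // caller checks aio_error()/aio_return()
 *     }
 */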
1984
1985
1986 /*
1987 * aio_last_group_io - checks to see if this is the last unfinished IO request
1988 * for the given group_tag. Returns TRUE if there are no other active IO
1989 * requests for this group, or FALSE if there are still active IO requests.
1990 * NOTE - AIO_LOCK must be held by caller
1991 */
1992
1993 static boolean_t
1994 aio_last_group_io( aio_workq_entry *entryp )
1995 {
1996 aio_workq_entry *my_entryp;
1997
1998 /* look for matches on our queue of active async IO requests */
1999 TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
2000 if ( my_entryp->group_tag == entryp->group_tag )
2001 return( FALSE );
2002 }
2003
2004 /* look for matches on our queue of asynchronous todo work */
2005 TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
2006 if ( my_entryp->group_tag == entryp->group_tag )
2007 return( FALSE );
2008 }
2009
2010 /* look for matches on our queue of synchronous todo work */
2011 TAILQ_FOREACH( my_entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
2012 if ( my_entryp->group_tag == entryp->group_tag )
2013 return( FALSE );
2014 }
2015
2016 return( TRUE );
2017
2018 } /* aio_last_group_io */
2019
2020
2021 /*
2022 * do_aio_read
2023 */
2024 static int
2025 do_aio_read( aio_workq_entry *entryp )
2026 {
2027 struct fileproc *fp;
2028 int error;
2029
2030 if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp , 0)) )
2031 return(error);
2032 if ( (fp->f_fglob->fg_flag & FREAD) == 0 ) {
2033 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2034 return(EBADF);
2035 }
2036 if ( fp != NULL ) {
2037 error = dofileread( entryp->procp, fp, entryp->aiocb.aio_fildes,
2038 entryp->aiocb.aio_buf,
2039 entryp->aiocb.aio_nbytes,
2040 entryp->aiocb.aio_offset, FOF_OFFSET,
2041 &entryp->returnval );
2042 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2043 }
2044 else {
2045 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2046 error = EBADF;
2047 }
2048
2049 return( error );
2050
2051 } /* do_aio_read */
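/*
 * do_aio_read() services the call sketched below; FOF_OFFSET means the read
 * happens at aio_offset rather than at the descriptor's current file offset.
 * The polling loop and helper name are illustrative assumptions.
 *
 *     #include <aio.h>
 *     #include <errno.h>
 *     #include <signal.h>
 *     #include <string.h>
 *     #include <unistd.h>
 *
 *     ssize_t read_at_offset_async( int fd, void *buf, size_t len, off_t where )
 *     {
 *         struct aiocb cb;
 *         int err;
 *
 *         memset( &cb, 0, sizeof(cb) );
 *         cb.aio_fildes = fd;
 *         cb.aio_buf = buf;
 *         cb.aio_nbytes = len;
 *         cb.aio_offset = where;      // read here, not at the fd's offset
 *         cb.aio_sigevent.sigev_notify = SIGEV_NONE;
 *
 *         if ( aio_read( &cb ) != 0 )
 *             return -1;
 *
 *         // aio_error() keeps returning EINPROGRESS until a worker thread
 *         // has finished the request.
 *         while ( (err = aio_error( &cb )) == EINPROGRESS )
 *             usleep( 1000 );
 *
 *         if ( err != 0 ) {
 *             errno = err;
 *             return -1;
 *         }
 *         return aio_return( &cb );   // byte count, from returnval above
 *     }
 */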
2052
2053
2054 /*
2055 * do_aio_write
2056 */
2057 static int
2058 do_aio_write( aio_workq_entry *entryp )
2059 {
2060 struct fileproc *fp;
2061 int error;
2062
2063 if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp , 0)) )
2064 return(error);
2065 if ( (fp->f_fglob->fg_flag & FWRITE) == 0 ) {
2066 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2067 return(EBADF);
2068 }
2069 if ( fp != NULL ) {
2070 /* NB: tell dofilewrite the offset, and to use the proc cred */
2071 error = dofilewrite( entryp->procp,
2072 fp,
2073 entryp->aiocb.aio_fildes,
2074 entryp->aiocb.aio_buf,
2075 entryp->aiocb.aio_nbytes,
2076 entryp->aiocb.aio_offset,
2077 FOF_OFFSET | FOF_PCRED,
2078 &entryp->returnval);
2079
2080 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2081 }
2082 else {
2083 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2084 error = EBADF;
2085 }
2086
2087 return( error );
2088
2089 } /* do_aio_write */
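/*
 * A matching sketch for the write path, this time blocking in aio_suspend()
 * (the sleep that the AIO_SUSPEND_SLEEP_CHAN wakeup in do_aio_completion()
 * targets).  The helper name and trimmed error handling are assumptions.
 *
 *     #include <aio.h>
 *     #include <errno.h>
 *     #include <signal.h>
 *     #include <string.h>
 *     #include <sys/types.h>
 *
 *     ssize_t write_at_offset_async( int fd, const void *buf, size_t len, off_t where )
 *     {
 *         struct aiocb cb;
 *         const struct aiocb *list[1];
 *
 *         memset( &cb, 0, sizeof(cb) );
 *         cb.aio_fildes = fd;         // must be open for writing (FWRITE)
 *         cb.aio_buf = (void *)buf;   // aio_buf is not const-qualified
 *         cb.aio_nbytes = len;
 *         cb.aio_offset = where;
 *         cb.aio_sigevent.sigev_notify = SIGEV_NONE;
 *
 *         if ( aio_write( &cb ) != 0 )
 *             return -1;
 *
 *         list[0] = &cb;
 *         while ( aio_error( &cb ) == EINPROGRESS )
 *             (void)aio_suspend( list, 1, NULL );   // wait for completion
 *
 *         if ( aio_error( &cb ) != 0 )
 *             return -1;
 *         return aio_return( &cb );
 *     }
 */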
2090
2091
2092 /*
2093 * aio_active_requests_for_process - return number of active async IO
2094 * requests for the given process.
2095 * NOTE - caller must hold aio lock!
2096 */
2097
2098 static int
2099 aio_active_requests_for_process( struct proc *procp )
2100 {
2101
2102 return( procp->aio_active_count );
2103
2104 } /* aio_active_requests_for_process */
2105
2106
2107 /*
2108 * do_aio_fsync
2109 */
2110 static int
2111 do_aio_fsync( aio_workq_entry *entryp )
2112 {
2113 struct vfs_context context;
2114 struct vnode *vp;
2115 struct fileproc *fp;
2116 int error;
2117
2118 /*
2119 * NOTE - we will not support AIO_DSYNC until fdatasync() is supported.
2120 * AIO_DSYNC is caught before we queue up a request and flagged as an error.
2121 * The following was shamelessly extracted from fsync() implementation.
2122 */
2123
2124 error = fp_getfvp( entryp->procp, entryp->aiocb.aio_fildes, &fp, &vp);
2125 if ( error == 0 ) {
2126 if ( (error = vnode_getwithref(vp)) ) {
2127 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2128 entryp->returnval = -1;
2129 return(error);
2130 }
2131 context.vc_proc = entryp->procp;
2132 context.vc_ucred = fp->f_fglob->fg_cred;
2133
2134 error = VNOP_FSYNC( vp, MNT_WAIT, &context);
2135
2136 (void)vnode_put(vp);
2137
2138 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2139 }
2140 if ( error != 0 )
2141 entryp->returnval = -1;
2142
2143 return( error );
2144
2145 } /* do_aio_fsync */
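/*
 * A user-space sketch of the call serviced by do_aio_fsync().  Per the note
 * above, only the O_SYNC form is supported; an O_DSYNC request is flagged as
 * an error before it is ever queued.  The helper name and polling interval are
 * illustrative assumptions.
 *
 *     #include <aio.h>
 *     #include <errno.h>
 *     #include <fcntl.h>
 *     #include <signal.h>
 *     #include <string.h>
 *     #include <unistd.h>
 *
 *     int sync_file_async( int fd )
 *     {
 *         struct aiocb cb;
 *
 *         memset( &cb, 0, sizeof(cb) );
 *         cb.aio_fildes = fd;
 *         cb.aio_sigevent.sigev_notify = SIGEV_NONE;
 *
 *         // O_SYNC ends up in the VNOP_FSYNC( vp, MNT_WAIT, ... ) call above.
 *         if ( aio_fsync( O_SYNC, &cb ) != 0 )
 *             return -1;
 *
 *         while ( aio_error( &cb ) == EINPROGRESS )
 *             usleep( 1000 );
 *
 *         if ( aio_error( &cb ) != 0 )
 *             return -1;
 *         return (int)aio_return( &cb );   // 0 on success for aio_fsync
 *     }
 */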
2146
2147
2148 /*
2149 * is_already_queued - runs through our queues to see if the given
2150 * aiocbp / process is there. Returns TRUE if there is a match
2151 * on any of our aio queues.
2152 * NOTE - callers must hold aio lock!
2153 */
2154
2155 static boolean_t
2156 is_already_queued( struct proc *procp,
2157 user_addr_t aiocbp )
2158 {
2159 aio_workq_entry *entryp;
2160 boolean_t result;
2161
2162 result = FALSE;
2163
2164 /* look for matches on our queue of async IO requests that have completed */
2165 TAILQ_FOREACH( entryp, &procp->aio_doneq, aio_workq_link ) {
2166 if ( aiocbp == entryp->uaiocbp ) {
2167 result = TRUE;
2168 goto ExitThisRoutine;
2169 }
2170 }
2171
2172 /* look for matches on our queue of active async IO requests */
2173 TAILQ_FOREACH( entryp, &procp->aio_activeq, aio_workq_link ) {
2174 if ( aiocbp == entryp->uaiocbp ) {
2175 result = TRUE;
2176 goto ExitThisRoutine;
2177 }
2178 }
2179
2180 /* look for matches on our queue of asynchronous todo work */
2181 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
2182 if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
2183 result = TRUE;
2184 goto ExitThisRoutine;
2185 }
2186 }
2187
2188 /* look for matches on our queue of synchronous todo work */
2189 TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
2190 if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
2191 result = TRUE;
2192 goto ExitThisRoutine;
2193 }
2194 }
2195
2196 ExitThisRoutine:
2197 return( result );
2198
2199 } /* is_already_queued */
2200
2201
2202 /*
2203 * aio initialization
2204 */
2205 __private_extern__ void
2206 aio_init( void )
2207 {
2208 int i;
2209
2210 aio_lock_grp_attr = lck_grp_attr_alloc_init();
2211 lck_grp_attr_setstat(aio_lock_grp_attr);
2212 aio_lock_grp = lck_grp_alloc_init("aio", aio_lock_grp_attr);
2213 aio_lock_attr = lck_attr_alloc_init();
2214 //lck_attr_setdebug(aio_lock_attr);
2215
2216 aio_lock = lck_mtx_alloc_init(aio_lock_grp, aio_lock_attr);
2217
2218 AIO_LOCK;
2219 TAILQ_INIT( &aio_anchor.aio_async_workq );
2220 TAILQ_INIT( &aio_anchor.lio_sync_workq );
2221 aio_anchor.aio_async_workq_count = 0;
2222 aio_anchor.lio_sync_workq_count = 0;
2223 aio_anchor.aio_active_count = 0;
2224 aio_anchor.aio_done_count = 0;
2225 AIO_UNLOCK;
2226
2227 i = sizeof( aio_workq_entry );
2228 aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );
2229
2230 _aio_create_worker_threads( aio_worker_threads );
2231
2232 return;
2233
2234 } /* aio_init */
2235
2236
2237 /*
2238 * aio worker threads created here.
2239 */
2240 __private_extern__ void
2241 _aio_create_worker_threads( int num )
2242 {
2243 int i;
2244
2245 /* create some worker threads to handle the async IO requests */
2246 for ( i = 0; i < num; i++ ) {
2247 thread_t myThread;
2248
2249 myThread = kernel_thread( kernel_task, aio_work_thread );
2250 if ( THREAD_NULL == myThread ) {
2251 printf( "%s - failed to create a work thread \n", __FUNCTION__ );
2252 }
2253 }
2254
2255 return;
2256
2257 } /* _aio_create_worker_threads */
2258
2259 /*
2260 * Return the current activation utask
2261 */
2262 task_t
2263 get_aiotask(void)
2264 {
2265 return ((struct uthread *)get_bsdthread_info(current_thread()))->uu_aio_task;
2266 }
2267
2268
2269 /*
2270 * do_munge_aiocb - in the case of an aiocb from a
2271 * 32-bit process, we need to expand some longs and pointers to the correct
2272 * sizes so that downstream code can always work on the same type of
2273 * aiocb (in our case that is a user_aiocb).
2274 */
2275 static void
2276 do_munge_aiocb( struct aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
2277 {
2278 the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2279 the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2280 the_user_aiocbp->aio_buf = CAST_USER_ADDR_T(my_aiocbp->aio_buf);
2281 the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2282 the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2283 the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
2284
2285 /* special case here. since we do not know if sigev_value is an */
2286 /* int or a ptr we do NOT cast the ptr to a user_addr_t. This */
2287 /* means if we send this info back to user space we need to remember */
2288 /* sigev_value was not expanded for the 32-bit case. */
2289 /* NOTE - this does NOT affect us since we don't support sigev_value */
2290 /* yet in the aio context. */
2291 //LP64
2292 the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2293 the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
2294 the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2295 my_aiocbp->aio_sigevent.sigev_value.sival_int;
2296 the_user_aiocbp->aio_sigevent.sigev_notify_function =
2297 CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function);
2298 the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2299 CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_attributes);
2300 }
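/*
 * The same widening pattern in miniature, with a hypothetical pair of
 * structures that are not taken from this file: each narrow field is copied
 * into its fixed-width counterpart, and pointer-sized values are expanded
 * explicitly instead of being copied with a single memcpy.
 *
 *     #include <stdint.h>
 *
 *     struct small_req {               // hypothetical 32-bit layout
 *         int32_t   fd;
 *         uint32_t  buf;               // 32-bit user pointer
 *         uint32_t  nbytes;
 *     };
 *
 *     struct wide_req {                // widened kernel-side counterpart
 *         int32_t   fd;
 *         uint64_t  buf;               // 64-bit user address
 *         uint64_t  nbytes;
 *     };
 *
 *     static void munge_req( const struct small_req *in, struct wide_req *out )
 *     {
 *         // Sizes and offsets differ, so a whole-struct copy would be wrong.
 *         out->fd = in->fd;
 *         out->buf = (uint64_t)in->buf;         // zero-extend the pointer value
 *         out->nbytes = (uint64_t)in->nbytes;
 *     }
 */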