]> git.saurik.com Git - apple/xnu.git/blob - bsd/kern/kern_aio.c
xnu-1228.15.4.tar.gz
[apple/xnu.git] / bsd / kern / kern_aio.c
1 /*
2 * Copyright (c) 2003-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29
30 /*
31 * todo:
32 * 1) ramesh is looking into how to replace taking a reference on
33 * the user's map (vm_map_reference()) since it is believed that
34 * would not hold the process for us.
35 * 2) david is looking into a way for us to set the priority of the
36 * worker threads to match that of the user's thread when the
37 * async IO was queued.
38 */
39
40
41 /*
42 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
43 */
44
45 #include <sys/systm.h>
46 #include <sys/fcntl.h>
47 #include <sys/file_internal.h>
48 #include <sys/filedesc.h>
49 #include <sys/kernel.h>
50 #include <sys/vnode_internal.h>
51 #include <sys/malloc.h>
52 #include <sys/mount_internal.h>
53 #include <sys/param.h>
54 #include <sys/proc_internal.h>
55 #include <sys/sysctl.h>
56 #include <sys/unistd.h>
57 #include <sys/user.h>
58
59 #include <sys/aio_kern.h>
60 #include <sys/sysproto.h>
61
62 #include <machine/limits.h>
63
64 #include <mach/mach_types.h>
65 #include <kern/kern_types.h>
66 #include <kern/zalloc.h>
67 #include <kern/task.h>
68 #include <kern/sched_prim.h>
69
70 #include <vm/vm_map.h>
71
72 #include <sys/kdebug.h>
/*
 * Trace sub-codes for this file.  Each one is combined with DBG_BSD_AIO via
 * BSDDBG_CODE() in the KERNEL_DEBUG calls below.
 */

/* worker / completion events */
#define AIO_work_queued			1
#define AIO_worker_wake			2
#define AIO_completion_sig		3
#define AIO_completion_cleanup_wait	4
#define AIO_completion_cleanup_wake	5
#define AIO_completion_suspend_wake	6
#define AIO_fsync_delay			7

/* aio_cancel and the queue on which the request was found */
#define AIO_cancel			10
#define AIO_cancel_async_workq		11
#define AIO_cancel_sync_workq		12
#define AIO_cancel_activeq		13
#define AIO_cancel_doneq		14

/* request submission */
#define AIO_fsync			20
#define AIO_read			30
#define AIO_write			40
#define AIO_listio			50

/* aio_error and where the request was found */
#define AIO_error			60
#define AIO_error_val			61
#define AIO_error_activeq		62
#define AIO_error_workq			63

/* aio_return and where the request was found */
#define AIO_return			70
#define AIO_return_val			71
#define AIO_return_activeq		72
#define AIO_return_workq		73

/* process and descriptor teardown */
#define AIO_exec			80
#define AIO_exit			90
#define AIO_exit_sleep			91
#define AIO_close			100
#define AIO_close_sleep			101

/* aio_suspend */
#define AIO_suspend			110
#define AIO_suspend_sleep		111

/* worker thread */
#define AIO_worker_thread		120

/* change the 0 to 1 to route KERNEL_DEBUG through KERNEL_DEBUG_CONSTANT */
#if 0
#undef KERNEL_DEBUG
#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
#endif
110
111 /*
112 * aio requests queue up on the aio_async_workq or lio_sync_workq (for
113 * lio_listio LIO_WAIT). Requests then move to the per process aio_activeq
114 * (proc.aio_activeq) when one of our worker threads start the IO.
115 * And finally, requests move to the per process aio_doneq (proc.aio_doneq)
116 * when the IO request completes. The request remains on aio_doneq until
117 * user process calls aio_return or the process exits, either way that is our
118 * trigger to release aio resources.
119 */
120 struct aio_anchor_cb
121 {
122 int aio_async_workq_count; /* entries on aio_async_workq */
123 int lio_sync_workq_count; /* entries on lio_sync_workq */
124 int aio_active_count; /* entries on all active queues (proc.aio_activeq) */
125 int aio_done_count; /* entries on all done queues (proc.aio_doneq) */
126 TAILQ_HEAD( , aio_workq_entry ) aio_async_workq;
127 TAILQ_HEAD( , aio_workq_entry ) lio_sync_workq;
128 };
129 typedef struct aio_anchor_cb aio_anchor_cb;
130
131
/*
 * aio sleep / wake channels.
 * Rather than adding fields to the proc structure (not possible at this time
 * for binary compatibility reasons), the addresses of two existing proc
 * fields are used as sleep channels.  These currently do not collide with
 * any other kernel routine.
 */
#define AIO_SUSPEND_SLEEP_CHAN	aio_active_count	/* waited on by aio_suspend */
#define AIO_CLEANUP_SLEEP_CHAN	aio_done_count		/* waited on by _aio_close / _aio_exit */
140
141
/*
 * async IO locking macros.  Critical sections in this file are bracketed by
 * AIO_LOCK / AIO_UNLOCK, which take and drop the aio_lock mutex.
 */
#define AIO_LOCK	lck_mtx_lock(aio_lock)
#define AIO_UNLOCK	lck_mtx_unlock(aio_lock)
147
148
149 /*
150 * LOCAL PROTOTYPES
151 */
152 static int aio_active_requests_for_process(proc_t procp );
153 static boolean_t aio_delay_fsync_request( aio_workq_entry *entryp );
154 static int aio_free_request( aio_workq_entry *entryp, vm_map_t the_map );
155 static int aio_get_all_queues_count( void );
156 static int aio_get_process_count(proc_t procp );
157 static aio_workq_entry * aio_get_some_work( void );
158 static boolean_t aio_last_group_io( aio_workq_entry *entryp );
159 static void aio_mark_requests( aio_workq_entry *entryp );
160 static int aio_queue_async_request(proc_t procp,
161 user_addr_t aiocbp,
162 int kindOfIO );
163 static int aio_validate( aio_workq_entry *entryp );
164 static void aio_work_thread( void );
165 static int do_aio_cancel(proc_t p,
166 int fd,
167 user_addr_t aiocbp,
168 boolean_t wait_for_completion,
169 boolean_t disable_notification );
170 static void do_aio_completion( aio_workq_entry *entryp );
171 static int do_aio_fsync( aio_workq_entry *entryp );
172 static int do_aio_read( aio_workq_entry *entryp );
173 static int do_aio_write( aio_workq_entry *entryp );
174 static void do_munge_aiocb( struct aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
175 static boolean_t is_already_queued(proc_t procp,
176 user_addr_t aiocbp );
177 static int lio_create_async_entry(proc_t procp,
178 user_addr_t aiocbp,
179 user_addr_t sigp,
180 long group_tag,
181 aio_workq_entry **entrypp );
182 static int lio_create_sync_entry(proc_t procp,
183 user_addr_t aiocbp,
184 long group_tag,
185 aio_workq_entry **entrypp );
186
187
188 /*
189 * EXTERNAL PROTOTYPES
190 */
191
192 /* in ...bsd/kern/sys_generic.c */
193 extern int dofileread(vfs_context_t ctx, struct fileproc *fp,
194 user_addr_t bufp, user_size_t nbyte,
195 off_t offset, int flags, user_ssize_t *retval );
196 extern int dofilewrite(vfs_context_t ctx, struct fileproc *fp,
197 user_addr_t bufp, user_size_t nbyte, off_t offset,
198 int flags, user_ssize_t *retval );
199
/*
 * aio external global variables (configurable limits, defined elsewhere).
 */
extern int	aio_max_requests;		/* AIO_MAX - configurable */
extern int	aio_max_requests_per_process;	/* AIO_PROCESS_MAX - configurable */
extern int	aio_worker_threads;		/* AIO_THREAD_COUNT - configurable */
206
207
208 /*
209 * aio static variables.
210 */
211 static aio_anchor_cb aio_anchor;
212 static lck_mtx_t * aio_lock;
213 static lck_grp_t * aio_lock_grp;
214 static lck_attr_t * aio_lock_attr;
215 static lck_grp_attr_t * aio_lock_grp_attr;
216 static struct zone *aio_workq_zonep;
217
218
219
220
221 /*
222 * aio_cancel - attempt to cancel one or more async IO requests currently
223 * outstanding against file descriptor uap->fd. If uap->aiocbp is not
224 * NULL then only one specific IO is cancelled (if possible). If uap->aiocbp
225 * is NULL then all outstanding async IO request for the given file
226 * descriptor are cancelled (if possible).
227 */
228
229 int
230 aio_cancel(proc_t p, struct aio_cancel_args *uap, int *retval )
231 {
232 struct user_aiocb my_aiocb;
233 int result;
234
235 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START,
236 (int)p, (int)uap->aiocbp, 0, 0, 0 );
237
238 /* quick check to see if there are any async IO requests queued up */
239 AIO_LOCK;
240 result = aio_get_all_queues_count( );
241 AIO_UNLOCK;
242 if ( result < 1 ) {
243 result = 0;
244 *retval = AIO_ALLDONE;
245 goto ExitRoutine;
246 }
247
248 *retval = -1;
249 if ( uap->aiocbp != USER_ADDR_NULL ) {
250 if ( !IS_64BIT_PROCESS(p) ) {
251 struct aiocb aiocb32;
252
253 result = copyin( uap->aiocbp, &aiocb32, sizeof(aiocb32) );
254 if ( result == 0 )
255 do_munge_aiocb( &aiocb32, &my_aiocb );
256 } else
257 result = copyin( uap->aiocbp, &my_aiocb, sizeof(my_aiocb) );
258
259 if ( result != 0 ) {
260 result = EAGAIN;
261 goto ExitRoutine;
262 }
263
264 /* NOTE - POSIX standard says a mismatch between the file */
265 /* descriptor passed in and the file descriptor embedded in */
266 /* the aiocb causes unspecified results. We return EBADF in */
267 /* that situation. */
268 if ( uap->fd != my_aiocb.aio_fildes ) {
269 result = EBADF;
270 goto ExitRoutine;
271 }
272 }
273 result = do_aio_cancel( p, uap->fd, uap->aiocbp, FALSE, FALSE );
274
275 if ( result != -1 ) {
276 *retval = result;
277 result = 0;
278 goto ExitRoutine;
279 }
280
281 result = EBADF;
282
283 ExitRoutine:
284 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END,
285 (int)p, (int)uap->aiocbp, result, 0, 0 );
286
287 return( result );
288
289 } /* aio_cancel */
290
291
292 /*
293 * _aio_close - internal function used to clean up async IO requests for
294 * a file descriptor that is closing.
295 * THIS MAY BLOCK.
296 */
297
298 __private_extern__ void
299 _aio_close(proc_t p, int fd )
300 {
301 int error, count;
302
303 /* quick check to see if there are any async IO requests queued up */
304 AIO_LOCK;
305 count = aio_get_all_queues_count( );
306 AIO_UNLOCK;
307 if ( count < 1 )
308 return;
309
310 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_START,
311 (int)p, fd, 0, 0, 0 );
312
313 /* cancel all async IO requests on our todo queues for this file descriptor */
314 error = do_aio_cancel( p, fd, 0, TRUE, FALSE );
315 if ( error == AIO_NOTCANCELED ) {
316 /*
317 * AIO_NOTCANCELED is returned when we find an aio request for this process
318 * and file descriptor on the active async IO queue. Active requests cannot
319 * be cancelled so we must wait for them to complete. We will get a special
320 * wake up call on our channel used to sleep for ALL active requests to
321 * complete. This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
322 * when we must wait for all active aio requests.
323 */
324
325 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) | DBG_FUNC_NONE,
326 (int)p, fd, 0, 0, 0 );
327
328 tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_close", 0 );
329 }
330
331 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_END,
332 (int)p, fd, 0, 0, 0 );
333
334 return;
335
336 } /* _aio_close */
337
338
339 /*
340 * aio_error - return the error status associated with the async IO
341 * request referred to by uap->aiocbp. The error status is the errno
342 * value that would be set by the corresponding IO request (read, wrtie,
343 * fdatasync, or sync).
344 */
345
346 int
347 aio_error(proc_t p, struct aio_error_args *uap, int *retval )
348 {
349 aio_workq_entry *entryp;
350 int error;
351
352 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_START,
353 (int)p, (int)uap->aiocbp, 0, 0, 0 );
354
355 AIO_LOCK;
356
357 /* quick check to see if there are any async IO requests queued up */
358 if ( aio_get_all_queues_count( ) < 1 ) {
359 error = EINVAL;
360 goto ExitRoutine;
361 }
362
363 /* look for a match on our queue of async IO requests that have completed */
364 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
365 if ( entryp->uaiocbp == uap->aiocbp ) {
366 *retval = entryp->errorval;
367 error = 0;
368 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) | DBG_FUNC_NONE,
369 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
370 goto ExitRoutine;
371 }
372 }
373
374 /* look for a match on our queue of active async IO requests */
375 TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
376 if ( entryp->uaiocbp == uap->aiocbp ) {
377 *retval = EINPROGRESS;
378 error = 0;
379 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) | DBG_FUNC_NONE,
380 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
381 goto ExitRoutine;
382 }
383 }
384
385 /* look for a match on our queue of todo work */
386 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
387 if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
388 *retval = EINPROGRESS;
389 error = 0;
390 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_workq)) | DBG_FUNC_NONE,
391 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
392 goto ExitRoutine;
393 }
394 }
395 error = EINVAL;
396
397 ExitRoutine:
398 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_END,
399 (int)p, (int)uap->aiocbp, error, 0, 0 );
400 AIO_UNLOCK;
401
402 return( error );
403
404 } /* aio_error */
405
406
407 /*
408 * aio_fsync - asynchronously force all IO operations associated
409 * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
410 * queued at the time of the call to the synchronized completion state.
411 * NOTE - we do not support op O_DSYNC at this point since we do not support the
412 * fdatasync() call.
413 */
414
415 int
416 aio_fsync(proc_t p, struct aio_fsync_args *uap, int *retval )
417 {
418 int error;
419 int fsync_kind;
420
421 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_START,
422 (int)p, (int)uap->aiocbp, uap->op, 0, 0 );
423
424 *retval = 0;
425 /* 0 := O_SYNC for binary backward compatibility with Panther */
426 if (uap->op == O_SYNC || uap->op == 0)
427 fsync_kind = AIO_FSYNC;
428 #if 0 // we don't support fdatasync() call yet
429 else if ( uap->op == O_DSYNC )
430 fsync_kind = AIO_DSYNC;
431 #endif
432 else {
433 *retval = -1;
434 error = EINVAL;
435 goto ExitRoutine;
436 }
437
438 error = aio_queue_async_request( p, uap->aiocbp, fsync_kind );
439 if ( error != 0 )
440 *retval = -1;
441
442 ExitRoutine:
443 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_END,
444 (int)p, (int)uap->aiocbp, error, 0, 0 );
445
446 return( error );
447
448 } /* aio_fsync */
449
450
451 /* aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
452 * file descriptor (uap->aiocbp->aio_fildes) into the buffer
453 * (uap->aiocbp->aio_buf).
454 */
455
456 int
457 aio_read(proc_t p, struct aio_read_args *uap, int *retval )
458 {
459 int error;
460
461 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_START,
462 (int)p, (int)uap->aiocbp, 0, 0, 0 );
463
464 *retval = 0;
465
466 error = aio_queue_async_request( p, uap->aiocbp, AIO_READ );
467 if ( error != 0 )
468 *retval = -1;
469
470 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_END,
471 (int)p, (int)uap->aiocbp, error, 0, 0 );
472
473 return( error );
474
475 } /* aio_read */
476
477
478 /*
479 * aio_return - return the return status associated with the async IO
480 * request referred to by uap->aiocbp. The return status is the value
481 * that would be returned by corresponding IO request (read, wrtie,
482 * fdatasync, or sync). This is where we release kernel resources
483 * held for async IO call associated with the given aiocb pointer.
484 */
485
486 int
487 aio_return(proc_t p, struct aio_return_args *uap, user_ssize_t *retval )
488 {
489 aio_workq_entry *entryp;
490 int error;
491 boolean_t lock_held;
492
493 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_START,
494 (int)p, (int)uap->aiocbp, 0, 0, 0 );
495
496 AIO_LOCK;
497 lock_held = TRUE;
498 *retval = 0;
499
500 /* quick check to see if there are any async IO requests queued up */
501 if ( aio_get_all_queues_count( ) < 1 ) {
502 error = EINVAL;
503 goto ExitRoutine;
504 }
505
506 /* look for a match on our queue of async IO requests that have completed */
507 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
508 if ( entryp->uaiocbp == uap->aiocbp ) {
509 TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
510 aio_anchor.aio_done_count--;
511 p->aio_done_count--;
512
513 *retval = entryp->returnval;
514
515 /* we cannot free requests that are still completing */
516 if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
517 vm_map_t my_map;
518
519 my_map = entryp->aio_map;
520 entryp->aio_map = VM_MAP_NULL;
521 AIO_UNLOCK;
522 lock_held = FALSE;
523 aio_free_request( entryp, my_map );
524 }
525 else
526 /* tell completion code to free this request */
527 entryp->flags |= AIO_DO_FREE;
528 error = 0;
529 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) | DBG_FUNC_NONE,
530 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
531 goto ExitRoutine;
532 }
533 }
534
535 /* look for a match on our queue of active async IO requests */
536 TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
537 if ( entryp->uaiocbp == uap->aiocbp ) {
538 error = EINPROGRESS;
539 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) | DBG_FUNC_NONE,
540 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
541 goto ExitRoutine;
542 }
543 }
544
545 /* look for a match on our queue of todo work */
546 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
547 if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
548 error = EINPROGRESS;
549 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_workq)) | DBG_FUNC_NONE,
550 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
551 goto ExitRoutine;
552 }
553 }
554 error = EINVAL;
555
556 ExitRoutine:
557 if ( lock_held )
558 AIO_UNLOCK;
559 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_END,
560 (int)p, (int)uap->aiocbp, error, 0, 0 );
561
562 return( error );
563
564 } /* aio_return */
565
566
567 /*
568 * _aio_exec - internal function used to clean up async IO requests for
569 * a process that is going away due to exec(). We cancel any async IOs
570 * we can and wait for those already active. We also disable signaling
571 * for cancelled or active aio requests that complete.
572 * This routine MAY block!
573 */
574
575 __private_extern__ void
576 _aio_exec(proc_t p )
577 {
578
579 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_START,
580 (int)p, 0, 0, 0, 0 );
581
582 _aio_exit( p );
583
584 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_END,
585 (int)p, 0, 0, 0, 0 );
586
587 return;
588
589 } /* _aio_exec */
590
591
592 /*
593 * _aio_exit - internal function used to clean up async IO requests for
594 * a process that is terminating (via exit() or exec() ). We cancel any async IOs
595 * we can and wait for those already active. We also disable signaling
596 * for cancelled or active aio requests that complete. This routine MAY block!
597 */
598
599 __private_extern__ void
600 _aio_exit(proc_t p )
601 {
602 int error, count;
603 aio_workq_entry *entryp;
604
605 /* quick check to see if there are any async IO requests queued up */
606 AIO_LOCK;
607 count = aio_get_all_queues_count( );
608 AIO_UNLOCK;
609 if ( count < 1 ) {
610 return;
611 }
612
613 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_START,
614 (int)p, 0, 0, 0, 0 );
615
616 /*
617 * cancel async IO requests on the todo work queue and wait for those
618 * already active to complete.
619 */
620 error = do_aio_cancel( p, 0, 0, TRUE, TRUE );
621 if ( error == AIO_NOTCANCELED ) {
622 /*
623 * AIO_NOTCANCELED is returned when we find an aio request for this process
624 * on the active async IO queue. Active requests cannot be cancelled so we
625 * must wait for them to complete. We will get a special wake up call on
626 * our channel used to sleep for ALL active requests to complete. This sleep
627 * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
628 * active aio requests.
629 */
630
631 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) | DBG_FUNC_NONE,
632 (int)p, 0, 0, 0, 0 );
633
634 tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_exit", 0 );
635 }
636
637 /* release all aio resources used by this process */
638 AIO_LOCK;
639 entryp = TAILQ_FIRST( &p->aio_doneq );
640 while ( entryp != NULL ) {
641 aio_workq_entry *next_entryp;
642
643 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
644 TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
645 aio_anchor.aio_done_count--;
646 p->aio_done_count--;
647
648 /* we cannot free requests that are still completing */
649 if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
650 vm_map_t my_map;
651
652 my_map = entryp->aio_map;
653 entryp->aio_map = VM_MAP_NULL;
654 AIO_UNLOCK;
655 aio_free_request( entryp, my_map );
656
657 /* need to start over since aio_doneq may have been */
658 /* changed while we were away. */
659 AIO_LOCK;
660 entryp = TAILQ_FIRST( &p->aio_doneq );
661 continue;
662 }
663 else
664 /* tell completion code to free this request */
665 entryp->flags |= AIO_DO_FREE;
666 entryp = next_entryp;
667 }
668 AIO_UNLOCK;
669
670 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END,
671 (int)p, 0, 0, 0, 0 );
672
673 return;
674
675 } /* _aio_exit */
676
677
678 /*
679 * do_aio_cancel - cancel async IO requests (if possible). We get called by
680 * aio_cancel, close, and at exit.
681 * There are three modes of operation: 1) cancel all async IOs for a process -
682 * fd is 0 and aiocbp is NULL 2) cancel all async IOs for file descriptor - fd
683 * is > 0 and aiocbp is NULL 3) cancel one async IO associated with the given
684 * aiocbp.
685 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
686 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
687 * target async IO requests, and AIO_ALLDONE if all target async IO requests
688 * were already complete.
689 * WARNING - do not deference aiocbp in this routine, it may point to user
690 * land data that has not been copied in (when called from aio_cancel() )
691 */
692
693 static int
694 do_aio_cancel(proc_t p, int fd, user_addr_t aiocbp,
695 boolean_t wait_for_completion, boolean_t disable_notification )
696 {
697 aio_workq_entry *entryp;
698 int result;
699
700 result = -1;
701
702 /* look for a match on our queue of async todo work. */
703 AIO_LOCK;
704 entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
705 while ( entryp != NULL ) {
706 aio_workq_entry *next_entryp;
707
708 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
709 if ( p == entryp->procp ) {
710 if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
711 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
712 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
713 /* we found a match so we remove the entry from the */
714 /* todo work queue and place it on the done queue */
715 TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
716 aio_anchor.aio_async_workq_count--;
717 entryp->errorval = ECANCELED;
718 entryp->returnval = -1;
719 if ( disable_notification )
720 entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
721 result = AIO_CANCELED;
722
723 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) | DBG_FUNC_NONE,
724 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
725
726 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
727 aio_anchor.aio_done_count++;
728 p->aio_done_count++;
729 entryp->flags |= AIO_COMPLETION;
730 AIO_UNLOCK;
731
732 /* do completion processing for this request */
733 do_aio_completion( entryp );
734
735 AIO_LOCK;
736 entryp->flags &= ~AIO_COMPLETION;
737 if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
738 vm_map_t my_map;
739
740 my_map = entryp->aio_map;
741 entryp->aio_map = VM_MAP_NULL;
742 AIO_UNLOCK;
743 aio_free_request( entryp, my_map );
744 }
745 else
746 AIO_UNLOCK;
747
748 if ( aiocbp != USER_ADDR_NULL ) {
749 return( result );
750 }
751
752 /* need to start over since aio_async_workq may have been */
753 /* changed while we were away doing completion processing. */
754 AIO_LOCK;
755 entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
756 continue;
757 }
758 }
759 entryp = next_entryp;
760 } /* while... */
761
762 /*
763 * look for a match on our queue of synchronous todo work. This will
764 * be a rare occurrence but could happen if a process is terminated while
765 * processing a lio_listio call.
766 */
767 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
768 while ( entryp != NULL ) {
769 aio_workq_entry *next_entryp;
770
771 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
772 if ( p == entryp->procp ) {
773 if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
774 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
775 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
776 /* we found a match so we remove the entry from the */
777 /* todo work queue and place it on the done queue */
778 TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
779 aio_anchor.lio_sync_workq_count--;
780 entryp->errorval = ECANCELED;
781 entryp->returnval = -1;
782 if ( disable_notification )
783 entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
784 result = AIO_CANCELED;
785
786 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_sync_workq)) | DBG_FUNC_NONE,
787 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
788
789 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
790 aio_anchor.aio_done_count++;
791 p->aio_done_count++;
792 if ( aiocbp != USER_ADDR_NULL ) {
793 AIO_UNLOCK;
794 return( result );
795 }
796 }
797 }
798 entryp = next_entryp;
799 } /* while... */
800
801 /*
802 * look for a match on our queue of active async IO requests and
803 * return AIO_NOTCANCELED result.
804 */
805 TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
806 if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
807 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
808 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
809 result = AIO_NOTCANCELED;
810
811 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE,
812 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
813
814 if ( wait_for_completion )
815 entryp->flags |= AIO_WAITING; /* flag for special completion processing */
816 if ( disable_notification )
817 entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
818 if ( aiocbp != USER_ADDR_NULL ) {
819 AIO_UNLOCK;
820 return( result );
821 }
822 }
823 }
824
825 /*
826 * if we didn't find any matches on the todo or active queues then look for a
827 * match on our queue of async IO requests that have completed and if found
828 * return AIO_ALLDONE result.
829 */
830 if ( result == -1 ) {
831 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
832 if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
833 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
834 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
835 result = AIO_ALLDONE;
836
837 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE,
838 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
839
840 if ( aiocbp != USER_ADDR_NULL ) {
841 AIO_UNLOCK;
842 return( result );
843 }
844 }
845 }
846 }
847 AIO_UNLOCK;
848
849 return( result );
850
851 } /* do_aio_cancel */
852
853
854 /*
855 * aio_suspend - suspend the calling thread until at least one of the async
856 * IO operations referenced by uap->aiocblist has completed, until a signal
857 * interrupts the function, or uap->timeoutp time interval (optional) has
858 * passed.
859 * Returns 0 if one or more async IOs have completed else -1 and errno is
860 * set appropriately - EAGAIN if timeout elapses or EINTR if an interrupt
861 * woke us up.
862 */
863 int
864 aio_suspend(proc_t p, struct aio_suspend_args *uap, int *retval )
865 {
866 __pthread_testcancel(1);
867 return(aio_suspend_nocancel(p, (struct aio_suspend_nocancel_args *)uap, retval));
868 }
869
870
871 int
872 aio_suspend_nocancel(proc_t p, struct aio_suspend_nocancel_args *uap, int *retval )
873 {
874 int error;
875 int i, count;
876 uint64_t abstime;
877 struct user_timespec ts;
878 aio_workq_entry *entryp;
879 user_addr_t *aiocbpp;
880
881 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START,
882 (int)p, uap->nent, 0, 0, 0 );
883
884 *retval = -1;
885 abstime = 0;
886 aiocbpp = NULL;
887
888 /* quick check to see if there are any async IO requests queued up */
889 AIO_LOCK;
890 count = aio_get_all_queues_count( );
891 AIO_UNLOCK;
892 if ( count < 1 ) {
893 error = EINVAL;
894 goto ExitThisRoutine;
895 }
896
897 if ( uap->nent < 1 || uap->nent > aio_max_requests_per_process ) {
898 error = EINVAL;
899 goto ExitThisRoutine;
900 }
901
902 if ( uap->timeoutp != USER_ADDR_NULL ) {
903 if ( proc_is64bit(p) ) {
904 error = copyin( uap->timeoutp, &ts, sizeof(ts) );
905 }
906 else {
907 struct timespec temp;
908 error = copyin( uap->timeoutp, &temp, sizeof(temp) );
909 if ( error == 0 ) {
910 ts.tv_sec = temp.tv_sec;
911 ts.tv_nsec = temp.tv_nsec;
912 }
913 }
914 if ( error != 0 ) {
915 error = EAGAIN;
916 goto ExitThisRoutine;
917 }
918
919 if ( ts.tv_sec < 0 || ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000 ) {
920 error = EINVAL;
921 goto ExitThisRoutine;
922 }
923
924 nanoseconds_to_absolutetime( (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
925 &abstime );
926 clock_absolutetime_interval_to_deadline( abstime, &abstime );
927 }
928
929 /* we reserve enough space for largest possible pointer size */
930 MALLOC( aiocbpp, user_addr_t *, (uap->nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK );
931 if ( aiocbpp == NULL ) {
932 error = EAGAIN;
933 goto ExitThisRoutine;
934 }
935
936 /* copyin our aiocb pointers from list */
937 error = copyin( uap->aiocblist, aiocbpp,
938 proc_is64bit(p) ? (uap->nent * sizeof(user_addr_t))
939 : (uap->nent * sizeof(uintptr_t)) );
940 if ( error != 0 ) {
941 error = EAGAIN;
942 goto ExitThisRoutine;
943 }
944
945 /* we depend on a list of user_addr_t's so we need to munge and expand */
946 /* when these pointers came from a 32-bit process */
947 if ( !proc_is64bit(p) && sizeof(uintptr_t) < sizeof(user_addr_t) ) {
948 /* position to the last entry and work back from there */
949 uintptr_t *my_ptrp = ((uintptr_t *)aiocbpp) + (uap->nent - 1);
950 user_addr_t *my_addrp = aiocbpp + (uap->nent - 1);
951 for (i = 0; i < uap->nent; i++, my_ptrp--, my_addrp--) {
952 *my_addrp = (user_addr_t) (*my_ptrp);
953 }
954 }
955
956 /* check list of aio requests to see if any have completed */
957 check_for_our_aiocbp:
958 AIO_LOCK;
959 for ( i = 0; i < uap->nent; i++ ) {
960 user_addr_t aiocbp;
961
962 /* NULL elements are legal so check for 'em */
963 aiocbp = *(aiocbpp + i);
964 if ( aiocbp == USER_ADDR_NULL )
965 continue;
966
967 /* return immediately if any aio request in the list is done */
968 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
969 if ( entryp->uaiocbp == aiocbp ) {
970 *retval = 0;
971 error = 0;
972 AIO_UNLOCK;
973 goto ExitThisRoutine;
974 }
975 }
976 } /* for ( ; i < uap->nent; ) */
977
978 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE,
979 (int)p, uap->nent, 0, 0, 0 );
980
981 /*
982 * wait for an async IO to complete or a signal fires or timeout expires.
983 * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
984 * interrupts us. If an async IO completes before a signal fires or our
985 * timeout expires, we get a wakeup call from aio_work_thread().
986 */
987 assert_wait_deadline( (event_t) &p->AIO_SUSPEND_SLEEP_CHAN, THREAD_ABORTSAFE, abstime );
988 AIO_UNLOCK;
989
990 error = thread_block( THREAD_CONTINUE_NULL );
991
992 if ( error == THREAD_AWAKENED ) {
993 /*
994 * got our wakeup call from aio_work_thread().
995 * Since we can get a wakeup on this channel from another thread in the
996 * same process we head back up to make sure this is for the correct aiocbp.
997 * If it is the correct aiocbp we will return from where we do the check
998 * (see entryp->uaiocbp == aiocbp after check_for_our_aiocbp label)
999 * else we will fall out and just sleep again.
1000 */
1001 goto check_for_our_aiocbp;
1002 }
1003 else if ( error == THREAD_TIMED_OUT ) {
1004 /* our timeout expired */
1005 error = EAGAIN;
1006 }
1007 else {
1008 /* we were interrupted */
1009 error = EINTR;
1010 }
1011
1012 ExitThisRoutine:
1013 if ( aiocbpp != NULL )
1014 FREE( aiocbpp, M_TEMP );
1015
1016 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_END,
1017 (int)p, uap->nent, error, 0, 0 );
1018
1019 return( error );
1020
1021 } /* aio_suspend */
1022
1023
1024 /* aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
1025 * file descriptor (uap->aiocbp->aio_fildes) from the buffer
1026 * (uap->aiocbp->aio_buf).
1027 */
1028
1029 int
1030 aio_write(proc_t p, struct aio_write_args *uap, int *retval )
1031 {
1032 int error;
1033
1034 *retval = 0;
1035
1036 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_START,
1037 (int)p, (int)uap->aiocbp, 0, 0, 0 );
1038
1039 error = aio_queue_async_request( p, uap->aiocbp, AIO_WRITE );
1040 if ( error != 0 )
1041 *retval = -1;
1042
1043 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_END,
1044 (int)p, (int)uap->aiocbp, error, 0, 0 );
1045
1046 return( error );
1047
1048 } /* aio_write */
1049
1050
1051 /*
1052 * lio_listio - initiate a list of IO requests. We process the list of aiocbs
1053 * either synchronously (mode == LIO_WAIT) or asynchronously (mode == LIO_NOWAIT).
1054 * The caller gets error and return status for each aiocb in the list via aio_error
1055 * and aio_return. We must keep completed requests until released by the
1056 * aio_return call.
1057 */
1058
1059 int
1060 lio_listio(proc_t p, struct lio_listio_args *uap, int *retval )
1061 {
1062 int i;
1063 int call_result;
1064 int result;
1065 long group_tag;
1066 aio_workq_entry * *entryp_listp;
1067 user_addr_t *aiocbpp;
1068
1069 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START,
1070 (int)p, uap->nent, uap->mode, 0, 0 );
1071
1072 entryp_listp = NULL;
1073 aiocbpp = NULL;
1074 call_result = -1;
1075 *retval = -1;
1076 if ( !(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT) ) {
1077 call_result = EINVAL;
1078 goto ExitRoutine;
1079 }
1080
1081 if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
1082 call_result = EINVAL;
1083 goto ExitRoutine;
1084 }
1085
1086 /*
1087 * we use group_tag to mark IO requests for delayed completion processing
1088 * which means we wait until all IO requests in the group have completed
1089 * before we either return to the caller when mode is LIO_WAIT or signal
1090 * user when mode is LIO_NOWAIT.
1091 */
1092 group_tag = random();
1093
1094 /*
1095 * allocate a list of aio_workq_entry pointers that we will use to queue
1096 * up all our requests at once while holding our lock.
1097 */
1098 MALLOC( entryp_listp, void *, (uap->nent * sizeof(aio_workq_entry *)), M_TEMP, M_WAITOK );
1099 if ( entryp_listp == NULL ) {
1100 call_result = EAGAIN;
1101 goto ExitRoutine;
1102 }
1103
1104 /* we reserve enough space for largest possible pointer size */
1105 MALLOC( aiocbpp, user_addr_t *, (uap->nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK );
1106 if ( aiocbpp == NULL ) {
1107 call_result = EAGAIN;
1108 goto ExitRoutine;
1109 }
1110
1111 /* copyin our aiocb pointers from list */
1112 result = copyin( uap->aiocblist, aiocbpp,
1113 IS_64BIT_PROCESS(p) ? (uap->nent * sizeof(user_addr_t))
1114 : (uap->nent * sizeof(uintptr_t)) );
1115 if ( result != 0 ) {
1116 call_result = EAGAIN;
1117 goto ExitRoutine;
1118 }
1119
1120 /* we depend on a list of user_addr_t's so we need to munge and expand */
1121 /* when these pointers came from a 32-bit process */
1122 if ( !IS_64BIT_PROCESS(p) && sizeof(uintptr_t) < sizeof(user_addr_t) ) {
1123 /* position to the last entry and work back from there */
1124 uintptr_t *my_ptrp = ((uintptr_t *)aiocbpp) + (uap->nent - 1);
1125 user_addr_t *my_addrp = aiocbpp + (uap->nent - 1);
1126 for (i = 0; i < uap->nent; i++, my_ptrp--, my_addrp--) {
1127 *my_addrp = (user_addr_t) (*my_ptrp);
1128 }
1129 }
1130
1131 /* process list of aio requests */
1132 for ( i = 0; i < uap->nent; i++ ) {
1133 user_addr_t my_aiocbp;
1134
1135 *(entryp_listp + i) = NULL;
1136 my_aiocbp = *(aiocbpp + i);
1137
1138 /* NULL elements are legal so check for 'em */
1139 if ( my_aiocbp == USER_ADDR_NULL )
1140 continue;
1141
1142 if ( uap->mode == LIO_NOWAIT )
1143 result = lio_create_async_entry( p, my_aiocbp, uap->sigp,
1144 group_tag, (entryp_listp + i) );
1145 else
1146 result = lio_create_sync_entry( p, my_aiocbp, group_tag,
1147 (entryp_listp + i) );
1148
1149 if ( result != 0 && call_result == -1 )
1150 call_result = result;
1151 }
1152
1153 /*
1154 * we need to protect this section since we do not want any of these grouped
1155 * IO requests to begin until we have them all on the queue.
1156 */
1157 AIO_LOCK;
1158 for ( i = 0; i < uap->nent; i++ ) {
1159 aio_workq_entry *entryp;
1160
1161 /* NULL elements are legal so check for 'em */
1162 entryp = *(entryp_listp + i);
1163 if ( entryp == NULL )
1164 continue;
1165
1166 /* check our aio limits to throttle bad or rude user land behavior */
1167 if ( aio_get_all_queues_count( ) >= aio_max_requests ||
1168 aio_get_process_count( entryp->procp ) >= aio_max_requests_per_process ||
1169 is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
1170 vm_map_t my_map;
1171
1172 my_map = entryp->aio_map;
1173 entryp->aio_map = VM_MAP_NULL;
1174 if ( call_result == -1 )
1175 call_result = EAGAIN;
1176 AIO_UNLOCK;
1177 aio_free_request( entryp, my_map );
1178 AIO_LOCK;
1179 continue;
1180 }
1181
1182 /* place the request on the appropriate queue */
1183 if ( uap->mode == LIO_NOWAIT ) {
1184 TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1185 aio_anchor.aio_async_workq_count++;
1186
1187 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
1188 (int)p, (int)entryp->uaiocbp, 0, 0, 0 );
1189 }
1190 else {
1191 TAILQ_INSERT_TAIL( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
1192 aio_anchor.lio_sync_workq_count++;
1193 }
1194 }
1195
1196 if ( uap->mode == LIO_NOWAIT ) {
1197 /* caller does not want to wait so we'll fire off a worker thread and return */
1198 wakeup_one( (caddr_t) &aio_anchor.aio_async_workq );
1199 }
1200 else {
1201 aio_workq_entry *entryp;
1202 int error;
1203
1204 /*
1205 * mode is LIO_WAIT - handle the IO requests now.
1206 */
1207 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
1208 while ( entryp != NULL ) {
1209 if ( p == entryp->procp && group_tag == entryp->group_tag ) {
1210
1211 TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
1212 aio_anchor.lio_sync_workq_count--;
1213 AIO_UNLOCK;
1214
1215 if ( (entryp->flags & AIO_READ) != 0 ) {
1216 error = do_aio_read( entryp );
1217 }
1218 else if ( (entryp->flags & AIO_WRITE) != 0 ) {
1219 error = do_aio_write( entryp );
1220 }
1221 else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1222 error = do_aio_fsync( entryp );
1223 }
1224 else {
1225 printf( "%s - unknown aio request - flags 0x%02X \n",
1226 __FUNCTION__, entryp->flags );
1227 error = EINVAL;
1228 }
1229 entryp->errorval = error;
1230 if ( error != 0 && call_result == -1 )
1231 call_result = EIO;
1232
1233 AIO_LOCK;
1234 /* we're done with the IO request so move it on the done queue */
1235 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
1236 aio_anchor.aio_done_count++;
1237 p->aio_done_count++;
1238
1239 /* need to start over since lio_sync_workq may have been changed while we */
1240 /* were away doing the IO. */
1241 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
1242 continue;
1243 } /* p == entryp->procp */
1244
1245 entryp = TAILQ_NEXT( entryp, aio_workq_link );
1246 } /* while ( entryp != NULL ) */
1247 } /* uap->mode == LIO_WAIT */
1248 AIO_UNLOCK;
1249
1250 /* call_result == -1 means we had no trouble queueing up requests */
1251 if ( call_result == -1 ) {
1252 call_result = 0;
1253 *retval = 0;
1254 }
1255
1256 ExitRoutine:
1257 if ( entryp_listp != NULL )
1258 FREE( entryp_listp, M_TEMP );
1259 if ( aiocbpp != NULL )
1260 FREE( aiocbpp, M_TEMP );
1261
1262 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END,
1263 (int)p, call_result, 0, 0, 0 );
1264
1265 return( call_result );
1266
1267 } /* lio_listio */
1268
1269
1270 /*
1271 * aio worker thread. this is where all the real work gets done.
1272 * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
1273 * after new work is queued up.
1274 */
1275
1276 static void
1277 aio_work_thread( void )
1278 {
1279 aio_workq_entry *entryp;
1280
1281 for( ;; ) {
1282 AIO_LOCK;
1283 entryp = aio_get_some_work();
1284 if ( entryp == NULL ) {
1285 /*
1286 * aio worker threads wait for some work to get queued up
1287 * by aio_queue_async_request. Once some work gets queued
1288 * it will wake up one of these worker threads just before
1289 * returning to our caller in user land.
1290 */
1291 assert_wait( (event_t) &aio_anchor.aio_async_workq, THREAD_UNINT );
1292 AIO_UNLOCK;
1293
1294 thread_block( (thread_continue_t)aio_work_thread );
1295 /* NOT REACHED */
1296 }
1297 else {
1298 int error;
1299 vm_map_t currentmap;
1300 vm_map_t oldmap = VM_MAP_NULL;
1301 task_t oldaiotask = TASK_NULL;
1302 struct uthread *uthreadp = NULL;
1303
1304 AIO_UNLOCK;
1305
1306 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START,
1307 (int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0 );
1308
1309 /*
1310 * Assume the target's address space identity for the duration
1311 * of the IO.
1312 */
1313 currentmap = get_task_map( (current_proc())->task );
1314 if ( currentmap != entryp->aio_map ) {
1315 uthreadp = (struct uthread *) get_bsdthread_info(current_thread());
1316 oldaiotask = uthreadp->uu_aio_task;
1317 uthreadp->uu_aio_task = entryp->procp->task;
1318 oldmap = vm_map_switch( entryp->aio_map );
1319 }
1320
1321 if ( (entryp->flags & AIO_READ) != 0 ) {
1322 error = do_aio_read( entryp );
1323 }
1324 else if ( (entryp->flags & AIO_WRITE) != 0 ) {
1325 error = do_aio_write( entryp );
1326 }
1327 else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1328 error = do_aio_fsync( entryp );
1329 }
1330 else {
1331 printf( "%s - unknown aio request - flags 0x%02X \n",
1332 __FUNCTION__, entryp->flags );
1333 error = EINVAL;
1334 }
1335 entryp->errorval = error;
1336 if ( currentmap != entryp->aio_map ) {
1337 (void) vm_map_switch( oldmap );
1338 uthreadp->uu_aio_task = oldaiotask;
1339 }
1340
1341 /* we're done with the IO request so pop it off the active queue and */
1342 /* push it on the done queue */
1343 AIO_LOCK;
1344 TAILQ_REMOVE( &entryp->procp->aio_activeq, entryp, aio_workq_link );
1345 aio_anchor.aio_active_count--;
1346 entryp->procp->aio_active_count--;
1347 TAILQ_INSERT_TAIL( &entryp->procp->aio_doneq, entryp, aio_workq_link );
1348 aio_anchor.aio_done_count++;
1349 entryp->procp->aio_done_count++;
1350 entryp->flags |= AIO_COMPLETION;
1351
1352 /* remove our reference to the user land map. */
1353 if ( VM_MAP_NULL != entryp->aio_map ) {
1354 vm_map_t my_map;
1355
1356 my_map = entryp->aio_map;
1357 entryp->aio_map = VM_MAP_NULL;
1358 AIO_UNLOCK; /* must unlock before calling vm_map_deallocate() */
1359 vm_map_deallocate( my_map );
1360 }
1361 else {
1362 AIO_UNLOCK;
1363 }
1364
1365 do_aio_completion( entryp );
1366
1367 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END,
1368 (int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval,
1369 entryp->returnval, 0 );
1370
1371 AIO_LOCK;
1372 entryp->flags &= ~AIO_COMPLETION;
1373 if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
1374 vm_map_t my_map;
1375
1376 my_map = entryp->aio_map;
1377 entryp->aio_map = VM_MAP_NULL;
1378 AIO_UNLOCK;
1379 aio_free_request( entryp, my_map );
1380 }
1381 else
1382 AIO_UNLOCK;
1383 }
1384 } /* for ( ;; ) */
1385
1386 /* NOT REACHED */
1387
1388 } /* aio_work_thread */
1389
1390
1391 /*
1392 * aio_get_some_work - get the next async IO request that is ready to be executed.
1393 * aio_fsync complicates matters a bit since we cannot do the fsync until all async
1394 * IO requests at the time the aio_fsync call came in have completed.
1395 * NOTE - AIO_LOCK must be held by caller
1396 */
1397
1398 static aio_workq_entry *
1399 aio_get_some_work( void )
1400 {
1401 aio_workq_entry *entryp;
1402
1403 /* pop some work off the work queue and add to our active queue */
1404 for ( entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
1405 entryp != NULL;
1406 entryp = TAILQ_NEXT( entryp, aio_workq_link ) ) {
1407
1408 if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1409 /* leave aio_fsync calls on the work queue if there are IO */
1410 /* requests on the active queue for the same file descriptor. */
1411 if ( aio_delay_fsync_request( entryp ) ) {
1412
1413 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) | DBG_FUNC_NONE,
1414 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1415 continue;
1416 }
1417 }
1418 break;
1419 }
1420
1421 if ( entryp != NULL ) {
1422 TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1423 aio_anchor.aio_async_workq_count--;
1424 TAILQ_INSERT_TAIL( &entryp->procp->aio_activeq, entryp, aio_workq_link );
1425 aio_anchor.aio_active_count++;
1426 entryp->procp->aio_active_count++;
1427 }
1428
1429 return( entryp );
1430
1431 } /* aio_get_some_work */
1432
1433
1434 /*
1435 * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed at
1436 * this time. Delay will happen when there are any active IOs for the same file
1437 * descriptor that were queued at time the aio_sync call was queued.
1438 * NOTE - AIO_LOCK must be held by caller
1439 */
1440 static boolean_t
1441 aio_delay_fsync_request( aio_workq_entry *entryp )
1442 {
1443 aio_workq_entry *my_entryp;
1444
1445 TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1446 if ( my_entryp->fsyncp != USER_ADDR_NULL &&
1447 entryp->uaiocbp == my_entryp->fsyncp &&
1448 entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1449 return( TRUE );
1450 }
1451 }
1452
1453 return( FALSE );
1454
1455 } /* aio_delay_fsync_request */
1456
1457
1458 /*
1459 * aio_queue_async_request - queue up an async IO request on our work queue then
1460 * wake up one of our worker threads to do the actual work. We get a reference
1461 * to our caller's user land map in order to keep it around while we are
1462 * processing the request.
1463 */
1464
1465 static int
1466 aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO )
1467 {
1468 aio_workq_entry *entryp;
1469 int result;
1470
1471 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1472 if ( entryp == NULL ) {
1473 result = EAGAIN;
1474 goto error_exit;
1475 }
1476 bzero( entryp, sizeof(*entryp) );
1477
1478 /* fill in the rest of the aio_workq_entry */
1479 entryp->procp = procp;
1480 entryp->uaiocbp = aiocbp;
1481 entryp->flags |= kindOfIO;
1482 entryp->aio_map = VM_MAP_NULL;
1483
1484 if ( !IS_64BIT_PROCESS(procp) ) {
1485 struct aiocb aiocb32;
1486
1487 result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
1488 if ( result == 0 )
1489 do_munge_aiocb( &aiocb32, &entryp->aiocb );
1490 } else
1491 result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1492
1493 if ( result != 0 ) {
1494 result = EAGAIN;
1495 goto error_exit;
1496 }
1497
1498 /* do some more validation on the aiocb and embedded file descriptor */
1499 result = aio_validate( entryp );
1500 if ( result != 0 )
1501 goto error_exit;
1502
1503 /* get a reference to the user land map in order to keep it around */
1504 entryp->aio_map = get_task_map( procp->task );
1505 vm_map_reference( entryp->aio_map );
1506
1507 AIO_LOCK;
1508
1509 if ( is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
1510 AIO_UNLOCK;
1511 result = EAGAIN;
1512 goto error_exit;
1513 }
1514
1515 /* check our aio limits to throttle bad or rude user land behavior */
1516 if ( aio_get_all_queues_count( ) >= aio_max_requests ||
1517 aio_get_process_count( procp ) >= aio_max_requests_per_process ) {
1518 AIO_UNLOCK;
1519 result = EAGAIN;
1520 goto error_exit;
1521 }
1522
1523 /*
1524 * aio_fsync calls sync up all async IO requests queued at the time
1525 * the aio_fsync call was made. So we mark each currently queued async
1526 * IO with a matching file descriptor as must complete before we do the
1527 * fsync. We set the fsyncp field of each matching async IO
1528 * request with the aiocb pointer passed in on the aio_fsync call to
1529 * know which IOs must complete before we process the aio_fsync call.
1530 */
1531 if ( (kindOfIO & AIO_FSYNC) != 0 )
1532 aio_mark_requests( entryp );
1533
1534 /* queue up on our aio asynchronous work queue */
1535 TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1536 aio_anchor.aio_async_workq_count++;
1537
1538 wakeup_one( (caddr_t) &aio_anchor.aio_async_workq );
1539 AIO_UNLOCK;
1540
1541 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
1542 (int)procp, (int)aiocbp, 0, 0, 0 );
1543
1544 return( 0 );
1545
1546 error_exit:
1547 if ( entryp != NULL ) {
1548 /* this entry has not been queued up so no worries about unlocked */
1549 /* state and aio_map */
1550 aio_free_request( entryp, entryp->aio_map );
1551 }
1552
1553 return( result );
1554
1555 } /* aio_queue_async_request */
1556
1557
1558 /*
1559 * lio_create_async_entry - allocate an aio_workq_entry and fill it in.
1560 * If all goes well return 0 and pass the aio_workq_entry pointer back to
1561 * our caller. We get a reference to our caller's user land map in order to keep
1562 * it around while we are processing the request.
1563 * lio_listio calls behave differently at completion they do completion notification
1564 * when all async IO requests have completed. We use group_tag to tag IO requests
1565 * that behave in the delay notification manner.
1566 */
1567
1568 static int
1569 lio_create_async_entry(proc_t procp, user_addr_t aiocbp,
1570 user_addr_t sigp, long group_tag,
1571 aio_workq_entry **entrypp )
1572 {
1573 aio_workq_entry *entryp;
1574 int result;
1575
1576 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1577 if ( entryp == NULL ) {
1578 result = EAGAIN;
1579 goto error_exit;
1580 }
1581 bzero( entryp, sizeof(*entryp) );
1582
1583 /* fill in the rest of the aio_workq_entry */
1584 entryp->procp = procp;
1585 entryp->uaiocbp = aiocbp;
1586 entryp->flags |= AIO_LIO;
1587 entryp->group_tag = group_tag;
1588 entryp->aio_map = VM_MAP_NULL;
1589
1590 if ( !IS_64BIT_PROCESS(procp) ) {
1591 struct aiocb aiocb32;
1592
1593 result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
1594 if ( result == 0 )
1595 do_munge_aiocb( &aiocb32, &entryp->aiocb );
1596 } else
1597 result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1598
1599 if ( result != 0 ) {
1600 result = EAGAIN;
1601 goto error_exit;
1602 }
1603
1604 /* look for lio_listio LIO_NOP requests and ignore them. */
1605 /* Not really an error, but we need to free our aio_workq_entry. */
1606 if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
1607 result = 0;
1608 goto error_exit;
1609 }
1610
1611 /* use sigevent passed in to lio_listio for each of our calls, but only */
1612 /* do completion notification after the last request completes. */
1613 if ( sigp != USER_ADDR_NULL ) {
1614 if ( !IS_64BIT_PROCESS(procp) ) {
1615 struct sigevent sigevent32;
1616
1617 result = copyin( sigp, &sigevent32, sizeof(sigevent32) );
1618 if ( result == 0 ) {
1619 /* also need to munge aio_sigevent since it contains pointers */
1620 /* special case here. since we do not know if sigev_value is an */
1621 /* int or a ptr we do NOT cast the ptr to a user_addr_t. This */
1622 /* means if we send this info back to user space we need to remember */
1623 /* sigev_value was not expanded for the 32-bit case. */
1624 /* NOTE - this does NOT affect us since we don't support sigev_value */
1625 /* yet in the aio context. */
1626 //LP64
1627 entryp->aiocb.aio_sigevent.sigev_notify = sigevent32.sigev_notify;
1628 entryp->aiocb.aio_sigevent.sigev_signo = sigevent32.sigev_signo;
1629 entryp->aiocb.aio_sigevent.sigev_value.size_equivalent.sival_int =
1630 sigevent32.sigev_value.sival_int;
1631 entryp->aiocb.aio_sigevent.sigev_notify_function =
1632 CAST_USER_ADDR_T(sigevent32.sigev_notify_function);
1633 entryp->aiocb.aio_sigevent.sigev_notify_attributes =
1634 CAST_USER_ADDR_T(sigevent32.sigev_notify_attributes);
1635 }
1636 } else
1637 result = copyin( sigp, &entryp->aiocb.aio_sigevent, sizeof(entryp->aiocb.aio_sigevent) );
1638
1639 if ( result != 0 ) {
1640 result = EAGAIN;
1641 goto error_exit;
1642 }
1643 }
1644
1645 /* do some more validation on the aiocb and embedded file descriptor */
1646 result = aio_validate( entryp );
1647 if ( result != 0 )
1648 goto error_exit;
1649
1650 /* get a reference to the user land map in order to keep it around */
1651 entryp->aio_map = get_task_map( procp->task );
1652 vm_map_reference( entryp->aio_map );
1653
1654 *entrypp = entryp;
1655 return( 0 );
1656
1657 error_exit:
1658 if ( entryp != NULL )
1659 zfree( aio_workq_zonep, entryp );
1660
1661 return( result );
1662
1663 } /* lio_create_async_entry */
1664
1665
1666 /*
1667 * aio_mark_requests - aio_fsync calls synchronize file data for all queued async IO
1668 * requests at the moment the aio_fsync call is queued. We use aio_workq_entry.fsyncp
1669 * to mark each async IO that must complete before the fsync is done. We use the uaiocbp
1670 * field from the aio_fsync call as the aio_workq_entry.fsyncp in marked requests.
1671 * NOTE - AIO_LOCK must be held by caller
1672 */
1673
1674 static void
1675 aio_mark_requests( aio_workq_entry *entryp )
1676 {
1677 aio_workq_entry *my_entryp;
1678
1679 TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
1680 if ( entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1681 my_entryp->fsyncp = entryp->uaiocbp;
1682 }
1683 }
1684
1685 TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
1686 if ( entryp->procp == my_entryp->procp &&
1687 entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
1688 my_entryp->fsyncp = entryp->uaiocbp;
1689 }
1690 }
1691
1692 } /* aio_mark_requests */
1693
1694
1695 /*
1696 * lio_create_sync_entry - allocate an aio_workq_entry and fill it in.
1697 * If all goes well return 0 and pass the aio_workq_entry pointer back to
1698 * our caller.
1699 * lio_listio calls behave differently at completion they do completion notification
1700 * when all async IO requests have completed. We use group_tag to tag IO requests
1701 * that behave in the delay notification manner.
1702 */
1703
1704 static int
1705 lio_create_sync_entry(proc_t procp, user_addr_t aiocbp,
1706 long group_tag, aio_workq_entry **entrypp )
1707 {
1708 aio_workq_entry *entryp;
1709 int result;
1710
1711 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1712 if ( entryp == NULL ) {
1713 result = EAGAIN;
1714 goto error_exit;
1715 }
1716 bzero( entryp, sizeof(*entryp) );
1717
1718 /* fill in the rest of the aio_workq_entry */
1719 entryp->procp = procp;
1720 entryp->uaiocbp = aiocbp;
1721 entryp->flags |= AIO_LIO;
1722 entryp->group_tag = group_tag;
1723 entryp->aio_map = VM_MAP_NULL;
1724
1725 if ( !IS_64BIT_PROCESS(procp) ) {
1726 struct aiocb aiocb32;
1727
1728 result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
1729 if ( result == 0 )
1730 do_munge_aiocb( &aiocb32, &entryp->aiocb );
1731 } else
1732 result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
1733
1734 if ( result != 0 ) {
1735 result = EAGAIN;
1736 goto error_exit;
1737 }
1738
1739 /* look for lio_listio LIO_NOP requests and ignore them. */
1740 /* Not really an error, but we need to free our aio_workq_entry. */
1741 if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
1742 result = 0;
1743 goto error_exit;
1744 }
1745
1746 result = aio_validate( entryp );
1747 if ( result != 0 ) {
1748 goto error_exit;
1749 }
1750
1751 *entrypp = entryp;
1752 return( 0 );
1753
1754 error_exit:
1755 if ( entryp != NULL )
1756 zfree( aio_workq_zonep, entryp );
1757
1758 return( result );
1759
1760 } /* lio_create_sync_entry */
1761
1762
1763 /*
1764 * aio_free_request - remove our reference on the user land map and
1765 * free the work queue entry resources.
1766 * We are not holding the lock here thus aio_map is passed in and
1767 * zeroed while we did have the lock.
1768 */
1769
1770 static int
1771 aio_free_request( aio_workq_entry *entryp, vm_map_t the_map )
1772 {
1773 /* remove our reference to the user land map. */
1774 if ( VM_MAP_NULL != the_map ) {
1775 vm_map_deallocate( the_map );
1776 }
1777
1778 zfree( aio_workq_zonep, entryp );
1779
1780 return( 0 );
1781
1782 } /* aio_free_request */
1783
1784
1785 /* aio_validate - validate the aiocb passed in by one of the aio syscalls.
1786 */
1787
1788 static int
1789 aio_validate( aio_workq_entry *entryp )
1790 {
1791 struct fileproc *fp;
1792 int flag;
1793 int result;
1794
1795 result = 0;
1796
1797 if ( (entryp->flags & AIO_LIO) != 0 ) {
1798 if ( entryp->aiocb.aio_lio_opcode == LIO_READ )
1799 entryp->flags |= AIO_READ;
1800 else if ( entryp->aiocb.aio_lio_opcode == LIO_WRITE )
1801 entryp->flags |= AIO_WRITE;
1802 else if ( entryp->aiocb.aio_lio_opcode == LIO_NOP )
1803 return( 0 );
1804 else
1805 return( EINVAL );
1806 }
1807
1808 flag = FREAD;
1809 if ( (entryp->flags & (AIO_WRITE | AIO_FSYNC)) != 0 ) {
1810 flag = FWRITE;
1811 }
1812
1813 if ( (entryp->flags & (AIO_READ | AIO_WRITE)) != 0 ) {
1814 // LP64todo - does max value for aio_nbytes need to grow?
1815 if ( entryp->aiocb.aio_nbytes > INT_MAX ||
1816 entryp->aiocb.aio_buf == USER_ADDR_NULL ||
1817 entryp->aiocb.aio_offset < 0 )
1818 return( EINVAL );
1819 }
1820
1821 /* validate aiocb.aio_sigevent. at this point we only support sigev_notify
1822 * equal to SIGEV_SIGNAL or SIGEV_NONE. this means sigev_value,
1823 * sigev_notify_function, and sigev_notify_attributes are ignored.
1824 */
1825 if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ) {
1826 int signum;
1827 /* make sure we have a valid signal number */
1828 signum = entryp->aiocb.aio_sigevent.sigev_signo;
1829 if ( signum <= 0 || signum >= NSIG ||
1830 signum == SIGKILL || signum == SIGSTOP )
1831 return (EINVAL);
1832 }
1833 else if ( entryp->aiocb.aio_sigevent.sigev_notify != SIGEV_NONE )
1834 return (EINVAL);
1835
1836 /* validate the file descriptor and that the file was opened
1837 * for the appropriate read / write access.
1838 */
1839 proc_fdlock(entryp->procp);
1840
1841 result = fp_lookup( entryp->procp, entryp->aiocb.aio_fildes, &fp , 1);
1842 if ( result == 0 ) {
1843 if ( (fp->f_fglob->fg_flag & flag) == 0 ) {
1844 /* we don't have read or write access */
1845 result = EBADF;
1846 }
1847 else if ( fp->f_fglob->fg_type != DTYPE_VNODE ) {
1848 /* this is not a file */
1849 result = ESPIPE;
1850 } else
1851 fp->f_flags |= FP_AIOISSUED;
1852
1853 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp , 1);
1854 }
1855 else {
1856 result = EBADF;
1857 }
1858
1859 proc_fdunlock(entryp->procp);
1860
1861 return( result );
1862
1863 } /* aio_validate */
1864
1865
1866 /*
1867 * aio_get_process_count - runs through our queues that hold outstanding
1868 * async IO reqests and totals up number of requests for the given
1869 * process.
1870 * NOTE - caller must hold aio lock!
1871 */
1872
1873 static int
1874 aio_get_process_count(proc_t procp )
1875 {
1876 aio_workq_entry *entryp;
1877 int count;
1878
1879 /* begin with count of completed async IO requests for this process */
1880 count = procp->aio_done_count;
1881
1882 /* add in count of active async IO requests for this process */
1883 count += procp->aio_active_count;
1884
1885 /* look for matches on our queue of asynchronous todo work */
1886 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
1887 if ( procp == entryp->procp ) {
1888 count++;
1889 }
1890 }
1891
1892 /* look for matches on our queue of synchronous todo work */
1893 TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
1894 if ( procp == entryp->procp ) {
1895 count++;
1896 }
1897 }
1898
1899 return( count );
1900
1901 } /* aio_get_process_count */
1902
1903
1904 /*
1905 * aio_get_all_queues_count - get total number of entries on all aio work queues.
1906 * NOTE - caller must hold aio lock!
1907 */
1908
1909 static int
1910 aio_get_all_queues_count( void )
1911 {
1912 int count;
1913
1914 count = aio_anchor.aio_async_workq_count;
1915 count += aio_anchor.lio_sync_workq_count;
1916 count += aio_anchor.aio_active_count;
1917 count += aio_anchor.aio_done_count;
1918
1919 return( count );
1920
1921 } /* aio_get_all_queues_count */
1922
1923
1924 /*
1925 * do_aio_completion. Handle async IO completion.
1926 */
1927
1928 static void
1929 do_aio_completion( aio_workq_entry *entryp )
1930 {
1931 /* signal user land process if appropriate */
1932 if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
1933 (entryp->flags & AIO_DISABLE) == 0 ) {
1934
1935 /*
1936 * if group_tag is non zero then make sure this is the last IO request
1937 * in the group before we signal.
1938 */
1939 if ( entryp->group_tag == 0 ||
1940 (entryp->group_tag != 0 && aio_last_group_io( entryp )) ) {
1941 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) | DBG_FUNC_NONE,
1942 (int)entryp->procp, (int)entryp->uaiocbp,
1943 entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 );
1944
1945 psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );
1946 return;
1947 }
1948 }
1949
1950 /*
1951 * need to handle case where a process is trying to exit, exec, or close
1952 * and is currently waiting for active aio requests to complete. If
1953 * AIO_WAITING is set then we need to look to see if there are any
1954 * other requests in the active queue for this process. If there are
1955 * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel. If
1956 * there are some still active then do nothing - we only want to wakeup
1957 * when all active aio requests for the process are complete.
1958 */
1959 if ( (entryp->flags & AIO_WAITING) != 0 ) {
1960 int active_requests;
1961
1962 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
1963 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1964
1965 AIO_LOCK;
1966 active_requests = aio_active_requests_for_process( entryp->procp );
1967 //AIO_UNLOCK;
1968 if ( active_requests < 1 ) {
1969 /* no active aio requests for this process, continue exiting */
1970 wakeup_one( (caddr_t) &entryp->procp->AIO_CLEANUP_SLEEP_CHAN );
1971
1972 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
1973 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1974 }
1975 AIO_UNLOCK;
1976 return;
1977 }
1978
1979 /*
1980 * aio_suspend case when a signal was not requested. In that scenario we
1981 * are sleeping on the AIO_SUSPEND_SLEEP_CHAN channel.
1982 * NOTE - the assumption here is that this wakeup call is inexpensive.
1983 * we really only need to do this when an aio_suspend call is pending.
1984 * If we find the wakeup call should be avoided we could mark the
1985 * async IO requests given in the list provided by aio_suspend and only
1986 * call wakeup for them. If we do mark them we should unmark them after
1987 * the aio_suspend wakes up.
1988 */
1989 AIO_LOCK;
1990 wakeup_one( (caddr_t) &entryp->procp->AIO_SUSPEND_SLEEP_CHAN );
1991 AIO_UNLOCK;
1992
1993 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE,
1994 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1995
1996 return;
1997
1998 } /* do_aio_completion */
1999
2000
2001 /*
2002 * aio_last_group_io - checks to see if this is the last unfinished IO request
2003 * for the given group_tag. Returns TRUE if there are no other active IO
2004 * requests for this group or FALSE if the are active IO requests
2005 * NOTE - AIO_LOCK must be held by caller
2006 */
2007
2008 static boolean_t
2009 aio_last_group_io( aio_workq_entry *entryp )
2010 {
2011 aio_workq_entry *my_entryp;
2012
2013 /* look for matches on our queue of active async IO requests */
2014 TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
2015 if ( my_entryp->group_tag == entryp->group_tag )
2016 return( FALSE );
2017 }
2018
2019 /* look for matches on our queue of asynchronous todo work */
2020 TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
2021 if ( my_entryp->group_tag == entryp->group_tag )
2022 return( FALSE );
2023 }
2024
2025 /* look for matches on our queue of synchronous todo work */
2026 TAILQ_FOREACH( my_entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
2027 if ( my_entryp->group_tag == entryp->group_tag )
2028 return( FALSE );
2029 }
2030
2031 return( TRUE );
2032
2033 } /* aio_last_group_io */
2034
2035
2036 /*
2037 * do_aio_read
2038 */
2039 static int
2040 do_aio_read( aio_workq_entry *entryp )
2041 {
2042 struct fileproc *fp;
2043 int error;
2044 struct vfs_context context;
2045
2046 if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp , 0)) )
2047 return(error);
2048 if ( (fp->f_fglob->fg_flag & FREAD) == 0 ) {
2049 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2050 return(EBADF);
2051 }
2052
2053 /*
2054 * <rdar://4714366>
2055 * Needs vfs_context_t from vfs_context_create() in entryp!
2056 */
2057 context.vc_thread = proc_thread(entryp->procp); /* XXX */
2058 context.vc_ucred = fp->f_fglob->fg_cred;
2059
2060 error = dofileread(&context, fp,
2061 entryp->aiocb.aio_buf,
2062 entryp->aiocb.aio_nbytes,
2063 entryp->aiocb.aio_offset, FOF_OFFSET,
2064 &entryp->returnval);
2065 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2066
2067 return( error );
2068
2069 } /* do_aio_read */
2070
2071
2072 /*
2073 * do_aio_write
2074 */
2075 static int
2076 do_aio_write( aio_workq_entry *entryp )
2077 {
2078 struct fileproc *fp;
2079 int error;
2080 struct vfs_context context;
2081
2082 if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp , 0)) )
2083 return(error);
2084 if ( (fp->f_fglob->fg_flag & FWRITE) == 0 ) {
2085 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2086 return(EBADF);
2087 }
2088
2089 /*
2090 * <rdar://4714366>
2091 * Needs vfs_context_t from vfs_context_create() in entryp!
2092 */
2093 context.vc_thread = proc_thread(entryp->procp); /* XXX */
2094 context.vc_ucred = fp->f_fglob->fg_cred;
2095
2096 /* NB: tell dofilewrite the offset, and to use the proc cred */
2097 error = dofilewrite(&context,
2098 fp,
2099 entryp->aiocb.aio_buf,
2100 entryp->aiocb.aio_nbytes,
2101 entryp->aiocb.aio_offset,
2102 FOF_OFFSET | FOF_PCRED,
2103 &entryp->returnval);
2104
2105 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2106
2107 return( error );
2108
2109 } /* do_aio_write */
2110
2111
2112 /*
2113 * aio_active_requests_for_process - return number of active async IO
2114 * requests for the given process.
2115 * NOTE - caller must hold aio lock!
2116 */
2117
2118 static int
2119 aio_active_requests_for_process(proc_t procp )
2120 {
2121
2122 return( procp->aio_active_count );
2123
2124 } /* aio_active_requests_for_process */
2125
2126
2127 /*
2128 * do_aio_fsync
2129 */
2130 static int
2131 do_aio_fsync( aio_workq_entry *entryp )
2132 {
2133 struct vfs_context context;
2134 struct vnode *vp;
2135 struct fileproc *fp;
2136 int error;
2137
2138 /*
2139 * NOTE - we will not support AIO_DSYNC until fdatasync() is supported.
2140 * AIO_DSYNC is caught before we queue up a request and flagged as an error.
2141 * The following was shamelessly extracted from fsync() implementation.
2142 */
2143
2144 error = fp_getfvp( entryp->procp, entryp->aiocb.aio_fildes, &fp, &vp);
2145 if ( error == 0 ) {
2146 if ( (error = vnode_getwithref(vp)) ) {
2147 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2148 entryp->returnval = -1;
2149 return(error);
2150 }
2151 context.vc_thread = current_thread();
2152 context.vc_ucred = fp->f_fglob->fg_cred;
2153
2154 error = VNOP_FSYNC( vp, MNT_WAIT, &context);
2155
2156 (void)vnode_put(vp);
2157
2158 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2159 }
2160 if ( error != 0 )
2161 entryp->returnval = -1;
2162
2163 return( error );
2164
2165 } /* do_aio_fsync */
2166
2167
2168 /*
2169 * is_already_queued - runs through our queues to see if the given
2170 * aiocbp / process is there. Returns TRUE if there is a match
2171 * on any of our aio queues.
2172 * NOTE - callers must hold aio lock!
2173 */
2174
2175 static boolean_t
2176 is_already_queued(proc_t procp,
2177 user_addr_t aiocbp )
2178 {
2179 aio_workq_entry *entryp;
2180 boolean_t result;
2181
2182 result = FALSE;
2183
2184 /* look for matches on our queue of async IO requests that have completed */
2185 TAILQ_FOREACH( entryp, &procp->aio_doneq, aio_workq_link ) {
2186 if ( aiocbp == entryp->uaiocbp ) {
2187 result = TRUE;
2188 goto ExitThisRoutine;
2189 }
2190 }
2191
2192 /* look for matches on our queue of active async IO requests */
2193 TAILQ_FOREACH( entryp, &procp->aio_activeq, aio_workq_link ) {
2194 if ( aiocbp == entryp->uaiocbp ) {
2195 result = TRUE;
2196 goto ExitThisRoutine;
2197 }
2198 }
2199
2200 /* look for matches on our queue of asynchronous todo work */
2201 TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
2202 if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
2203 result = TRUE;
2204 goto ExitThisRoutine;
2205 }
2206 }
2207
2208 /* look for matches on our queue of synchronous todo work */
2209 TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
2210 if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
2211 result = TRUE;
2212 goto ExitThisRoutine;
2213 }
2214 }
2215
2216 ExitThisRoutine:
2217 return( result );
2218
2219 } /* is_already_queued */
2220
2221
2222 /*
2223 * aio initialization
2224 */
2225 __private_extern__ void
2226 aio_init( void )
2227 {
2228 int i;
2229
2230 aio_lock_grp_attr = lck_grp_attr_alloc_init();
2231 aio_lock_grp = lck_grp_alloc_init("aio", aio_lock_grp_attr);
2232 aio_lock_attr = lck_attr_alloc_init();
2233
2234 aio_lock = lck_mtx_alloc_init(aio_lock_grp, aio_lock_attr);
2235
2236 AIO_LOCK;
2237 TAILQ_INIT( &aio_anchor.aio_async_workq );
2238 TAILQ_INIT( &aio_anchor.lio_sync_workq );
2239 aio_anchor.aio_async_workq_count = 0;
2240 aio_anchor.lio_sync_workq_count = 0;
2241 aio_anchor.aio_active_count = 0;
2242 aio_anchor.aio_done_count = 0;
2243 AIO_UNLOCK;
2244
2245 i = sizeof( aio_workq_entry );
2246 aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );
2247
2248 _aio_create_worker_threads( aio_worker_threads );
2249
2250 return;
2251
2252 } /* aio_init */
2253
2254
2255 /*
2256 * aio worker threads created here.
2257 */
2258 __private_extern__ void
2259 _aio_create_worker_threads( int num )
2260 {
2261 int i;
2262
2263 /* create some worker threads to handle the async IO requests */
2264 for ( i = 0; i < num; i++ ) {
2265 thread_t myThread;
2266
2267 myThread = kernel_thread( kernel_task, aio_work_thread );
2268 if ( THREAD_NULL == myThread ) {
2269 printf( "%s - failed to create a work thread \n", __FUNCTION__ );
2270 }
2271 }
2272
2273 return;
2274
2275 } /* _aio_create_worker_threads */
2276
2277 /*
2278 * Return the current activation utask
2279 */
2280 task_t
2281 get_aiotask(void)
2282 {
2283 return ((struct uthread *)get_bsdthread_info(current_thread()))->uu_aio_task;
2284 }
2285
2286
2287 /*
2288 * In the case of an aiocb from a
2289 * 32-bit process we need to expand some longs and pointers to the correct
2290 * sizes in order to let downstream code always work on the same type of
2291 * aiocb (in our case that is a user_aiocb)
2292 */
2293 static void
2294 do_munge_aiocb( struct aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
2295 {
2296 the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2297 the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2298 the_user_aiocbp->aio_buf = CAST_USER_ADDR_T(my_aiocbp->aio_buf);
2299 the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2300 the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2301 the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
2302
2303 /* special case here. since we do not know if sigev_value is an */
2304 /* int or a ptr we do NOT cast the ptr to a user_addr_t. This */
2305 /* means if we send this info back to user space we need to remember */
2306 /* sigev_value was not expanded for the 32-bit case. */
2307 /* NOTE - this does NOT affect us since we don't support sigev_value */
2308 /* yet in the aio context. */
2309 //LP64
2310 the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2311 the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
2312 the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2313 my_aiocbp->aio_sigevent.sigev_value.sival_int;
2314 the_user_aiocbp->aio_sigevent.sigev_notify_function =
2315 CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function);
2316 the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2317 CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_attributes);
2318 }