]> git.saurik.com Git - apple/xnu.git/blame - bsd/kern/kern_aio.c
xnu-4903.221.2.tar.gz
[apple/xnu.git] / bsd / kern / kern_aio.c
CommitLineData
55e303ae 1/*
39037602 2 * Copyright (c) 2003-2016 Apple Inc. All rights reserved.
55e303ae 3 *
2d21ac55 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
55e303ae 5 *
2d21ac55
A
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
8f6c56a5 14 *
2d21ac55
A
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
8f6c56a5
A
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
2d21ac55
A
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
8f6c56a5 25 *
2d21ac55 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
55e303ae
A
27 */
28
29
30/*
31 * todo:
32 * 1) ramesh is looking into how to replace taking a reference on
33 * the user's map (vm_map_reference()) since it is believed that
34 * would not hold the process for us.
35 * 2) david is looking into a way for us to set the priority of the
36 * worker threads to match that of the user's thread when the
37 * async IO was queued.
38 */
39
40
41/*
42 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
43 */
44
45#include <sys/systm.h>
55e303ae 46#include <sys/fcntl.h>
91447636 47#include <sys/file_internal.h>
55e303ae
A
48#include <sys/filedesc.h>
49#include <sys/kernel.h>
91447636 50#include <sys/vnode_internal.h>
55e303ae 51#include <sys/malloc.h>
91447636 52#include <sys/mount_internal.h>
55e303ae 53#include <sys/param.h>
91447636 54#include <sys/proc_internal.h>
55e303ae
A
55#include <sys/sysctl.h>
56#include <sys/unistd.h>
57#include <sys/user.h>
58
59#include <sys/aio_kern.h>
91447636 60#include <sys/sysproto.h>
55e303ae
A
61
62#include <machine/limits.h>
91447636
A
63
64#include <mach/mach_types.h>
65#include <kern/kern_types.h>
3e170ce0 66#include <kern/waitq.h>
55e303ae
A
67#include <kern/zalloc.h>
68#include <kern/task.h>
91447636
A
69#include <kern/sched_prim.h>
70
71#include <vm/vm_map.h>
55e303ae 72
b0d623f7
A
73#include <libkern/OSAtomic.h>
74
55e303ae
A
75#include <sys/kdebug.h>
/*
 * kdebug sub-codes for the AIO subsystem.  Each value is combined with
 * DBG_BSD_AIO through BSDDBG_CODE() at the KERNEL_DEBUG() call sites.
 */

/* work queue / completion events */
#define AIO_work_queued			1
#define AIO_worker_wake			2
#define AIO_completion_sig		3
#define AIO_completion_cleanup_wait	4
#define AIO_completion_cleanup_wake	5
#define AIO_completion_suspend_wake	6
#define AIO_fsync_delay			7

/* aio_cancel */
#define AIO_cancel			10
#define AIO_cancel_async_workq		11
#define AIO_cancel_sync_workq		12
#define AIO_cancel_activeq		13
#define AIO_cancel_doneq		14

/* request submission */
#define AIO_fsync			20
#define AIO_read			30
#define AIO_write			40
#define AIO_listio			50

/* aio_error */
#define AIO_error			60
#define AIO_error_val			61
#define AIO_error_activeq		62
#define AIO_error_workq			63

/* aio_return */
#define AIO_return			70
#define AIO_return_val			71
#define AIO_return_activeq		72
#define AIO_return_workq		73

/* process / descriptor teardown */
#define AIO_exec			80
#define AIO_exit			90
#define AIO_exit_sleep			91
#define AIO_close			100
#define AIO_close_sleep			101

/* aio_suspend and worker threads */
#define AIO_suspend			110
#define AIO_suspend_sleep		111
#define AIO_worker_thread		120

/*
 * Disabled by default: change to "#if 1" to route KERNEL_DEBUG through
 * KERNEL_DEBUG_CONSTANT for this file.
 */
#if 0
#undef KERNEL_DEBUG
#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
#endif
113
114/*
115 * aio requests queue up on the aio_async_workq or lio_sync_workq (for
116 * lio_listio LIO_WAIT). Requests then move to the per process aio_activeq
117 * (proc.aio_activeq) when one of our worker threads starts the IO.
118 * And finally, requests move to the per process aio_doneq (proc.aio_doneq)
119 * when the IO request completes. The request remains on aio_doneq until
120 * user process calls aio_return or the process exits, either way that is our
121 * trigger to release aio resources.
122 */
b0d623f7
A
123typedef struct aio_workq {
124 TAILQ_HEAD(, aio_workq_entry) aioq_entries;
125 int aioq_count;
126 lck_mtx_t aioq_mtx;
3e170ce0 127 struct waitq aioq_waitq;
b0d623f7
A
128} *aio_workq_t;
129
130#define AIO_NUM_WORK_QUEUES 1
55e303ae
A
131struct aio_anchor_cb
132{
b0d623f7
A
133 volatile int32_t aio_inflight_count; /* entries that have been taken from a workq */
134 volatile int32_t aio_done_count; /* entries on all done queues (proc.aio_doneq) */
135 volatile int32_t aio_total_count; /* total extant entries */
136
137 /* Hash table of queues here */
138 int aio_num_workqs;
139 struct aio_workq aio_async_workqs[AIO_NUM_WORK_QUEUES];
55e303ae
A
140};
141typedef struct aio_anchor_cb aio_anchor_cb;
142
b0d623f7
A
/*
 * Shared context for a group of list-IO requests.
 * NOTE(review): field meanings below are inferred from their names only;
 * confirm against the lio_listio() implementation.
 */
struct aio_lio_context {
	int	io_waiter;	/* presumably non-zero while a thread waits on the group */
	int	io_issued;	/* presumably the number of requests issued */
	int	io_completed;	/* presumably the number of requests completed */
};
typedef struct aio_lio_context aio_lio_context;
150
55e303ae
A
151
/*
 * Notes on aio sleep / wake channels.
 * Rather than adding dedicated fields to struct proc (not possible at this
 * time for binary compatibility reasons), the addresses of two existing proc
 * fields are used as sleep channels.  They are chosen so that they do not
 * currently collide with channels used by any other kernel routine.
 */
#define AIO_SUSPEND_SLEEP_CHAN	p_aio_active_count
#define AIO_CLEANUP_SLEEP_CHAN	p_aio_total_count
55e303ae 160
b0d623f7
A
/*
 * Panic if an aio entry found on a proc's queue does not belong to that proc.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to exactly
 * one statement.  A bare "if (...) { ... }" expansion followed by the caller's
 * ';' breaks (or silently re-binds the else) when the macro is used as the
 * unbraced body of an if/else.
 */
#define ASSERT_AIO_FROM_PROC(aiop, theproc)	\
	do { \
		if ((aiop)->procp != (theproc)) { \
			panic("AIO on a proc list that does not belong to that proc.\n"); \
		} \
	} while (0)
55e303ae
A
165
166/*
167 * LOCAL PROTOTYPES
168 */
b0d623f7
A
169static void aio_proc_lock(proc_t procp);
170static void aio_proc_lock_spin(proc_t procp);
171static void aio_proc_unlock(proc_t procp);
172static lck_mtx_t* aio_proc_mutex(proc_t procp);
173static void aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp);
174static void aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp);
175static int aio_get_process_count(proc_t procp );
176static int aio_active_requests_for_process(proc_t procp );
177static int aio_proc_active_requests_for_file(proc_t procp, int fd);
178static boolean_t is_already_queued(proc_t procp, user_addr_t aiocbp );
179static boolean_t should_cancel(aio_workq_entry *entryp, user_addr_t aiocbp, int fd);
180
181static void aio_entry_lock(aio_workq_entry *entryp);
182static void aio_entry_lock_spin(aio_workq_entry *entryp);
183static aio_workq_t aio_entry_workq(aio_workq_entry *entryp);
184static lck_mtx_t* aio_entry_mutex(__unused aio_workq_entry *entryp);
185static void aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
186static void aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
187static void aio_entry_ref_locked(aio_workq_entry *entryp);
188static void aio_entry_unref_locked(aio_workq_entry *entryp);
189static void aio_entry_ref(aio_workq_entry *entryp);
190static void aio_entry_unref(aio_workq_entry *entryp);
191static void aio_entry_update_for_cancel(aio_workq_entry *entryp, boolean_t cancelled,
192 int wait_for_completion, boolean_t disable_notification);
193static int aio_entry_try_workq_remove(aio_workq_entry *entryp);
55e303ae 194static boolean_t aio_delay_fsync_request( aio_workq_entry *entryp );
b0d623f7
A
195static int aio_free_request(aio_workq_entry *entryp);
196
197static void aio_workq_init(aio_workq_t wq);
198static void aio_workq_lock_spin(aio_workq_t wq);
199static void aio_workq_unlock(aio_workq_t wq);
200static lck_mtx_t* aio_workq_mutex(aio_workq_t wq);
201
202static void aio_work_thread( void );
203static aio_workq_entry *aio_get_some_work( void );
204
205static int aio_get_all_queues_count( void );
206static int aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO );
207static int aio_validate( aio_workq_entry *entryp );
208static int aio_increment_total_count(void);
209static int aio_decrement_total_count(void);
210
211static int do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp, int wait_for_completion, boolean_t disable_notification );
212static void do_aio_completion( aio_workq_entry *entryp );
213static int do_aio_fsync( aio_workq_entry *entryp );
214static int do_aio_read( aio_workq_entry *entryp );
215static int do_aio_write( aio_workq_entry *entryp );
216static void do_munge_aiocb_user32_to_user( struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
217static void do_munge_aiocb_user64_to_user( struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
218static int lio_create_entry(proc_t procp,
219 user_addr_t aiocbp,
220 void *group_tag,
221 aio_workq_entry **entrypp );
222static aio_workq_entry *aio_create_queue_entry(proc_t procp,
223 user_addr_t aiocbp,
224 void *group_tag,
225 int kindOfIO);
226static user_addr_t *aio_copy_in_list(proc_t procp, user_addr_t aiocblist, int nent);
227static void free_lio_context(aio_lio_context* context);
228static void aio_enqueue_work( proc_t procp, aio_workq_entry *entryp, int proc_locked);
229
/*
 * Lock ownership assertions: each checks, via lck_mtx_assert(), that the
 * mutex guarding the given proc / work queue / entry is owned.
 */
#define ASSERT_AIO_PROC_LOCK_OWNED(p)	lck_mtx_assert(aio_proc_mutex((p)), LCK_MTX_ASSERT_OWNED)
#define ASSERT_AIO_WORKQ_LOCK_OWNED(q)	lck_mtx_assert(aio_workq_mutex((q)), LCK_MTX_ASSERT_OWNED)
#define ASSERT_AIO_ENTRY_LOCK_OWNED(e)	lck_mtx_assert(aio_entry_mutex((e)), LCK_MTX_ASSERT_OWNED)
91447636 233
55e303ae
A
234/*
235 * EXTERNAL PROTOTYPES
236 */
237
238/* in ...bsd/kern/sys_generic.c */
b0d623f7
A
239extern int dofileread(vfs_context_t ctx, struct fileproc *fp,
240 user_addr_t bufp, user_size_t nbyte,
241 off_t offset, int flags, user_ssize_t *retval );
242extern int dofilewrite(vfs_context_t ctx, struct fileproc *fp,
243 user_addr_t bufp, user_size_t nbyte, off_t offset,
244 int flags, user_ssize_t *retval );
245#if DEBUG
246static uint32_t lio_contexts_alloced = 0;
247#endif /* DEBUG */
55e303ae
A
248
249/*
250 * aio external global variables.
251 */
extern int	aio_max_requests;		/* AIO_MAX - configurable */
extern int	aio_max_requests_per_process;	/* AIO_PROCESS_MAX - configurable */
extern int	aio_worker_threads;		/* AIO_THREAD_COUNT - configurable */
55e303ae
A
255
256
257/*
258 * aio static variables.
259 */
b0d623f7
A
260static aio_anchor_cb aio_anchor;
261static lck_grp_t *aio_proc_lock_grp;
262static lck_grp_t *aio_entry_lock_grp;
263static lck_grp_t *aio_queue_lock_grp;
264static lck_attr_t *aio_lock_attr;
265static lck_grp_attr_t *aio_lock_grp_attr;
266static struct zone *aio_workq_zonep;
267static lck_mtx_t aio_entry_mtx;
268static lck_mtx_t aio_proc_mtx;
269
270static void
271aio_entry_lock(__unused aio_workq_entry *entryp)
272{
273 lck_mtx_lock(&aio_entry_mtx);
274}
275
276static void
277aio_entry_lock_spin(__unused aio_workq_entry *entryp)
278{
279 lck_mtx_lock_spin(&aio_entry_mtx);
280}
281
282static void
283aio_entry_unlock(__unused aio_workq_entry *entryp)
284{
285 lck_mtx_unlock(&aio_entry_mtx);
286}
287
288/* Hash */
289static aio_workq_t
290aio_entry_workq(__unused aio_workq_entry *entryp)
291{
292 return &aio_anchor.aio_async_workqs[0];
293}
294
295static lck_mtx_t*
296aio_entry_mutex(__unused aio_workq_entry *entryp)
297{
298 return &aio_entry_mtx;
299}
300
301static void
302aio_workq_init(aio_workq_t wq)
303{
304 TAILQ_INIT(&wq->aioq_entries);
305 wq->aioq_count = 0;
306 lck_mtx_init(&wq->aioq_mtx, aio_queue_lock_grp, aio_lock_attr);
39037602 307 waitq_init(&wq->aioq_waitq, SYNC_POLICY_FIFO);
b0d623f7
A
308}
309
310
311/*
312 * Can be passed a queue which is locked spin.
313 */
314static void
315aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
316{
317 ASSERT_AIO_WORKQ_LOCK_OWNED(queue);
318
319 if (entryp->aio_workq_link.tqe_prev == NULL) {
320 panic("Trying to remove an entry from a work queue, but it is not on a queue\n");
321 }
322
323 TAILQ_REMOVE(&queue->aioq_entries, entryp, aio_workq_link);
324 queue->aioq_count--;
325 entryp->aio_workq_link.tqe_prev = NULL; /* Not on a workq */
326
327 if (queue->aioq_count < 0) {
328 panic("Negative count on a queue.\n");
329 }
330}
331
332static void
333aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
334{
335 ASSERT_AIO_WORKQ_LOCK_OWNED(queue);
336
337 TAILQ_INSERT_TAIL(&queue->aioq_entries, entryp, aio_workq_link);
338 if (queue->aioq_count < 0) {
339 panic("Negative count on a queue.\n");
340 }
341 queue->aioq_count++;
342}
343
344static void
345aio_proc_lock(proc_t procp)
346{
347 lck_mtx_lock(aio_proc_mutex(procp));
348}
349
350static void
351aio_proc_lock_spin(proc_t procp)
352{
353 lck_mtx_lock_spin(aio_proc_mutex(procp));
354}
355
356static void
357aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp)
358{
359 ASSERT_AIO_PROC_LOCK_OWNED(procp);
360
361 TAILQ_REMOVE(&procp->p_aio_activeq, entryp, aio_proc_link );
362 TAILQ_INSERT_TAIL( &procp->p_aio_doneq, entryp, aio_proc_link);
363 procp->p_aio_active_count--;
364 OSIncrementAtomic(&aio_anchor.aio_done_count);
365}
366
367static void
368aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp)
369{
370 TAILQ_REMOVE(&procp->p_aio_doneq, entryp, aio_proc_link);
371 OSDecrementAtomic(&aio_anchor.aio_done_count);
372 aio_decrement_total_count();
373 procp->p_aio_total_count--;
374}
375
376static void
377aio_proc_unlock(proc_t procp)
378{
379 lck_mtx_unlock(aio_proc_mutex(procp));
380}
381
382static lck_mtx_t*
383aio_proc_mutex(proc_t procp)
384{
385 return &procp->p_mlock;
386}
387
388static void
389aio_entry_ref_locked(aio_workq_entry *entryp)
390{
391 ASSERT_AIO_ENTRY_LOCK_OWNED(entryp);
392
393 if (entryp->aio_refcount < 0) {
394 panic("AIO workq entry with a negative refcount.\n");
395 }
396 entryp->aio_refcount++;
397}
398
399
400/* Return 1 if you've freed it */
401static void
402aio_entry_unref_locked(aio_workq_entry *entryp)
403{
404 ASSERT_AIO_ENTRY_LOCK_OWNED(entryp);
405
406 entryp->aio_refcount--;
407 if (entryp->aio_refcount < 0) {
408 panic("AIO workq entry with a negative refcount.\n");
409 }
410}
411
412static void
413aio_entry_ref(aio_workq_entry *entryp)
414{
415 aio_entry_lock_spin(entryp);
416 aio_entry_ref_locked(entryp);
417 aio_entry_unlock(entryp);
418}
419static void
420aio_entry_unref(aio_workq_entry *entryp)
421{
422 aio_entry_lock_spin(entryp);
423 aio_entry_unref_locked(entryp);
424
425 if ((entryp->aio_refcount == 0) && ((entryp->flags & AIO_DO_FREE) != 0)) {
426 aio_entry_unlock(entryp);
427 aio_free_request(entryp);
428 } else {
429 aio_entry_unlock(entryp);
430 }
431
432 return;
433}
434
435static void
436aio_entry_update_for_cancel(aio_workq_entry *entryp, boolean_t cancelled, int wait_for_completion, boolean_t disable_notification)
437{
438 aio_entry_lock_spin(entryp);
439
440 if (cancelled) {
441 aio_entry_ref_locked(entryp);
442 entryp->errorval = ECANCELED;
443 entryp->returnval = -1;
444 }
445
446 if ( wait_for_completion ) {
447 entryp->flags |= wait_for_completion; /* flag for special completion processing */
448 }
449
450 if ( disable_notification ) {
451 entryp->flags |= AIO_DISABLE; /* Don't want a signal */
452 }
453
454 aio_entry_unlock(entryp);
455}
456
457static int
458aio_entry_try_workq_remove(aio_workq_entry *entryp)
459{
460 /* Can only be cancelled if it's still on a work queue */
461 if (entryp->aio_workq_link.tqe_prev != NULL) {
462 aio_workq_t queue;
463
464 /* Will have to check again under the lock */
465 queue = aio_entry_workq(entryp);
466 aio_workq_lock_spin(queue);
467 if (entryp->aio_workq_link.tqe_prev != NULL) {
468 aio_workq_remove_entry_locked(queue, entryp);
469 aio_workq_unlock(queue);
470 return 1;
471 } else {
472 aio_workq_unlock(queue);
473 }
474 }
55e303ae 475
b0d623f7
A
476 return 0;
477}
478
479static void
480aio_workq_lock_spin(aio_workq_t wq)
481{
482 lck_mtx_lock_spin(aio_workq_mutex(wq));
483}
55e303ae 484
b0d623f7
A
485static void
486aio_workq_unlock(aio_workq_t wq)
487{
488 lck_mtx_unlock(aio_workq_mutex(wq));
489}
55e303ae 490
b0d623f7
A
491static lck_mtx_t*
492aio_workq_mutex(aio_workq_t wq)
493{
494 return &wq->aioq_mtx;
495}
55e303ae
A
496
497/*
498 * aio_cancel - attempt to cancel one or more async IO requests currently
499 * outstanding against file descriptor uap->fd. If uap->aiocbp is not
500 * NULL then only one specific IO is cancelled (if possible). If uap->aiocbp
501 * is NULL then all outstanding async IO request for the given file
502 * descriptor are cancelled (if possible).
503 */
55e303ae 504int
2d21ac55 505aio_cancel(proc_t p, struct aio_cancel_args *uap, int *retval )
55e303ae 506{
91447636 507 struct user_aiocb my_aiocb;
55e303ae 508 int result;
55e303ae
A
509
510 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START,
511 (int)p, (int)uap->aiocbp, 0, 0, 0 );
512
513 /* quick check to see if there are any async IO requests queued up */
b0d623f7 514 if (aio_get_all_queues_count() < 1) {
2d21ac55
A
515 result = 0;
516 *retval = AIO_ALLDONE;
55e303ae
A
517 goto ExitRoutine;
518 }
519
520 *retval = -1;
91447636 521 if ( uap->aiocbp != USER_ADDR_NULL ) {
b0d623f7
A
522 if ( proc_is64bit(p) ) {
523 struct user64_aiocb aiocb64;
524
525 result = copyin( uap->aiocbp, &aiocb64, sizeof(aiocb64) );
526 if (result == 0 )
527 do_munge_aiocb_user64_to_user(&aiocb64, &my_aiocb);
528
529 } else {
530 struct user32_aiocb aiocb32;
91447636
A
531
532 result = copyin( uap->aiocbp, &aiocb32, sizeof(aiocb32) );
533 if ( result == 0 )
b0d623f7
A
534 do_munge_aiocb_user32_to_user( &aiocb32, &my_aiocb );
535 }
91447636 536
55e303ae
A
537 if ( result != 0 ) {
538 result = EAGAIN;
539 goto ExitRoutine;
540 }
541
542 /* NOTE - POSIX standard says a mismatch between the file */
543 /* descriptor passed in and the file descriptor embedded in */
544 /* the aiocb causes unspecified results. We return EBADF in */
545 /* that situation. */
546 if ( uap->fd != my_aiocb.aio_fildes ) {
547 result = EBADF;
548 goto ExitRoutine;
549 }
550 }
b0d623f7
A
551
552 aio_proc_lock(p);
553 result = do_aio_cancel_locked( p, uap->fd, uap->aiocbp, 0, FALSE );
554 ASSERT_AIO_PROC_LOCK_OWNED(p);
555 aio_proc_unlock(p);
55e303ae
A
556
557 if ( result != -1 ) {
558 *retval = result;
559 result = 0;
560 goto ExitRoutine;
561 }
562
563 result = EBADF;
564
565ExitRoutine:
566 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END,
567 (int)p, (int)uap->aiocbp, result, 0, 0 );
568
569 return( result );
570
571} /* aio_cancel */
572
573
574/*
575 * _aio_close - internal function used to clean up async IO requests for
576 * a file descriptor that is closing.
55e303ae
A
577 * THIS MAY BLOCK.
578 */
55e303ae 579__private_extern__ void
2d21ac55 580_aio_close(proc_t p, int fd )
55e303ae 581{
b0d623f7 582 int error;
55e303ae
A
583
584 /* quick check to see if there are any async IO requests queued up */
b0d623f7 585 if (aio_get_all_queues_count() < 1) {
55e303ae 586 return;
b0d623f7 587 }
55e303ae
A
588
589 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_START,
590 (int)p, fd, 0, 0, 0 );
591
592 /* cancel all async IO requests on our todo queues for this file descriptor */
b0d623f7
A
593 aio_proc_lock(p);
594 error = do_aio_cancel_locked( p, fd, 0, AIO_CLOSE_WAIT, FALSE );
595 ASSERT_AIO_PROC_LOCK_OWNED(p);
55e303ae
A
596 if ( error == AIO_NOTCANCELED ) {
597 /*
598 * AIO_NOTCANCELED is returned when we find an aio request for this process
599 * and file descriptor on the active async IO queue. Active requests cannot
600 * be cancelled so we must wait for them to complete. We will get a special
601 * wake up call on our channel used to sleep for ALL active requests to
602 * complete. This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
603 * when we must wait for all active aio requests.
604 */
605
606 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) | DBG_FUNC_NONE,
607 (int)p, fd, 0, 0, 0 );
608
b0d623f7 609 while (aio_proc_active_requests_for_file(p, fd) > 0) {
39236c6e 610 msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_close", 0 );
b0d623f7
A
611 }
612
55e303ae 613 }
39236c6e
A
614
615 aio_proc_unlock(p);
616
55e303ae
A
617 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_END,
618 (int)p, fd, 0, 0, 0 );
619
620 return;
621
622} /* _aio_close */
623
624
625/*
626 * aio_error - return the error status associated with the async IO
627 * request referred to by uap->aiocbp. The error status is the errno
628 * value that would be set by the corresponding IO request (read, wrtie,
629 * fdatasync, or sync).
630 */
55e303ae 631int
2d21ac55 632aio_error(proc_t p, struct aio_error_args *uap, int *retval )
55e303ae
A
633{
634 aio_workq_entry *entryp;
635 int error;
636
637 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_START,
638 (int)p, (int)uap->aiocbp, 0, 0, 0 );
639
b0d623f7
A
640 /* see if there are any aios to check */
641 if (aio_get_all_queues_count() < 1) {
642 return EINVAL;
55e303ae
A
643 }
644
b0d623f7
A
645 aio_proc_lock(p);
646
55e303ae 647 /* look for a match on our queue of async IO requests that have completed */
b0d623f7 648 TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
55e303ae 649 if ( entryp->uaiocbp == uap->aiocbp ) {
b0d623f7
A
650 ASSERT_AIO_FROM_PROC(entryp, p);
651
652 aio_entry_lock_spin(entryp);
55e303ae
A
653 *retval = entryp->errorval;
654 error = 0;
b0d623f7 655 aio_entry_unlock(entryp);
55e303ae
A
656 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) | DBG_FUNC_NONE,
657 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
658 goto ExitRoutine;
659 }
660 }
661
662 /* look for a match on our queue of active async IO requests */
b0d623f7 663 TAILQ_FOREACH( entryp, &p->p_aio_activeq, aio_proc_link) {
55e303ae 664 if ( entryp->uaiocbp == uap->aiocbp ) {
b0d623f7 665 ASSERT_AIO_FROM_PROC(entryp, p);
55e303ae
A
666 *retval = EINPROGRESS;
667 error = 0;
668 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) | DBG_FUNC_NONE,
669 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
670 goto ExitRoutine;
671 }
672 }
b0d623f7 673
55e303ae
A
674 error = EINVAL;
675
676ExitRoutine:
677 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_END,
678 (int)p, (int)uap->aiocbp, error, 0, 0 );
b0d623f7 679 aio_proc_unlock(p);
55e303ae
A
680
681 return( error );
682
683} /* aio_error */
684
685
686/*
687 * aio_fsync - asynchronously force all IO operations associated
688 * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
689 * queued at the time of the call to the synchronized completion state.
690 * NOTE - we do not support op O_DSYNC at this point since we do not support the
691 * fdatasync() call.
692 */
55e303ae 693int
2d21ac55 694aio_fsync(proc_t p, struct aio_fsync_args *uap, int *retval )
55e303ae
A
695{
696 int error;
697 int fsync_kind;
698
699 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_START,
700 (int)p, (int)uap->aiocbp, uap->op, 0, 0 );
701
702 *retval = 0;
91447636
A
703 /* 0 := O_SYNC for binary backward compatibility with Panther */
704 if (uap->op == O_SYNC || uap->op == 0)
55e303ae 705 fsync_kind = AIO_FSYNC;
55e303ae
A
706 else if ( uap->op == O_DSYNC )
707 fsync_kind = AIO_DSYNC;
55e303ae
A
708 else {
709 *retval = -1;
710 error = EINVAL;
711 goto ExitRoutine;
712 }
713
714 error = aio_queue_async_request( p, uap->aiocbp, fsync_kind );
715 if ( error != 0 )
716 *retval = -1;
717
718ExitRoutine:
719 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_END,
720 (int)p, (int)uap->aiocbp, error, 0, 0 );
721
722 return( error );
723
724} /* aio_fsync */
725
726
727/* aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
728 * file descriptor (uap->aiocbp->aio_fildes) into the buffer
729 * (uap->aiocbp->aio_buf).
730 */
55e303ae 731int
2d21ac55 732aio_read(proc_t p, struct aio_read_args *uap, int *retval )
55e303ae
A
733{
734 int error;
735
736 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_START,
737 (int)p, (int)uap->aiocbp, 0, 0, 0 );
738
739 *retval = 0;
740
741 error = aio_queue_async_request( p, uap->aiocbp, AIO_READ );
742 if ( error != 0 )
743 *retval = -1;
744
745 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_END,
746 (int)p, (int)uap->aiocbp, error, 0, 0 );
747
748 return( error );
749
750} /* aio_read */
751
752
753/*
754 * aio_return - return the return status associated with the async IO
755 * request referred to by uap->aiocbp. The return status is the value
b0d623f7 756 * that would be returned by corresponding IO request (read, write,
55e303ae
A
757 * fdatasync, or sync). This is where we release kernel resources
758 * held for async IO call associated with the given aiocb pointer.
759 */
55e303ae 760int
2d21ac55 761aio_return(proc_t p, struct aio_return_args *uap, user_ssize_t *retval )
55e303ae
A
762{
763 aio_workq_entry *entryp;
764 int error;
b0d623f7 765 boolean_t proc_lock_held = FALSE;
55e303ae
A
766
767 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_START,
768 (int)p, (int)uap->aiocbp, 0, 0, 0 );
769
b0d623f7
A
770 /* See if there are any entries to check */
771 if (aio_get_all_queues_count() < 1) {
55e303ae
A
772 error = EINVAL;
773 goto ExitRoutine;
774 }
775
b0d623f7
A
776 aio_proc_lock(p);
777 proc_lock_held = TRUE;
778 *retval = 0;
779
55e303ae 780 /* look for a match on our queue of async IO requests that have completed */
b0d623f7
A
781 TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
782 ASSERT_AIO_FROM_PROC(entryp, p);
55e303ae 783 if ( entryp->uaiocbp == uap->aiocbp ) {
b0d623f7
A
784 /* Done and valid for aio_return(), pull it off the list */
785 aio_proc_remove_done_locked(p, entryp);
55e303ae 786
b0d623f7
A
787 /* Drop the proc lock, but keep the entry locked */
788 aio_entry_lock(entryp);
789 aio_proc_unlock(p);
790 proc_lock_held = FALSE;
791
55e303ae 792 *retval = entryp->returnval;
b0d623f7 793 error = 0;
55e303ae 794
b0d623f7
A
795 /* No references and off all lists, safe to free */
796 if (entryp->aio_refcount == 0) {
797 aio_entry_unlock(entryp);
798 aio_free_request(entryp);
55e303ae 799 }
b0d623f7
A
800 else {
801 /* Whoever has the refcount will have to free it */
55e303ae 802 entryp->flags |= AIO_DO_FREE;
b0d623f7
A
803 aio_entry_unlock(entryp);
804 }
805
806
55e303ae
A
807 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) | DBG_FUNC_NONE,
808 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
809 goto ExitRoutine;
810 }
811 }
812
813 /* look for a match on our queue of active async IO requests */
b0d623f7
A
814 TAILQ_FOREACH( entryp, &p->p_aio_activeq, aio_proc_link) {
815 ASSERT_AIO_FROM_PROC(entryp, p);
55e303ae
A
816 if ( entryp->uaiocbp == uap->aiocbp ) {
817 error = EINPROGRESS;
818 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) | DBG_FUNC_NONE,
819 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
820 goto ExitRoutine;
821 }
822 }
823
55e303ae
A
824 error = EINVAL;
825
826ExitRoutine:
b0d623f7
A
827 if (proc_lock_held)
828 aio_proc_unlock(p);
55e303ae
A
829 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_END,
830 (int)p, (int)uap->aiocbp, error, 0, 0 );
831
832 return( error );
833
834} /* aio_return */
835
836
837/*
838 * _aio_exec - internal function used to clean up async IO requests for
839 * a process that is going away due to exec(). We cancel any async IOs
840 * we can and wait for those already active. We also disable signaling
841 * for cancelled or active aio requests that complete.
55e303ae
A
842 * This routine MAY block!
843 */
55e303ae 844__private_extern__ void
2d21ac55 845_aio_exec(proc_t p )
55e303ae
A
846{
847
848 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_START,
849 (int)p, 0, 0, 0, 0 );
850
851 _aio_exit( p );
852
853 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_END,
854 (int)p, 0, 0, 0, 0 );
855
856 return;
857
858} /* _aio_exec */
859
860
861/*
862 * _aio_exit - internal function used to clean up async IO requests for
863 * a process that is terminating (via exit() or exec() ). We cancel any async IOs
864 * we can and wait for those already active. We also disable signaling
865 * for cancelled or active aio requests that complete. This routine MAY block!
55e303ae 866 */
55e303ae 867__private_extern__ void
2d21ac55 868_aio_exit(proc_t p )
55e303ae 869{
b0d623f7 870 int error;
55e303ae
A
871 aio_workq_entry *entryp;
872
b0d623f7 873
55e303ae 874 /* quick check to see if there are any async IO requests queued up */
b0d623f7 875 if (aio_get_all_queues_count() < 1) {
55e303ae
A
876 return;
877 }
878
879 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_START,
880 (int)p, 0, 0, 0, 0 );
881
b0d623f7
A
882 aio_proc_lock(p);
883
55e303ae
A
884 /*
885 * cancel async IO requests on the todo work queue and wait for those
886 * already active to complete.
887 */
b0d623f7
A
888 error = do_aio_cancel_locked( p, 0, 0, AIO_EXIT_WAIT, TRUE );
889 ASSERT_AIO_PROC_LOCK_OWNED(p);
55e303ae
A
890 if ( error == AIO_NOTCANCELED ) {
891 /*
892 * AIO_NOTCANCELED is returned when we find an aio request for this process
893 * on the active async IO queue. Active requests cannot be cancelled so we
894 * must wait for them to complete. We will get a special wake up call on
895 * our channel used to sleep for ALL active requests to complete. This sleep
896 * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
897 * active aio requests.
898 */
899
900 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) | DBG_FUNC_NONE,
901 (int)p, 0, 0, 0, 0 );
902
b0d623f7
A
903 while (p->p_aio_active_count != 0) {
904 msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_exit", 0 );
905 }
906 }
907
908 if (p->p_aio_active_count != 0) {
909 panic("Exiting process has %d active AIOs after cancellation has completed.\n", p->p_aio_active_count);
55e303ae
A
910 }
911
912 /* release all aio resources used by this process */
b0d623f7 913 entryp = TAILQ_FIRST( &p->p_aio_doneq );
55e303ae 914 while ( entryp != NULL ) {
b0d623f7 915 ASSERT_AIO_FROM_PROC(entryp, p);
55e303ae
A
916 aio_workq_entry *next_entryp;
917
b0d623f7
A
918 next_entryp = TAILQ_NEXT( entryp, aio_proc_link);
919 aio_proc_remove_done_locked(p, entryp);
55e303ae
A
920
921 /* we cannot free requests that are still completing */
b0d623f7
A
922 aio_entry_lock_spin(entryp);
923 if (entryp->aio_refcount == 0) {
924 aio_proc_unlock(p);
925 aio_entry_unlock(entryp);
926 aio_free_request(entryp);
55e303ae
A
927
928 /* need to start over since aio_doneq may have been */
929 /* changed while we were away. */
b0d623f7
A
930 aio_proc_lock(p);
931 entryp = TAILQ_FIRST( &p->p_aio_doneq );
55e303ae
A
932 continue;
933 }
b0d623f7
A
934 else {
935 /* whoever has the reference will have to do the free */
55e303ae 936 entryp->flags |= AIO_DO_FREE;
b0d623f7
A
937 }
938
939 aio_entry_unlock(entryp);
55e303ae
A
940 entryp = next_entryp;
941 }
b0d623f7
A
942
943 aio_proc_unlock(p);
944
55e303ae
A
945 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END,
946 (int)p, 0, 0, 0, 0 );
55e303ae
A
947 return;
948
949} /* _aio_exit */
950
951
b0d623f7
A
952static boolean_t
953should_cancel(aio_workq_entry *entryp, user_addr_t aiocbp, int fd)
954{
955 if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
956 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
957 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
958 return TRUE;
959 }
960
961 return FALSE;
962}
963
55e303ae 964/*
b0d623f7 965 * do_aio_cancel_locked - cancel async IO requests (if possible). We get called by
55e303ae
A
966 * aio_cancel, close, and at exit.
967 * There are three modes of operation: 1) cancel all async IOs for a process -
968 * fd is 0 and aiocbp is NULL 2) cancel all async IOs for file descriptor - fd
969 * is > 0 and aiocbp is NULL 3) cancel one async IO associated with the given
970 * aiocbp.
971 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
972 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
973 * target async IO requests, and AIO_ALLDONE if all target async IO requests
974 * were already complete.
975 * WARNING - do not deference aiocbp in this routine, it may point to user
976 * land data that has not been copied in (when called from aio_cancel() )
b0d623f7
A
977 *
978 * Called with proc locked, and returns the same way.
55e303ae 979 */
55e303ae 980static int
b0d623f7
A
981do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp,
982 int wait_for_completion, boolean_t disable_notification )
55e303ae 983{
b0d623f7
A
984 ASSERT_AIO_PROC_LOCK_OWNED(p);
985
55e303ae
A
986 aio_workq_entry *entryp;
987 int result;
988
989 result = -1;
990
991 /* look for a match on our queue of async todo work. */
b0d623f7 992 entryp = TAILQ_FIRST(&p->p_aio_activeq);
55e303ae 993 while ( entryp != NULL ) {
b0d623f7 994 ASSERT_AIO_FROM_PROC(entryp, p);
55e303ae 995 aio_workq_entry *next_entryp;
55e303ae 996
b0d623f7
A
997 next_entryp = TAILQ_NEXT( entryp, aio_proc_link);
998 if (!should_cancel(entryp, aiocbp, fd)) {
999 entryp = next_entryp;
1000 continue;
55e303ae 1001 }
b0d623f7
A
1002
1003 /* Can only be cancelled if it's still on a work queue */
1004 if (aio_entry_try_workq_remove(entryp) != 0) {
1005 /* Have removed from workq. Update entry state and take a ref */
1006 aio_entry_update_for_cancel(entryp, TRUE, 0, disable_notification);
1007
1008 /* Put on the proc done queue and update counts, then unlock the proc */
1009 aio_proc_move_done_locked(p, entryp);
1010 aio_proc_unlock(p);
1011
1012 /* Now it's officially cancelled. Do the completion */
1013 result = AIO_CANCELED;
1014 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) | DBG_FUNC_NONE,
1015 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
1016 do_aio_completion(entryp);
1017
1018 /* This will free if the aio_return() has already happened ... */
1019 aio_entry_unref(entryp);
1020 aio_proc_lock(p);
1021
1022 if ( aiocbp != USER_ADDR_NULL ) {
1023 return( result );
55e303ae 1024 }
55e303ae 1025
b0d623f7
A
1026 /*
1027 * Restart from the head of the proc active queue since it
1028 * may have been changed while we were away doing completion
1029 * processing.
1030 *
1031 * Note that if we found an uncancellable AIO before, we will
1032 * either find it again or discover that it's been completed,
1033 * so resetting the result will not cause us to return success
1034 * despite outstanding AIOs.
1035 */
1036 entryp = TAILQ_FIRST(&p->p_aio_activeq);
1037 result = -1; /* As if beginning anew */
1038 } else {
1039 /*
1040 * It's been taken off the active queue already, i.e. is in flight.
1041 * All we can do is ask for notification.
1042 */
55e303ae
A
1043 result = AIO_NOTCANCELED;
1044
1045 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE,
b0d623f7
A
1046 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
1047
1048 /* Mark for waiting and such; will not take a ref if "cancelled" arg is FALSE */
1049 aio_entry_update_for_cancel(entryp, FALSE, wait_for_completion, disable_notification);
55e303ae 1050
91447636 1051 if ( aiocbp != USER_ADDR_NULL ) {
55e303ae
A
1052 return( result );
1053 }
b0d623f7 1054 entryp = next_entryp;
55e303ae 1055 }
b0d623f7
A
1056 } /* while... */
1057
55e303ae
A
1058 /*
1059 * if we didn't find any matches on the todo or active queues then look for a
1060 * match on our queue of async IO requests that have completed and if found
1061 * return AIO_ALLDONE result.
b0d623f7
A
1062 *
1063 * Proc AIO lock is still held.
55e303ae
A
1064 */
1065 if ( result == -1 ) {
b0d623f7
A
1066 TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
1067 ASSERT_AIO_FROM_PROC(entryp, p);
1068 if (should_cancel(entryp, aiocbp, fd)) {
55e303ae 1069 result = AIO_ALLDONE;
55e303ae 1070 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE,
b0d623f7 1071 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
55e303ae 1072
91447636 1073 if ( aiocbp != USER_ADDR_NULL ) {
55e303ae
A
1074 return( result );
1075 }
1076 }
1077 }
1078 }
55e303ae
A
1079
1080 return( result );
1081
b0d623f7
A
1082}
1083 /* do_aio_cancel_locked */
55e303ae
A
1084
1085
1086/*
1087 * aio_suspend - suspend the calling thread until at least one of the async
1088 * IO operations referenced by uap->aiocblist has completed, until a signal
1089 * interrupts the function, or uap->timeoutp time interval (optional) has
1090 * passed.
1091 * Returns 0 if one or more async IOs have completed else -1 and errno is
1092 * set appropriately - EAGAIN if timeout elapses or EINTR if an interrupt
1093 * woke us up.
1094 */
2d21ac55
A
1095int
1096aio_suspend(proc_t p, struct aio_suspend_args *uap, int *retval )
1097{
1098 __pthread_testcancel(1);
1099 return(aio_suspend_nocancel(p, (struct aio_suspend_nocancel_args *)uap, retval));
1100}
1101
55e303ae
A
1102
1103int
2d21ac55 1104aio_suspend_nocancel(proc_t p, struct aio_suspend_nocancel_args *uap, int *retval )
55e303ae
A
1105{
1106 int error;
1107 int i, count;
1108 uint64_t abstime;
91447636 1109 struct user_timespec ts;
55e303ae 1110 aio_workq_entry *entryp;
91447636 1111 user_addr_t *aiocbpp;
55e303ae
A
1112
1113 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START,
1114 (int)p, uap->nent, 0, 0, 0 );
1115
1116 *retval = -1;
1117 abstime = 0;
1118 aiocbpp = NULL;
1119
b0d623f7 1120 count = aio_get_all_queues_count( );
55e303ae
A
1121 if ( count < 1 ) {
1122 error = EINVAL;
1123 goto ExitThisRoutine;
1124 }
1125
91447636 1126 if ( uap->nent < 1 || uap->nent > aio_max_requests_per_process ) {
55e303ae
A
1127 error = EINVAL;
1128 goto ExitThisRoutine;
1129 }
1130
91447636
A
1131 if ( uap->timeoutp != USER_ADDR_NULL ) {
1132 if ( proc_is64bit(p) ) {
b0d623f7
A
1133 struct user64_timespec temp;
1134 error = copyin( uap->timeoutp, &temp, sizeof(temp) );
1135 if ( error == 0 ) {
1136 ts.tv_sec = temp.tv_sec;
1137 ts.tv_nsec = temp.tv_nsec;
1138 }
91447636
A
1139 }
1140 else {
b0d623f7 1141 struct user32_timespec temp;
91447636
A
1142 error = copyin( uap->timeoutp, &temp, sizeof(temp) );
1143 if ( error == 0 ) {
1144 ts.tv_sec = temp.tv_sec;
1145 ts.tv_nsec = temp.tv_nsec;
1146 }
1147 }
55e303ae
A
1148 if ( error != 0 ) {
1149 error = EAGAIN;
1150 goto ExitThisRoutine;
1151 }
1152
2d21ac55 1153 if ( ts.tv_sec < 0 || ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000 ) {
55e303ae
A
1154 error = EINVAL;
1155 goto ExitThisRoutine;
1156 }
1157
1158 nanoseconds_to_absolutetime( (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
1159 &abstime );
1160 clock_absolutetime_interval_to_deadline( abstime, &abstime );
1161 }
1162
b0d623f7 1163 aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent);
55e303ae
A
1164 if ( aiocbpp == NULL ) {
1165 error = EAGAIN;
1166 goto ExitThisRoutine;
1167 }
1168
91447636 1169 /* check list of aio requests to see if any have completed */
2d21ac55 1170check_for_our_aiocbp:
b0d623f7 1171 aio_proc_lock_spin(p);
91447636
A
1172 for ( i = 0; i < uap->nent; i++ ) {
1173 user_addr_t aiocbp;
1174
55e303ae
A
1175 /* NULL elements are legal so check for 'em */
1176 aiocbp = *(aiocbpp + i);
91447636 1177 if ( aiocbp == USER_ADDR_NULL )
55e303ae 1178 continue;
91447636 1179
55e303ae 1180 /* return immediately if any aio request in the list is done */
b0d623f7
A
1181 TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
1182 ASSERT_AIO_FROM_PROC(entryp, p);
55e303ae 1183 if ( entryp->uaiocbp == aiocbp ) {
b0d623f7 1184 aio_proc_unlock(p);
55e303ae
A
1185 *retval = 0;
1186 error = 0;
55e303ae
A
1187 goto ExitThisRoutine;
1188 }
1189 }
55e303ae
A
1190 } /* for ( ; i < uap->nent; ) */
1191
1192 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE,
1193 (int)p, uap->nent, 0, 0, 0 );
1194
1195 /*
1196 * wait for an async IO to complete or a signal fires or timeout expires.
1197 * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
1198 * interrupts us. If an async IO completes before a signal fires or our
91447636 1199 * timeout expires, we get a wakeup call from aio_work_thread().
55e303ae 1200 */
91447636 1201
b0d623f7 1202 error = msleep1(&p->AIO_SUSPEND_SLEEP_CHAN, aio_proc_mutex(p), PCATCH | PWAIT | PDROP, "aio_suspend", abstime); /* XXX better priority? */
7e4a7d39 1203 if ( error == 0 ) {
2d21ac55
A
1204 /*
1205 * got our wakeup call from aio_work_thread().
1206 * Since we can get a wakeup on this channel from another thread in the
1207 * same process we head back up to make sure this is for the correct aiocbp.
1208 * If it is the correct aiocbp we will return from where we do the check
1209 * (see entryp->uaiocbp == aiocbp after check_for_our_aiocbp label)
1210 * else we will fall out and just sleep again.
1211 */
1212 goto check_for_our_aiocbp;
55e303ae 1213 }
7e4a7d39 1214 else if ( error == EWOULDBLOCK ) {
55e303ae
A
1215 /* our timeout expired */
1216 error = EAGAIN;
1217 }
1218 else {
1219 /* we were interrupted */
55e303ae
A
1220 error = EINTR;
1221 }
1222
1223ExitThisRoutine:
1224 if ( aiocbpp != NULL )
1225 FREE( aiocbpp, M_TEMP );
1226
1227 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_END,
1228 (int)p, uap->nent, error, 0, 0 );
1229
1230 return( error );
1231
1232} /* aio_suspend */
1233
1234
1235/* aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
1236 * file descriptor (uap->aiocbp->aio_fildes) from the buffer
1237 * (uap->aiocbp->aio_buf).
1238 */
1239
1240int
2d21ac55 1241aio_write(proc_t p, struct aio_write_args *uap, int *retval )
55e303ae
A
1242{
1243 int error;
1244
1245 *retval = 0;
1246
1247 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_START,
1248 (int)p, (int)uap->aiocbp, 0, 0, 0 );
1249
1250 error = aio_queue_async_request( p, uap->aiocbp, AIO_WRITE );
1251 if ( error != 0 )
1252 *retval = -1;
1253
1254 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_END,
1255 (int)p, (int)uap->aiocbp, error, 0, 0 );
1256
1257 return( error );
1258
1259} /* aio_write */
1260
1261
b0d623f7
A
1262static user_addr_t *
1263aio_copy_in_list(proc_t procp, user_addr_t aiocblist, int nent)
55e303ae 1264{
b0d623f7
A
1265 user_addr_t *aiocbpp;
1266 int i, result;
55e303ae 1267
b0d623f7
A
1268 /* we reserve enough space for largest possible pointer size */
1269 MALLOC( aiocbpp, user_addr_t *, (nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK );
1270 if ( aiocbpp == NULL )
1271 goto err;
1272
1273 /* copyin our aiocb pointers from list */
1274 result = copyin( aiocblist, aiocbpp,
1275 proc_is64bit(procp) ? (nent * sizeof(user64_addr_t))
1276 : (nent * sizeof(user32_addr_t)) );
1277 if ( result) {
1278 FREE( aiocbpp, M_TEMP );
1279 aiocbpp = NULL;
1280 goto err;
1281 }
1282
1283 /*
1284 * We depend on a list of user_addr_t's so we need to
1285 * munge and expand when these pointers came from a
1286 * 32-bit process
1287 */
1288 if ( !proc_is64bit(procp) ) {
1289 /* copy from last to first to deal with overlap */
1290 user32_addr_t *my_ptrp = ((user32_addr_t *)aiocbpp) + (nent - 1);
1291 user_addr_t *my_addrp = aiocbpp + (nent - 1);
1292
1293 for (i = 0; i < nent; i++, my_ptrp--, my_addrp--) {
1294 *my_addrp = (user_addr_t) (*my_ptrp);
1295 }
1296 }
1297
1298err:
1299 return (aiocbpp);
1300}
1301
1302
1303static int
1304aio_copy_in_sigev(proc_t procp, user_addr_t sigp, struct user_sigevent *sigev)
1305{
1306 int result = 0;
1307
1308 if (sigp == USER_ADDR_NULL)
1309 goto out;
1310
1311 /*
1312 * We need to munge aio_sigevent since it contains pointers.
1313 * Since we do not know if sigev_value is an int or a ptr we do
1314 * NOT cast the ptr to a user_addr_t. This means if we send
1315 * this info back to user space we need to remember sigev_value
1316 * was not expanded for the 32-bit case.
1317 *
1318 * Notes: This does NOT affect us since we don't support
1319 * sigev_value yet in the aio context.
1320 */
1321 if ( proc_is64bit(procp) ) {
1322 struct user64_sigevent sigevent64;
1323
1324 result = copyin( sigp, &sigevent64, sizeof(sigevent64) );
1325 if ( result == 0 ) {
1326 sigev->sigev_notify = sigevent64.sigev_notify;
1327 sigev->sigev_signo = sigevent64.sigev_signo;
1328 sigev->sigev_value.size_equivalent.sival_int = sigevent64.sigev_value.size_equivalent.sival_int;
1329 sigev->sigev_notify_function = sigevent64.sigev_notify_function;
1330 sigev->sigev_notify_attributes = sigevent64.sigev_notify_attributes;
1331 }
1332
1333 } else {
1334 struct user32_sigevent sigevent32;
1335
1336 result = copyin( sigp, &sigevent32, sizeof(sigevent32) );
1337 if ( result == 0 ) {
1338 sigev->sigev_notify = sigevent32.sigev_notify;
1339 sigev->sigev_signo = sigevent32.sigev_signo;
1340 sigev->sigev_value.size_equivalent.sival_int = sigevent32.sigev_value.sival_int;
1341 sigev->sigev_notify_function = CAST_USER_ADDR_T(sigevent32.sigev_notify_function);
1342 sigev->sigev_notify_attributes = CAST_USER_ADDR_T(sigevent32.sigev_notify_attributes);
1343 }
1344 }
1345
1346 if ( result != 0 ) {
1347 result = EAGAIN;
1348 }
1349
1350out:
1351 return (result);
1352}
1353
1354/*
1355 * aio_enqueue_work
1356 *
1357 * Queue up the entry on the aio asynchronous work queue in priority order
1358 * based on the relative priority of the request. We calculate the relative
1359 * priority using the nice value of the caller and the value
1360 *
1361 * Parameters: procp Process queueing the I/O
1362 * entryp The work queue entry being queued
1363 *
1364 * Returns: (void) No failure modes
1365 *
1366 * Notes: This function is used for both lio_listio and aio
1367 *
1368 * XXX: At some point, we may have to consider thread priority
1369 * rather than process priority, but we don't maintain the
1370 * adjusted priority for threads the POSIX way.
1371 *
1372 *
1373 * Called with proc locked.
1374 */
1375static void
1376aio_enqueue_work( proc_t procp, aio_workq_entry *entryp, int proc_locked)
1377{
1378#if 0
1379 aio_workq_entry *my_entryp; /* used for insertion sort */
1380#endif /* 0 */
1381 aio_workq_t queue = aio_entry_workq(entryp);
1382
1383 if (proc_locked == 0) {
1384 aio_proc_lock(procp);
1385 }
1386
1387 ASSERT_AIO_PROC_LOCK_OWNED(procp);
1388
1389 /* Onto proc queue */
1390 TAILQ_INSERT_TAIL(&procp->p_aio_activeq, entryp, aio_proc_link);
1391 procp->p_aio_active_count++;
1392 procp->p_aio_total_count++;
1393
1394 /* And work queue */
1395 aio_workq_lock_spin(queue);
1396 aio_workq_add_entry_locked(queue, entryp);
3e170ce0
A
1397 waitq_wakeup64_one(&queue->aioq_waitq, CAST_EVENT64_T(queue),
1398 THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
b0d623f7
A
1399 aio_workq_unlock(queue);
1400
1401 if (proc_locked == 0) {
1402 aio_proc_unlock(procp);
1403 }
1404
1405#if 0
1406 /*
1407 * Procedure:
1408 *
1409 * (1) The nice value is in the range PRIO_MIN..PRIO_MAX [-20..20]
1410 * (2) The normalized nice value is in the range 0..((2 * NZERO) - 1)
1411 * which is [0..39], with 0 not being used. In nice values, the
1412 * lower the nice value, the higher the priority.
1413 * (3) The normalized scheduling prioritiy is the highest nice value
1414 * minus the current nice value. In I/O scheduling priority, the
1415 * higher the value the lower the priority, so it is the inverse
1416 * of the nice value (the higher the number, the higher the I/O
1417 * priority).
1418 * (4) From the normalized scheduling priority, we subtract the
1419 * request priority to get the request priority value number;
1420 * this means that requests are only capable of depressing their
1421 * priority relative to other requests,
1422 */
1423 entryp->priority = (((2 * NZERO) - 1) - procp->p_nice);
1424
1425 /* only premit depressing the priority */
1426 if (entryp->aiocb.aio_reqprio < 0)
1427 entryp->aiocb.aio_reqprio = 0;
1428 if (entryp->aiocb.aio_reqprio > 0) {
1429 entryp->priority -= entryp->aiocb.aio_reqprio;
1430 if (entryp->priority < 0)
1431 entryp->priority = 0;
1432 }
1433
1434 /* Insertion sort the entry; lowest ->priority to highest */
1435 TAILQ_FOREACH(my_entryp, &aio_anchor.aio_async_workq, aio_workq_link) {
1436 if ( entryp->priority <= my_entryp->priority) {
1437 TAILQ_INSERT_BEFORE(my_entryp, entryp, aio_workq_link);
1438 break;
1439 }
1440 }
1441 if (my_entryp == NULL)
1442 TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1443#endif /* 0 */
1444}
1445
1446
1447/*
1448 * lio_listio - initiate a list of IO requests. We process the list of
1449 * aiocbs either synchronously (mode == LIO_WAIT) or asynchronously
1450 * (mode == LIO_NOWAIT).
1451 *
1452 * The caller gets error and return status for each aiocb in the list
1453 * via aio_error and aio_return. We must keep completed requests until
1454 * released by the aio_return call.
1455 */
1456int
1457lio_listio(proc_t p, struct lio_listio_args *uap, int *retval )
1458{
1459 int i;
1460 int call_result;
1461 int result;
1462 int old_count;
1463 aio_workq_entry **entryp_listp;
1464 user_addr_t *aiocbpp;
1465 struct user_sigevent aiosigev;
1466 aio_lio_context *lio_context;
1467 boolean_t free_context = FALSE;
5ba3f43e
A
1468 uint32_t *paio_offset;
1469 uint32_t *paio_nbytes;
b0d623f7
A
1470
1471 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START,
1472 (int)p, uap->nent, uap->mode, 0, 0 );
55e303ae
A
1473
1474 entryp_listp = NULL;
b0d623f7 1475 lio_context = NULL;
91447636 1476 aiocbpp = NULL;
55e303ae
A
1477 call_result = -1;
1478 *retval = -1;
1479 if ( !(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT) ) {
1480 call_result = EINVAL;
1481 goto ExitRoutine;
1482 }
1483
1484 if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
1485 call_result = EINVAL;
1486 goto ExitRoutine;
1487 }
55e303ae
A
1488
1489 /*
b0d623f7
A
1490 * allocate a list of aio_workq_entry pointers that we will use
1491 * to queue up all our requests at once while holding our lock.
55e303ae 1492 */
91447636 1493 MALLOC( entryp_listp, void *, (uap->nent * sizeof(aio_workq_entry *)), M_TEMP, M_WAITOK );
55e303ae
A
1494 if ( entryp_listp == NULL ) {
1495 call_result = EAGAIN;
1496 goto ExitRoutine;
1497 }
b0d623f7
A
1498
1499 MALLOC( lio_context, aio_lio_context*, sizeof(aio_lio_context), M_TEMP, M_WAITOK );
1500 if ( lio_context == NULL ) {
91447636
A
1501 call_result = EAGAIN;
1502 goto ExitRoutine;
1503 }
1504
b0d623f7
A
1505#if DEBUG
1506 OSIncrementAtomic(&lio_contexts_alloced);
1507#endif /* DEBUG */
1508
d9a64523 1509 free_context = TRUE;
b0d623f7
A
1510 bzero(lio_context, sizeof(aio_lio_context));
1511
1512 aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent);
1513 if ( aiocbpp == NULL ) {
91447636
A
1514 call_result = EAGAIN;
1515 goto ExitRoutine;
1516 }
b0d623f7
A
1517
1518 /*
1519 * Use sigevent passed in to lio_listio for each of our calls, but
1520 * only do completion notification after the last request completes.
1521 */
1522 bzero(&aiosigev, sizeof(aiosigev));
1523 /* Only copy in an sigev if the user supplied one */
1524 if (uap->sigp != USER_ADDR_NULL) {
1525 call_result = aio_copy_in_sigev(p, uap->sigp, &aiosigev);
1526 if ( call_result)
1527 goto ExitRoutine;
91447636
A
1528 }
1529
55e303ae 1530 /* process list of aio requests */
d9a64523 1531 free_context = FALSE;
b0d623f7
A
1532 lio_context->io_issued = uap->nent;
1533 lio_context->io_waiter = uap->mode == LIO_WAIT ? 1 : 0; /* Should it be freed by last AIO */
55e303ae 1534 for ( i = 0; i < uap->nent; i++ ) {
91447636 1535 user_addr_t my_aiocbp;
b0d623f7 1536 aio_workq_entry *entryp;
55e303ae
A
1537
1538 *(entryp_listp + i) = NULL;
91447636 1539 my_aiocbp = *(aiocbpp + i);
55e303ae 1540
55e303ae 1541 /* NULL elements are legal so check for 'em */
b0d623f7
A
1542 if ( my_aiocbp == USER_ADDR_NULL ) {
1543 aio_proc_lock_spin(p);
1544 lio_context->io_issued--;
1545 aio_proc_unlock(p);
55e303ae 1546 continue;
b0d623f7 1547 }
55e303ae 1548
b0d623f7
A
1549 /*
1550 * We use lio_context to mark IO requests for delayed completion
1551 * processing which means we wait until all IO requests in the
1552 * group have completed before we either return to the caller
1553 * when mode is LIO_WAIT or signal user when mode is LIO_NOWAIT.
1554 *
1555 * We use the address of the lio_context for this, since it is
1556 * unique in the address space.
1557 */
1558 result = lio_create_entry( p, my_aiocbp, lio_context, (entryp_listp + i) );
55e303ae
A
1559 if ( result != 0 && call_result == -1 )
1560 call_result = result;
55e303ae
A
1561
1562 /* NULL elements are legal so check for 'em */
1563 entryp = *(entryp_listp + i);
b0d623f7
A
1564 if ( entryp == NULL ) {
1565 aio_proc_lock_spin(p);
1566 lio_context->io_issued--;
1567 aio_proc_unlock(p);
55e303ae 1568 continue;
b0d623f7
A
1569 }
1570
1571 if ( uap->mode == LIO_NOWAIT ) {
1572 /* Set signal hander, if any */
1573 entryp->aiocb.aio_sigevent = aiosigev;
1574 } else {
1575 /* flag that this thread blocks pending completion */
1576 entryp->flags |= AIO_LIO_NOTIFY;
1577 }
55e303ae
A
1578
1579 /* check our aio limits to throttle bad or rude user land behavior */
b0d623f7
A
1580 old_count = aio_increment_total_count();
1581
1582 aio_proc_lock_spin(p);
1583 if ( old_count >= aio_max_requests ||
55e303ae
A
1584 aio_get_process_count( entryp->procp ) >= aio_max_requests_per_process ||
1585 is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
55e303ae 1586
b0d623f7
A
1587 lio_context->io_issued--;
1588 aio_proc_unlock(p);
1589
1590 aio_decrement_total_count();
1591
91447636 1592 if ( call_result == -1 )
b0d623f7
A
1593 call_result = EAGAIN;
1594 aio_free_request(entryp);
1595 entryp_listp[i] = NULL;
55e303ae
A
1596 continue;
1597 }
1598
b0d623f7
A
1599 lck_mtx_convert_spin(aio_proc_mutex(p));
1600 aio_enqueue_work(p, entryp, 1);
1601 aio_proc_unlock(p);
1602
5ba3f43e
A
1603 KERNEL_DEBUG_CONSTANT( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_START,
1604 (int)p, (int)entryp->uaiocbp, entryp->flags, entryp->aiocb.aio_fildes, 0 );
1605 paio_offset = (uint32_t*) &entryp->aiocb.aio_offset;
1606 paio_nbytes = (uint32_t*) &entryp->aiocb.aio_nbytes;
1607 KERNEL_DEBUG_CONSTANT( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_END,
1608 paio_offset[0], (sizeof(entryp->aiocb.aio_offset) == sizeof(uint64_t) ? paio_offset[1] : 0),
1609 paio_nbytes[0], (sizeof(entryp->aiocb.aio_nbytes) == sizeof(uint64_t) ? paio_nbytes[1] : 0),
1610 0 );
1611 }
55e303ae 1612
b0d623f7
A
1613 switch(uap->mode) {
1614 case LIO_WAIT:
1615 aio_proc_lock_spin(p);
1616 while (lio_context->io_completed < lio_context->io_issued) {
1617 result = msleep(lio_context, aio_proc_mutex(p), PCATCH | PRIBIO | PSPIN, "lio_listio", 0);
55e303ae 1618
b0d623f7
A
1619 /* If we were interrupted, fail out (even if all finished) */
1620 if (result != 0) {
1621 call_result = EINTR;
1622 lio_context->io_waiter = 0;
1623 break;
1624 }
1625 }
1626
1627 /* If all IOs have finished must free it */
1628 if (lio_context->io_completed == lio_context->io_issued) {
1629 free_context = TRUE;
1630 }
55e303ae 1631
b0d623f7
A
1632 aio_proc_unlock(p);
1633 break;
1634
1635 case LIO_NOWAIT:
1636 break;
1637 }
1638
55e303ae
A
1639 /* call_result == -1 means we had no trouble queueing up requests */
1640 if ( call_result == -1 ) {
1641 call_result = 0;
1642 *retval = 0;
1643 }
1644
1645ExitRoutine:
1646 if ( entryp_listp != NULL )
1647 FREE( entryp_listp, M_TEMP );
91447636
A
1648 if ( aiocbpp != NULL )
1649 FREE( aiocbpp, M_TEMP );
d9a64523 1650 if (free_context) {
b0d623f7
A
1651 free_lio_context(lio_context);
1652 }
1653
55e303ae
A
1654 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END,
1655 (int)p, call_result, 0, 0, 0 );
1656
1657 return( call_result );
1658
1659} /* lio_listio */
1660
1661
1662/*
1663 * aio worker thread. this is where all the real work gets done.
1664 * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
1665 * after new work is queued up.
1666 */
39037602 1667__attribute__((noreturn))
55e303ae 1668static void
39037602 1669aio_work_thread(void)
55e303ae
A
1670{
1671 aio_workq_entry *entryp;
b0d623f7
A
1672 int error;
1673 vm_map_t currentmap;
1674 vm_map_t oldmap = VM_MAP_NULL;
1675 task_t oldaiotask = TASK_NULL;
1676 struct uthread *uthreadp = NULL;
55e303ae
A
1677
1678 for( ;; ) {
b0d623f7
A
1679 /*
1680 * returns with the entry ref'ed.
1681 * sleeps until work is available.
1682 */
1683 entryp = aio_get_some_work();
1684
1685 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START,
1686 (int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0 );
1687
1688 /*
1689 * Assume the target's address space identity for the duration
1690 * of the IO. Note: don't need to have the entryp locked,
1691 * because the proc and map don't change until it's freed.
1692 */
1693 currentmap = get_task_map( (current_proc())->task );
1694 if ( currentmap != entryp->aio_map ) {
1695 uthreadp = (struct uthread *) get_bsdthread_info(current_thread());
1696 oldaiotask = uthreadp->uu_aio_task;
1697 uthreadp->uu_aio_task = entryp->procp->task;
1698 oldmap = vm_map_switch( entryp->aio_map );
1699 }
1700
1701 if ( (entryp->flags & AIO_READ) != 0 ) {
1702 error = do_aio_read( entryp );
1703 }
1704 else if ( (entryp->flags & AIO_WRITE) != 0 ) {
1705 error = do_aio_write( entryp );
1706 }
1707 else if ( (entryp->flags & (AIO_FSYNC | AIO_DSYNC)) != 0 ) {
1708 error = do_aio_fsync( entryp );
1709 }
55e303ae 1710 else {
b0d623f7
A
1711 printf( "%s - unknown aio request - flags 0x%02X \n",
1712 __FUNCTION__, entryp->flags );
1713 error = EINVAL;
1714 }
91447636 1715
b0d623f7
A
1716 /* Restore old map */
1717 if ( currentmap != entryp->aio_map ) {
1718 (void) vm_map_switch( oldmap );
1719 uthreadp->uu_aio_task = oldaiotask;
1720 }
55e303ae 1721
b0d623f7
A
1722 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END,
1723 (int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval,
1724 entryp->returnval, 0 );
1725
1726
1727 /* XXX COUNTS */
1728 aio_entry_lock_spin(entryp);
1729 entryp->errorval = error;
1730 aio_entry_unlock(entryp);
1731
1732 /* we're done with the IO request so pop it off the active queue and */
1733 /* push it on the done queue */
1734 aio_proc_lock(entryp->procp);
1735 aio_proc_move_done_locked(entryp->procp, entryp);
1736 aio_proc_unlock(entryp->procp);
1737
1738 OSDecrementAtomic(&aio_anchor.aio_inflight_count);
1739
1740 /* remove our reference to the user land map. */
1741 if ( VM_MAP_NULL != entryp->aio_map ) {
1742 vm_map_t my_map;
1743
1744 my_map = entryp->aio_map;
1745 entryp->aio_map = VM_MAP_NULL;
1746 vm_map_deallocate( my_map );
55e303ae 1747 }
b0d623f7
A
1748
1749 /* Provide notifications */
1750 do_aio_completion( entryp );
1751
1752 /* Will free if needed */
1753 aio_entry_unref(entryp);
1754
55e303ae
A
1755 } /* for ( ;; ) */
1756
1757 /* NOT REACHED */
1758
1759} /* aio_work_thread */
1760
1761
1762/*
1763 * aio_get_some_work - get the next async IO request that is ready to be executed.
1764 * aio_fsync complicates matters a bit since we cannot do the fsync until all async
1765 * IO requests at the time the aio_fsync call came in have completed.
91447636 1766 * NOTE - AIO_LOCK must be held by caller
55e303ae 1767 */
55e303ae
A
1768static aio_workq_entry *
1769aio_get_some_work( void )
1770{
b0d623f7
A
1771 aio_workq_entry *entryp = NULL;
1772 aio_workq_t queue = NULL;
1773
1774 /* Just one queue for the moment. In the future there will be many. */
1775 queue = &aio_anchor.aio_async_workqs[0];
1776 aio_workq_lock_spin(queue);
1777 if (queue->aioq_count == 0) {
1778 goto nowork;
1779 }
1780
1781 /*
1782 * Hold the queue lock.
1783 *
1784 * pop some work off the work queue and add to our active queue
1785 * Always start with the queue lock held.
1786 */
1787 for(;;) {
1788 /*
1789 * Pull of of work queue. Once it's off, it can't be cancelled,
1790 * so we can take our ref once we drop the queue lock.
1791 */
1792 entryp = TAILQ_FIRST(&queue->aioq_entries);
55e303ae 1793
b0d623f7
A
1794 /*
1795 * If there's no work or only fsyncs that need delay, go to sleep
1796 * and then start anew from aio_work_thread
1797 */
1798 if (entryp == NULL) {
1799 goto nowork;
1800 }
1801
1802 aio_workq_remove_entry_locked(queue, entryp);
1803
1804 aio_workq_unlock(queue);
1805
1806 /*
1807 * Check if it's an fsync that must be delayed. No need to lock the entry;
1808 * that flag would have been set at initialization.
1809 */
55e303ae 1810 if ( (entryp->flags & AIO_FSYNC) != 0 ) {
b0d623f7
A
1811 /*
1812 * Check for unfinished operations on the same file
1813 * in this proc's queue.
1814 */
1815 aio_proc_lock_spin(entryp->procp);
55e303ae 1816 if ( aio_delay_fsync_request( entryp ) ) {
b0d623f7 1817 /* It needs to be delayed. Put it back on the end of the work queue */
55e303ae
A
1818 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) | DBG_FUNC_NONE,
1819 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
b0d623f7
A
1820
1821 aio_proc_unlock(entryp->procp);
1822
1823 aio_workq_lock_spin(queue);
1824 aio_workq_add_entry_locked(queue, entryp);
55e303ae 1825 continue;
b0d623f7
A
1826 }
1827 aio_proc_unlock(entryp->procp);
55e303ae 1828 }
b0d623f7 1829
55e303ae
A
1830 break;
1831 }
b0d623f7
A
1832
1833 aio_entry_ref(entryp);
1834
1835 OSIncrementAtomic(&aio_anchor.aio_inflight_count);
55e303ae 1836 return( entryp );
55e303ae 1837
b0d623f7
A
1838nowork:
1839 /* We will wake up when someone enqueues something */
3e170ce0 1840 waitq_assert_wait64(&queue->aioq_waitq, CAST_EVENT64_T(queue), THREAD_UNINT, 0);
b0d623f7
A
1841 aio_workq_unlock(queue);
1842 thread_block( (thread_continue_t)aio_work_thread );
1843
1844 // notreached
1845 return NULL;
1846}
55e303ae
A
1847
1848/*
b0d623f7
A
1849 * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed.
1850 * A big, simple hammer: only send it off if it's the most recently filed IO which has
1851 * not been completed.
55e303ae
A
1852 */
1853static boolean_t
1854aio_delay_fsync_request( aio_workq_entry *entryp )
1855{
b0d623f7
A
1856 if (entryp == TAILQ_FIRST(&entryp->procp->p_aio_activeq)) {
1857 return FALSE;
55e303ae
A
1858 }
1859
b0d623f7 1860 return TRUE;
55e303ae
A
1861} /* aio_delay_fsync_request */
1862
b0d623f7
A
1863static aio_workq_entry *
1864aio_create_queue_entry(proc_t procp, user_addr_t aiocbp, void *group_tag, int kindOfIO)
55e303ae 1865{
b0d623f7
A
1866 aio_workq_entry *entryp;
1867 int result = 0;
55e303ae
A
1868
1869 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1870 if ( entryp == NULL ) {
1871 result = EAGAIN;
1872 goto error_exit;
1873 }
91447636 1874
55e303ae
A
1875 bzero( entryp, sizeof(*entryp) );
1876
1877 /* fill in the rest of the aio_workq_entry */
1878 entryp->procp = procp;
1879 entryp->uaiocbp = aiocbp;
b0d623f7 1880 entryp->flags |= kindOfIO;
55e303ae
A
1881 entryp->group_tag = group_tag;
1882 entryp->aio_map = VM_MAP_NULL;
b0d623f7 1883 entryp->aio_refcount = 0;
91447636 1884
b0d623f7
A
1885 if ( proc_is64bit(procp) ) {
1886 struct user64_aiocb aiocb64;
1887
1888 result = copyin( aiocbp, &aiocb64, sizeof(aiocb64) );
1889 if (result == 0 )
1890 do_munge_aiocb_user64_to_user(&aiocb64, &entryp->aiocb);
1891
1892 } else {
1893 struct user32_aiocb aiocb32;
1894
91447636
A
1895 result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
1896 if ( result == 0 )
b0d623f7 1897 do_munge_aiocb_user32_to_user( &aiocb32, &entryp->aiocb );
55e303ae
A
1898 }
1899
b0d623f7
A
1900 if ( result != 0 ) {
1901 result = EAGAIN;
55e303ae 1902 goto error_exit;
b0d623f7 1903 }
55e303ae
A
1904
1905 /* get a reference to the user land map in order to keep it around */
1906 entryp->aio_map = get_task_map( procp->task );
1907 vm_map_reference( entryp->aio_map );
b0d623f7
A
1908
1909 /* do some more validation on the aiocb and embedded file descriptor */
1910 result = aio_validate( entryp );
39236c6e
A
1911 if ( result != 0 )
1912 goto error_exit_with_ref;
1913
1914 /* get a reference on the current_thread, which is passed in vfs_context. */
1915 entryp->thread = current_thread();
1916 thread_reference( entryp->thread );
1917 return ( entryp );
b0d623f7 1918
39236c6e
A
1919error_exit_with_ref:
1920 if ( VM_MAP_NULL != entryp->aio_map ) {
1921 vm_map_deallocate( entryp->aio_map );
1922 }
55e303ae 1923error_exit:
b0d623f7 1924 if ( result && entryp != NULL ) {
91447636 1925 zfree( aio_workq_zonep, entryp );
b0d623f7
A
1926 entryp = NULL;
1927 }
1928
1929 return ( entryp );
1930}
55e303ae
A
1931
1932
1933/*
b0d623f7
A
1934 * aio_queue_async_request - queue up an async IO request on our work queue then
1935 * wake up one of our worker threads to do the actual work. We get a reference
1936 * to our caller's user land map in order to keep it around while we are
1937 * processing the request.
55e303ae 1938 */
b0d623f7
A
1939static int
1940aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO )
55e303ae 1941{
b0d623f7 1942 aio_workq_entry *entryp;
5ba3f43e
A
1943 int result;
1944 int old_count;
1945 uint32_t *paio_offset;
1946 uint32_t *paio_nbytes;
1947
b0d623f7
A
1948 old_count = aio_increment_total_count();
1949 if (old_count >= aio_max_requests) {
1950 result = EAGAIN;
1951 goto error_noalloc;
55e303ae 1952 }
b0d623f7
A
1953
1954 entryp = aio_create_queue_entry( procp, aiocbp, 0, kindOfIO);
1955 if ( entryp == NULL ) {
1956 result = EAGAIN;
1957 goto error_noalloc;
55e303ae 1958 }
55e303ae
A
1959
1960
b0d623f7 1961 aio_proc_lock_spin(procp);
55e303ae 1962
b0d623f7
A
1963 if ( is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
1964 result = EAGAIN;
1965 goto error_exit;
1966 }
55e303ae 1967
b0d623f7
A
1968 /* check our aio limits to throttle bad or rude user land behavior */
1969 if (aio_get_process_count( procp ) >= aio_max_requests_per_process) {
1970 printf("aio_queue_async_request(): too many in flight for proc: %d.\n", procp->p_aio_total_count);
55e303ae
A
1971 result = EAGAIN;
1972 goto error_exit;
1973 }
b0d623f7
A
1974
1975 /* Add the IO to proc and work queues, wake up threads as appropriate */
1976 lck_mtx_convert_spin(aio_proc_mutex(procp));
1977 aio_enqueue_work(procp, entryp, 1);
1978
1979 aio_proc_unlock(procp);
5ba3f43e
A
1980
1981 paio_offset = (uint32_t*) &entryp->aiocb.aio_offset;
1982 paio_nbytes = (uint32_t*) &entryp->aiocb.aio_nbytes;
1983 KERNEL_DEBUG_CONSTANT( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_START,
1984 (int)procp, (int)aiocbp, entryp->flags, entryp->aiocb.aio_fildes, 0 );
1985 KERNEL_DEBUG_CONSTANT( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_END,
1986 paio_offset[0], (sizeof(entryp->aiocb.aio_offset) == sizeof(uint64_t) ? paio_offset[1] : 0),
1987 paio_nbytes[0], (sizeof(entryp->aiocb.aio_nbytes) == sizeof(uint64_t) ? paio_nbytes[1] : 0),
1988 0 );
1989
b0d623f7
A
1990 return( 0 );
1991
1992error_exit:
1993 /*
1994 * This entry has not been queued up so no worries about
1995 * unlocked state and aio_map
1996 */
1997 aio_proc_unlock(procp);
1998 aio_free_request(entryp);
91447636 1999
b0d623f7
A
2000error_noalloc:
2001 aio_decrement_total_count();
91447636 2002
b0d623f7
A
2003 return( result );
2004
2005} /* aio_queue_async_request */
91447636 2006
b0d623f7
A
2007
2008/*
2009 * lio_create_entry
2010 *
2011 * Allocate an aio_workq_entry and fill it in. If all goes well return 0
2012 * and pass the aio_workq_entry pointer back to our caller.
2013 *
2014 * Parameters: procp The process makign the request
2015 * aiocbp The aio context buffer pointer
2016 * group_tag The group tag used to indicate a
2017 * group of operations has completed
2018 * entrypp Pointer to the pointer to receive the
2019 * address of the created aio_workq_entry
2020 *
2021 * Returns: 0 Successfully created
2022 * EAGAIN Try again (usually resource shortage)
2023 *
2024 *
2025 * Notes: We get a reference to our caller's user land map in order
2026 * to keep it around while we are processing the request.
2027 *
2028 * lio_listio calls behave differently at completion they do
2029 * completion notification when all async IO requests have
2030 * completed. We use group_tag to tag IO requests that behave
2031 * in the delay notification manner.
2032 *
2033 * All synchronous operations are considered to not have a
2034 * signal routine associated with them (sigp == USER_ADDR_NULL).
2035 */
2036static int
2037lio_create_entry(proc_t procp, user_addr_t aiocbp, void *group_tag,
2038 aio_workq_entry **entrypp )
2039{
2040 aio_workq_entry *entryp;
2041 int result;
2042
2043 entryp = aio_create_queue_entry( procp, aiocbp, group_tag, AIO_LIO);
2044 if ( entryp == NULL ) {
2045 result = EAGAIN;
55e303ae
A
2046 goto error_exit;
2047 }
2048
b0d623f7
A
2049 /*
2050 * Look for lio_listio LIO_NOP requests and ignore them; this is
2051 * not really an error, but we need to free our aio_workq_entry.
2052 */
55e303ae
A
2053 if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
2054 result = 0;
2055 goto error_exit;
2056 }
2057
55e303ae
A
2058 *entrypp = entryp;
2059 return( 0 );
2060
2061error_exit:
b0d623f7
A
2062
2063 if ( entryp != NULL ) {
2064 /*
2065 * This entry has not been queued up so no worries about
2066 * unlocked state and aio_map
2067 */
2068 aio_free_request(entryp);
2069 }
55e303ae
A
2070
2071 return( result );
2072
b0d623f7 2073} /* lio_create_entry */
55e303ae
A
2074
2075
2076/*
2077 * aio_free_request - remove our reference on the user land map and
b0d623f7
A
2078 * free the work queue entry resources. The entry is off all lists
2079 * and has zero refcount, so no one can have a pointer to it.
55e303ae
A
2080 */
2081
2082static int
b0d623f7 2083aio_free_request(aio_workq_entry *entryp)
55e303ae
A
2084{
2085 /* remove our reference to the user land map. */
b0d623f7
A
2086 if ( VM_MAP_NULL != entryp->aio_map) {
2087 vm_map_deallocate(entryp->aio_map);
55e303ae 2088 }
b0d623f7 2089
39236c6e
A
2090 /* remove our reference to thread which enqueued the request */
2091 if ( NULL != entryp->thread ) {
2092 thread_deallocate( entryp->thread );
2093 }
2094
b0d623f7
A
2095 entryp->aio_refcount = -1; /* A bit of poisoning in case of bad refcounting. */
2096
91447636 2097 zfree( aio_workq_zonep, entryp );
55e303ae
A
2098
2099 return( 0 );
2100
2101} /* aio_free_request */
2102
2103
b0d623f7
A
2104/*
2105 * aio_validate
2106 *
2107 * validate the aiocb passed in by one of the aio syscalls.
55e303ae 2108 */
55e303ae
A
2109static int
2110aio_validate( aio_workq_entry *entryp )
2111{
91447636 2112 struct fileproc *fp;
55e303ae
A
2113 int flag;
2114 int result;
2115
2116 result = 0;
2117
2118 if ( (entryp->flags & AIO_LIO) != 0 ) {
2119 if ( entryp->aiocb.aio_lio_opcode == LIO_READ )
2120 entryp->flags |= AIO_READ;
2121 else if ( entryp->aiocb.aio_lio_opcode == LIO_WRITE )
2122 entryp->flags |= AIO_WRITE;
2123 else if ( entryp->aiocb.aio_lio_opcode == LIO_NOP )
2124 return( 0 );
2125 else
2126 return( EINVAL );
2127 }
2128
2129 flag = FREAD;
b0d623f7 2130 if ( (entryp->flags & (AIO_WRITE | AIO_FSYNC | AIO_DSYNC)) != 0 ) {
55e303ae
A
2131 flag = FWRITE;
2132 }
2133
2134 if ( (entryp->flags & (AIO_READ | AIO_WRITE)) != 0 ) {
91447636
A
2135 if ( entryp->aiocb.aio_nbytes > INT_MAX ||
2136 entryp->aiocb.aio_buf == USER_ADDR_NULL ||
2137 entryp->aiocb.aio_offset < 0 )
55e303ae
A
2138 return( EINVAL );
2139 }
2140
b0d623f7
A
2141 /*
2142 * validate aiocb.aio_sigevent. at this point we only support
2143 * sigev_notify equal to SIGEV_SIGNAL or SIGEV_NONE. this means
2144 * sigev_value, sigev_notify_function, and sigev_notify_attributes
2145 * are ignored, since SIGEV_THREAD is unsupported. This is consistent
2146 * with no [RTS] (RalTime Signal) option group support.
55e303ae 2147 */
b0d623f7
A
2148 switch ( entryp->aiocb.aio_sigevent.sigev_notify ) {
2149 case SIGEV_SIGNAL:
2150 {
55e303ae 2151 int signum;
b0d623f7 2152
55e303ae
A
2153 /* make sure we have a valid signal number */
2154 signum = entryp->aiocb.aio_sigevent.sigev_signo;
2155 if ( signum <= 0 || signum >= NSIG ||
2156 signum == SIGKILL || signum == SIGSTOP )
2157 return (EINVAL);
b0d623f7
A
2158 }
2159 break;
2160
2161 case SIGEV_NONE:
2162 break;
2163
2164 case SIGEV_THREAD:
2165 /* Unsupported [RTS] */
2166
2167 default:
55e303ae 2168 return (EINVAL);
b0d623f7 2169 }
55e303ae
A
2170
2171 /* validate the file descriptor and that the file was opened
91447636 2172 * for the appropriate read / write access.
55e303ae 2173 */
91447636 2174 proc_fdlock(entryp->procp);
55e303ae 2175
91447636 2176 result = fp_lookup( entryp->procp, entryp->aiocb.aio_fildes, &fp , 1);
55e303ae 2177 if ( result == 0 ) {
91447636 2178 if ( (fp->f_fglob->fg_flag & flag) == 0 ) {
55e303ae
A
2179 /* we don't have read or write access */
2180 result = EBADF;
2181 }
39236c6e 2182 else if ( FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_VNODE ) {
55e303ae
A
2183 /* this is not a file */
2184 result = ESPIPE;
91447636
A
2185 } else
2186 fp->f_flags |= FP_AIOISSUED;
2187
2188 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp , 1);
55e303ae
A
2189 }
2190 else {
2191 result = EBADF;
2192 }
2193
91447636 2194 proc_fdunlock(entryp->procp);
55e303ae
A
2195
2196 return( result );
2197
2198} /* aio_validate */
2199
b0d623f7
A
2200static int
2201aio_increment_total_count()
2202{
2203 return OSIncrementAtomic(&aio_anchor.aio_total_count);
2204}
2205
2206static int
2207aio_decrement_total_count()
2208{
2209 int old = OSDecrementAtomic(&aio_anchor.aio_total_count);
2210 if (old <= 0) {
2211 panic("Negative total AIO count!\n");
2212 }
55e303ae 2213
b0d623f7
A
2214 return old;
2215}
55e303ae
A
2216
2217static int
2d21ac55 2218aio_get_process_count(proc_t procp )
55e303ae 2219{
b0d623f7 2220 return procp->p_aio_total_count;
55e303ae
A
2221
2222} /* aio_get_process_count */
2223
55e303ae
A
2224static int
2225aio_get_all_queues_count( void )
2226{
b0d623f7 2227 return aio_anchor.aio_total_count;
55e303ae
A
2228
2229} /* aio_get_all_queues_count */
2230
2231
2232/*
2233 * do_aio_completion. Handle async IO completion.
2234 */
55e303ae
A
2235static void
2236do_aio_completion( aio_workq_entry *entryp )
2237{
b0d623f7
A
2238
2239 boolean_t lastLioCompleted = FALSE;
2240 aio_lio_context *lio_context = NULL;
2241 int waiter = 0;
2242
2243 lio_context = (aio_lio_context *)entryp->group_tag;
2244
2245 if (lio_context != NULL) {
2246
2247 aio_proc_lock_spin(entryp->procp);
2248
2249 /* Account for this I/O completing. */
2250 lio_context->io_completed++;
2251
2252 /* Are we done with this lio context? */
2253 if (lio_context->io_issued == lio_context->io_completed) {
2254 lastLioCompleted = TRUE;
2255 }
2256
2257 waiter = lio_context->io_waiter;
2258
2259 /* explicit wakeup of lio_listio() waiting in LIO_WAIT */
2260 if ((entryp->flags & AIO_LIO_NOTIFY) && (lastLioCompleted) && (waiter != 0)) {
2261 /* wake up the waiter */
2262 wakeup(lio_context);
2263 }
2264
2265 aio_proc_unlock(entryp->procp);
2266 }
2267
55e303ae
A
2268 if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
2269 (entryp->flags & AIO_DISABLE) == 0 ) {
b0d623f7
A
2270
2271 boolean_t performSignal = FALSE;
2272 if (lio_context == NULL) {
2273 performSignal = TRUE;
2274 }
2275 else {
2276 /*
2277 * If this was the last request in the group and a signal
2278 * is desired, send one.
2279 */
2280 performSignal = lastLioCompleted;
2281 }
2282
2283 if (performSignal) {
2284
55e303ae 2285 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) | DBG_FUNC_NONE,
b0d623f7
A
2286 (int)entryp->procp, (int)entryp->uaiocbp,
2287 entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 );
55e303ae
A
2288
2289 psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );
55e303ae
A
2290 }
2291 }
2292
b0d623f7
A
2293 if ((entryp->flags & AIO_EXIT_WAIT) && (entryp->flags & AIO_CLOSE_WAIT)) {
2294 panic("Close and exit flags set at the same time\n");
2295 }
2296
55e303ae 2297 /*
b0d623f7
A
2298 * need to handle case where a process is trying to exit, exec, or
2299 * close and is currently waiting for active aio requests to complete.
2300 * If AIO_CLEANUP_WAIT is set then we need to look to see if there are any
55e303ae 2301 * other requests in the active queue for this process. If there are
b0d623f7
A
2302 * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel.
2303 * If there are some still active then do nothing - we only want to
2304 * wakeup when all active aio requests for the process are complete.
2305 *
2306 * Don't need to lock the entry or proc to check the cleanup flag. It can only be
2307 * set for cancellation, while the entryp is still on a proc list; now it's
2308 * off, so that flag is already set if it's going to be.
55e303ae 2309 */
b0d623f7 2310 if ( (entryp->flags & AIO_EXIT_WAIT) != 0 ) {
55e303ae
A
2311 int active_requests;
2312
2313 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
2314 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
2315
b0d623f7 2316 aio_proc_lock_spin(entryp->procp);
55e303ae 2317 active_requests = aio_active_requests_for_process( entryp->procp );
55e303ae 2318 if ( active_requests < 1 ) {
b0d623f7
A
2319 /*
2320 * no active aio requests for this process, continue exiting. In this
2321 * case, there should be no one else waiting ont he proc in AIO...
2322 */
2323 wakeup_one((caddr_t)&entryp->procp->AIO_CLEANUP_SLEEP_CHAN);
2324 aio_proc_unlock(entryp->procp);
55e303ae
A
2325
2326 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
2327 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
b0d623f7
A
2328 } else {
2329 aio_proc_unlock(entryp->procp);
55e303ae 2330 }
55e303ae 2331 }
b0d623f7
A
2332
2333 if ( (entryp->flags & AIO_CLOSE_WAIT) != 0 ) {
2334 int active_requests;
55e303ae 2335
b0d623f7
A
2336 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
2337 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
2338
2339 aio_proc_lock_spin(entryp->procp);
2340 active_requests = aio_proc_active_requests_for_file( entryp->procp, entryp->aiocb.aio_fildes);
2341 if ( active_requests < 1 ) {
2342 /* Can't wakeup_one(); multiple closes might be in progress. */
2343 wakeup(&entryp->procp->AIO_CLEANUP_SLEEP_CHAN);
2344 aio_proc_unlock(entryp->procp);
2345
2346 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
2347 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
2348 } else {
2349 aio_proc_unlock(entryp->procp);
2350 }
2351 }
55e303ae 2352 /*
b0d623f7
A
2353 * A thread in aio_suspend() wants to known about completed IOs. If it checked
2354 * the done list before we moved our AIO there, then it already asserted its wait,
2355 * and we can wake it up without holding the lock. If it checked the list after
2356 * we did our move, then it already has seen the AIO that we moved. Herego, we
2357 * can do our wakeup without holding the lock.
55e303ae 2358 */
b0d623f7 2359 wakeup( (caddr_t) &entryp->procp->AIO_SUSPEND_SLEEP_CHAN );
55e303ae
A
2360 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE,
2361 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
55e303ae 2362
b0d623f7
A
2363 /*
2364 * free the LIO context if the last lio completed and no thread is
2365 * waiting
2366 */
2367 if (lastLioCompleted && (waiter == 0))
2368 free_lio_context (lio_context);
55e303ae 2369
55e303ae 2370
b0d623f7 2371} /* do_aio_completion */
55e303ae
A
2372
2373
2374/*
2375 * do_aio_read
2376 */
2377static int
2378do_aio_read( aio_workq_entry *entryp )
2379{
2d21ac55
A
2380 struct fileproc *fp;
2381 int error;
2382 struct vfs_context context;
55e303ae 2383
91447636
A
2384 if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp , 0)) )
2385 return(error);
2386 if ( (fp->f_fglob->fg_flag & FREAD) == 0 ) {
2387 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2388 return(EBADF);
2389 }
2d21ac55 2390
39236c6e 2391 context.vc_thread = entryp->thread; /* XXX */
2d21ac55
A
2392 context.vc_ucred = fp->f_fglob->fg_cred;
2393
2394 error = dofileread(&context, fp,
2395 entryp->aiocb.aio_buf,
2396 entryp->aiocb.aio_nbytes,
2397 entryp->aiocb.aio_offset, FOF_OFFSET,
2398 &entryp->returnval);
2399 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
55e303ae
A
2400
2401 return( error );
2402
2403} /* do_aio_read */
2404
2405
2406/*
2407 * do_aio_write
2408 */
2409static int
2410do_aio_write( aio_workq_entry *entryp )
2411{
91447636 2412 struct fileproc *fp;
b0d623f7 2413 int error, flags;
2d21ac55 2414 struct vfs_context context;
55e303ae 2415
91447636
A
2416 if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp , 0)) )
2417 return(error);
2418 if ( (fp->f_fglob->fg_flag & FWRITE) == 0 ) {
2419 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2420 return(EBADF);
2421 }
2d21ac55 2422
b0d623f7
A
2423 flags = FOF_PCRED;
2424 if ( (fp->f_fglob->fg_flag & O_APPEND) == 0 ) {
2425 flags |= FOF_OFFSET;
2426 }
2427
39236c6e 2428 context.vc_thread = entryp->thread; /* XXX */
2d21ac55
A
2429 context.vc_ucred = fp->f_fglob->fg_cred;
2430
2431 /* NB: tell dofilewrite the offset, and to use the proc cred */
2432 error = dofilewrite(&context,
2433 fp,
2434 entryp->aiocb.aio_buf,
2435 entryp->aiocb.aio_nbytes,
2436 entryp->aiocb.aio_offset,
b0d623f7 2437 flags,
2d21ac55 2438 &entryp->returnval);
fe8ab488
A
2439
2440 if (entryp->returnval)
2441 fp_drop_written(entryp->procp, entryp->aiocb.aio_fildes, fp);
2442 else
2443 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
55e303ae
A
2444
2445 return( error );
2446
2447} /* do_aio_write */
2448
2449
2450/*
2451 * aio_active_requests_for_process - return number of active async IO
2452 * requests for the given process.
55e303ae 2453 */
55e303ae 2454static int
2d21ac55 2455aio_active_requests_for_process(proc_t procp )
55e303ae 2456{
b0d623f7
A
2457 return( procp->p_aio_active_count );
2458
2459} /* aio_active_requests_for_process */
2460
2461/*
2462 * Called with the proc locked.
2463 */
2464static int
2465aio_proc_active_requests_for_file(proc_t procp, int fd)
2466{
2467 int count = 0;
2468 aio_workq_entry *entryp;
2469 TAILQ_FOREACH(entryp, &procp->p_aio_activeq, aio_proc_link) {
2470 if (entryp->aiocb.aio_fildes == fd) {
2471 count++;
2472 }
2473 }
55e303ae 2474
b0d623f7 2475 return count;
55e303ae
A
2476} /* aio_active_requests_for_process */
2477
2478
b0d623f7 2479
55e303ae
A
2480/*
2481 * do_aio_fsync
2482 */
2483static int
2484do_aio_fsync( aio_workq_entry *entryp )
2485{
91447636
A
2486 struct vfs_context context;
2487 struct vnode *vp;
2488 struct fileproc *fp;
b0d623f7
A
2489 int sync_flag;
2490 int error;
91447636 2491
b0d623f7
A
2492 /*
2493 * We are never called unless either AIO_FSYNC or AIO_DSYNC are set.
2494 *
2495 * If AIO_DSYNC is set, we can tell the lower layers that it is OK
2496 * to mark for update the metadata not strictly necessary for data
2497 * retrieval, rather than forcing it to disk.
2498 *
2499 * If AIO_FSYNC is set, we have to also wait for metadata not really
2500 * necessary to data retrival are committed to stable storage (e.g.
2501 * atime, mtime, ctime, etc.).
2502 *
2503 * Metadata necessary for data retrieval ust be committed to stable
2504 * storage in either case (file length, etc.).
2505 */
2506 if (entryp->flags & AIO_FSYNC)
2507 sync_flag = MNT_WAIT;
2508 else
2509 sync_flag = MNT_DWAIT;
2510
91447636 2511 error = fp_getfvp( entryp->procp, entryp->aiocb.aio_fildes, &fp, &vp);
55e303ae 2512 if ( error == 0 ) {
91447636
A
2513 if ( (error = vnode_getwithref(vp)) ) {
2514 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2515 entryp->returnval = -1;
2516 return(error);
2517 }
2d21ac55 2518 context.vc_thread = current_thread();
91447636
A
2519 context.vc_ucred = fp->f_fglob->fg_cred;
2520
b0d623f7 2521 error = VNOP_FSYNC( vp, sync_flag, &context);
91447636
A
2522
2523 (void)vnode_put(vp);
2524
2525 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
55e303ae
A
2526 }
2527 if ( error != 0 )
2528 entryp->returnval = -1;
2529
2530 return( error );
2531
2532} /* do_aio_fsync */
2533
2534
2535/*
2536 * is_already_queued - runs through our queues to see if the given
2537 * aiocbp / process is there. Returns TRUE if there is a match
2538 * on any of our aio queues.
b0d623f7
A
2539 *
2540 * Called with proc aio lock held (can be held spin)
55e303ae 2541 */
55e303ae 2542static boolean_t
2d21ac55 2543is_already_queued(proc_t procp,
91447636 2544 user_addr_t aiocbp )
55e303ae
A
2545{
2546 aio_workq_entry *entryp;
2547 boolean_t result;
b0d623f7 2548
55e303ae
A
2549 result = FALSE;
2550
2551 /* look for matches on our queue of async IO requests that have completed */
b0d623f7 2552 TAILQ_FOREACH( entryp, &procp->p_aio_doneq, aio_proc_link ) {
55e303ae
A
2553 if ( aiocbp == entryp->uaiocbp ) {
2554 result = TRUE;
2555 goto ExitThisRoutine;
2556 }
2557 }
2558
2559 /* look for matches on our queue of active async IO requests */
b0d623f7 2560 TAILQ_FOREACH( entryp, &procp->p_aio_activeq, aio_proc_link ) {
55e303ae
A
2561 if ( aiocbp == entryp->uaiocbp ) {
2562 result = TRUE;
2563 goto ExitThisRoutine;
2564 }
2565 }
2566
55e303ae
A
2567ExitThisRoutine:
2568 return( result );
2569
2570} /* is_already_queued */
2571
2572
b0d623f7
A
2573static void
2574free_lio_context(aio_lio_context* context)
2575{
2576
2577#if DEBUG
2578 OSDecrementAtomic(&lio_contexts_alloced);
2579#endif /* DEBUG */
2580
2581 FREE( context, M_TEMP );
2582
2583} /* free_lio_context */
2584
2585
55e303ae
A
2586/*
2587 * aio initialization
2588 */
2589__private_extern__ void
2590aio_init( void )
2591{
2592 int i;
2593
91447636 2594 aio_lock_grp_attr = lck_grp_attr_alloc_init();
b0d623f7
A
2595 aio_proc_lock_grp = lck_grp_alloc_init("aio_proc", aio_lock_grp_attr);;
2596 aio_entry_lock_grp = lck_grp_alloc_init("aio_entry", aio_lock_grp_attr);;
2597 aio_queue_lock_grp = lck_grp_alloc_init("aio_queue", aio_lock_grp_attr);;
91447636 2598 aio_lock_attr = lck_attr_alloc_init();
91447636 2599
b0d623f7
A
2600 lck_mtx_init(&aio_entry_mtx, aio_entry_lock_grp, aio_lock_attr);
2601 lck_mtx_init(&aio_proc_mtx, aio_proc_lock_grp, aio_lock_attr);
55e303ae 2602
b0d623f7 2603 aio_anchor.aio_inflight_count = 0;
55e303ae 2604 aio_anchor.aio_done_count = 0;
b0d623f7
A
2605 aio_anchor.aio_total_count = 0;
2606 aio_anchor.aio_num_workqs = AIO_NUM_WORK_QUEUES;
2607
2608 for (i = 0; i < AIO_NUM_WORK_QUEUES; i++) {
2609 aio_workq_init(&aio_anchor.aio_async_workqs[i]);
2610 }
2611
55e303ae
A
2612
2613 i = sizeof( aio_workq_entry );
2614 aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );
2615
2616 _aio_create_worker_threads( aio_worker_threads );
55e303ae
A
2617
2618} /* aio_init */
2619
2620
2621/*
2622 * aio worker threads created here.
2623 */
2624__private_extern__ void
2625_aio_create_worker_threads( int num )
2626{
2627 int i;
2628
2629 /* create some worker threads to handle the async IO requests */
2630 for ( i = 0; i < num; i++ ) {
2631 thread_t myThread;
2632
b0d623f7 2633 if ( KERN_SUCCESS != kernel_thread_start((thread_continue_t)aio_work_thread, NULL, &myThread) ) {
55e303ae
A
2634 printf( "%s - failed to create a work thread \n", __FUNCTION__ );
2635 }
b0d623f7
A
2636 else
2637 thread_deallocate(myThread);
55e303ae
A
2638 }
2639
2640 return;
2641
2642} /* _aio_create_worker_threads */
2643
2644/*
2645 * Return the current activation utask
2646 */
2647task_t
2648get_aiotask(void)
2649{
91447636
A
2650 return ((struct uthread *)get_bsdthread_info(current_thread()))->uu_aio_task;
2651}
2652
2653
2654/*
2655 * In the case of an aiocb from a
2656 * 32-bit process we need to expand some longs and pointers to the correct
2657 * sizes in order to let downstream code always work on the same type of
2658 * aiocb (in our case that is a user_aiocb)
2659 */
2660static void
b0d623f7 2661do_munge_aiocb_user32_to_user( struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
91447636
A
2662{
2663 the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2664 the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2665 the_user_aiocbp->aio_buf = CAST_USER_ADDR_T(my_aiocbp->aio_buf);
2666 the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2667 the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2668 the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
2669
2670 /* special case here. since we do not know if sigev_value is an */
2671 /* int or a ptr we do NOT cast the ptr to a user_addr_t. This */
2672 /* means if we send this info back to user space we need to remember */
2673 /* sigev_value was not expanded for the 32-bit case. */
2674 /* NOTE - this does NOT affect us since we don't support sigev_value */
2675 /* yet in the aio context. */
2676 //LP64
2677 the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2678 the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
2679 the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2680 my_aiocbp->aio_sigevent.sigev_value.sival_int;
2681 the_user_aiocbp->aio_sigevent.sigev_notify_function =
2682 CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function);
2683 the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2684 CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_attributes);
55e303ae 2685}
b0d623f7
A
2686
2687/* Similar for 64-bit user process, so that we don't need to satisfy
2688 * the alignment constraints of the original user64_aiocb
2689 */
2690static void
2691do_munge_aiocb_user64_to_user( struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
2692{
2693 the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2694 the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2695 the_user_aiocbp->aio_buf = my_aiocbp->aio_buf;
2696 the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2697 the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2698 the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
2699
2700 the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2701 the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
2702 the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2703 my_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int;
2704 the_user_aiocbp->aio_sigevent.sigev_notify_function =
2705 my_aiocbp->aio_sigevent.sigev_notify_function;
2706 the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2707 my_aiocbp->aio_sigevent.sigev_notify_attributes;
2708}