]> git.saurik.com Git - apple/xnu.git/blame - bsd/kern/kern_aio.c
xnu-2782.40.9.tar.gz
[apple/xnu.git] / bsd / kern / kern_aio.c
CommitLineData
55e303ae 1/*
fe8ab488 2 * Copyright (c) 2003-2014 Apple Inc. All rights reserved.
55e303ae 3 *
2d21ac55 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
55e303ae 5 *
2d21ac55
A
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
8f6c56a5 14 *
2d21ac55
A
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
8f6c56a5
A
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
2d21ac55
A
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
8f6c56a5 25 *
2d21ac55 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
55e303ae
A
27 */
28
29
30/*
31 * todo:
32 * 1) ramesh is looking into how to replace taking a reference on
33 * the user's map (vm_map_reference()) since it is believed that
34 * would not hold the process for us.
35 * 2) david is looking into a way for us to set the priority of the
36 * worker threads to match that of the user's thread when the
37 * async IO was queued.
38 */
39
40
41/*
42 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
43 */
44
45#include <sys/systm.h>
55e303ae 46#include <sys/fcntl.h>
91447636 47#include <sys/file_internal.h>
55e303ae
A
48#include <sys/filedesc.h>
49#include <sys/kernel.h>
91447636 50#include <sys/vnode_internal.h>
55e303ae 51#include <sys/malloc.h>
91447636 52#include <sys/mount_internal.h>
55e303ae 53#include <sys/param.h>
91447636 54#include <sys/proc_internal.h>
55e303ae
A
55#include <sys/sysctl.h>
56#include <sys/unistd.h>
57#include <sys/user.h>
58
59#include <sys/aio_kern.h>
91447636 60#include <sys/sysproto.h>
55e303ae
A
61
62#include <machine/limits.h>
91447636
A
63
64#include <mach/mach_types.h>
65#include <kern/kern_types.h>
55e303ae
A
66#include <kern/zalloc.h>
67#include <kern/task.h>
91447636
A
68#include <kern/sched_prim.h>
69
70#include <vm/vm_map.h>
55e303ae 71
b0d623f7
A
72#include <libkern/OSAtomic.h>
73
55e303ae
A
74#include <sys/kdebug.h>
75#define AIO_work_queued 1
76#define AIO_worker_wake 2
77#define AIO_completion_sig 3
78#define AIO_completion_cleanup_wait 4
79#define AIO_completion_cleanup_wake 5
80#define AIO_completion_suspend_wake 6
81#define AIO_fsync_delay 7
82#define AIO_cancel 10
83#define AIO_cancel_async_workq 11
84#define AIO_cancel_sync_workq 12
85#define AIO_cancel_activeq 13
86#define AIO_cancel_doneq 14
87#define AIO_fsync 20
88#define AIO_read 30
89#define AIO_write 40
90#define AIO_listio 50
91#define AIO_error 60
92#define AIO_error_val 61
93#define AIO_error_activeq 62
94#define AIO_error_workq 63
95#define AIO_return 70
96#define AIO_return_val 71
97#define AIO_return_activeq 72
98#define AIO_return_workq 73
99#define AIO_exec 80
100#define AIO_exit 90
101#define AIO_exit_sleep 91
102#define AIO_close 100
103#define AIO_close_sleep 101
104#define AIO_suspend 110
105#define AIO_suspend_sleep 111
106#define AIO_worker_thread 120
107
108#if 0
109#undef KERNEL_DEBUG
110#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
111#endif
112
113/*
114 * aio requests queue up on the aio_async_workq or lio_sync_workq (for
115 * lio_listio LIO_WAIT). Requests then move to the per process aio_activeq
116 * (proc.aio_activeq) when one of our worker threads start the IO.
117 * And finally, requests move to the per process aio_doneq (proc.aio_doneq)
118 * when the IO request completes. The request remains on aio_doneq until
119 * user process calls aio_return or the process exits, either way that is our
120 * trigger to release aio resources.
121 */
b0d623f7
A
/*
 * A queue of pending async IO requests awaiting a worker thread.
 * Each queue has its own mutex (aioq_mtx) protecting the entry list
 * and count, plus a wait queue that idle worker threads sleep on.
 */
typedef struct aio_workq {
	TAILQ_HEAD(, aio_workq_entry) 	aioq_entries;	/* pending entries, FIFO order */
	int				aioq_count;	/* number of entries on aioq_entries */
	lck_mtx_t			aioq_mtx;	/* protects this queue */
	wait_queue_t			aioq_waitq;	/* idle workers block here */
} *aio_workq_t;
128
129#define AIO_NUM_WORK_QUEUES 1
55e303ae
A
/*
 * Global anchor for all AIO bookkeeping: system-wide entry counts
 * (maintained with OSAtomic ops) and the hash table of async work
 * queues.  With AIO_NUM_WORK_QUEUES == 1 the "hash" is degenerate and
 * every entry maps to queue 0 (see aio_entry_workq()).
 */
struct aio_anchor_cb
{
	volatile int32_t	aio_inflight_count; 	/* entries that have been taken from a workq */
	volatile int32_t	aio_done_count; 	/* entries on all done queues (proc.aio_doneq) */
	volatile int32_t	aio_total_count;	/* total extant entries */

	/* Hash table of queues here */
	int 			aio_num_workqs;
	struct aio_workq 	aio_async_workqs[AIO_NUM_WORK_QUEUES];
};
typedef struct aio_anchor_cb aio_anchor_cb;
141
b0d623f7
A
/*
 * Per-lio_listio() group context: counts the IOs issued and completed
 * for the group, and records whether a caller is blocked (LIO_WAIT)
 * waiting for the whole group to finish.
 */
struct aio_lio_context
{
	int		io_waiter;	/* nonzero while a caller waits on the group */
	int		io_issued;	/* IOs successfully queued for this group */
	int		io_completed;	/* IOs that have finished */
};
typedef struct aio_lio_context aio_lio_context;
149
55e303ae
A
150
151/*
152 * Notes on aio sleep / wake channels.
153 * We currently pick a couple fields within the proc structure that will allow
154 * us sleep channels that currently do not collide with any other kernel routines.
155 * At this time, for binary compatibility reasons, we cannot create new proc fields.
156 */
b0d623f7
A
157#define AIO_SUSPEND_SLEEP_CHAN p_aio_active_count
158#define AIO_CLEANUP_SLEEP_CHAN p_aio_total_count
55e303ae 159
b0d623f7
A
160#define ASSERT_AIO_FROM_PROC(aiop, theproc) \
161 if ((aiop)->procp != (theproc)) { \
162 panic("AIO on a proc list that does not belong to that proc.\n"); \
163 }
55e303ae
A
164
165/*
166 * LOCAL PROTOTYPES
167 */
b0d623f7
A
168static void aio_proc_lock(proc_t procp);
169static void aio_proc_lock_spin(proc_t procp);
170static void aio_proc_unlock(proc_t procp);
171static lck_mtx_t* aio_proc_mutex(proc_t procp);
172static void aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp);
173static void aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp);
174static int aio_get_process_count(proc_t procp );
175static int aio_active_requests_for_process(proc_t procp );
176static int aio_proc_active_requests_for_file(proc_t procp, int fd);
177static boolean_t is_already_queued(proc_t procp, user_addr_t aiocbp );
178static boolean_t should_cancel(aio_workq_entry *entryp, user_addr_t aiocbp, int fd);
179
180static void aio_entry_lock(aio_workq_entry *entryp);
181static void aio_entry_lock_spin(aio_workq_entry *entryp);
182static aio_workq_t aio_entry_workq(aio_workq_entry *entryp);
183static lck_mtx_t* aio_entry_mutex(__unused aio_workq_entry *entryp);
184static void aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
185static void aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
186static void aio_entry_ref_locked(aio_workq_entry *entryp);
187static void aio_entry_unref_locked(aio_workq_entry *entryp);
188static void aio_entry_ref(aio_workq_entry *entryp);
189static void aio_entry_unref(aio_workq_entry *entryp);
190static void aio_entry_update_for_cancel(aio_workq_entry *entryp, boolean_t cancelled,
191 int wait_for_completion, boolean_t disable_notification);
192static int aio_entry_try_workq_remove(aio_workq_entry *entryp);
55e303ae 193static boolean_t aio_delay_fsync_request( aio_workq_entry *entryp );
b0d623f7
A
194static int aio_free_request(aio_workq_entry *entryp);
195
196static void aio_workq_init(aio_workq_t wq);
197static void aio_workq_lock_spin(aio_workq_t wq);
198static void aio_workq_unlock(aio_workq_t wq);
199static lck_mtx_t* aio_workq_mutex(aio_workq_t wq);
200
201static void aio_work_thread( void );
202static aio_workq_entry *aio_get_some_work( void );
203
204static int aio_get_all_queues_count( void );
205static int aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO );
206static int aio_validate( aio_workq_entry *entryp );
207static int aio_increment_total_count(void);
208static int aio_decrement_total_count(void);
209
210static int do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp, int wait_for_completion, boolean_t disable_notification );
211static void do_aio_completion( aio_workq_entry *entryp );
212static int do_aio_fsync( aio_workq_entry *entryp );
213static int do_aio_read( aio_workq_entry *entryp );
214static int do_aio_write( aio_workq_entry *entryp );
215static void do_munge_aiocb_user32_to_user( struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
216static void do_munge_aiocb_user64_to_user( struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
217static int lio_create_entry(proc_t procp,
218 user_addr_t aiocbp,
219 void *group_tag,
220 aio_workq_entry **entrypp );
221static aio_workq_entry *aio_create_queue_entry(proc_t procp,
222 user_addr_t aiocbp,
223 void *group_tag,
224 int kindOfIO);
225static user_addr_t *aio_copy_in_list(proc_t procp, user_addr_t aiocblist, int nent);
226static void free_lio_context(aio_lio_context* context);
227static void aio_enqueue_work( proc_t procp, aio_workq_entry *entryp, int proc_locked);
228
229#define ASSERT_AIO_PROC_LOCK_OWNED(p) lck_mtx_assert(aio_proc_mutex((p)), LCK_MTX_ASSERT_OWNED)
230#define ASSERT_AIO_WORKQ_LOCK_OWNED(q) lck_mtx_assert(aio_workq_mutex((q)), LCK_MTX_ASSERT_OWNED)
231#define ASSERT_AIO_ENTRY_LOCK_OWNED(e) lck_mtx_assert(aio_entry_mutex((e)), LCK_MTX_ASSERT_OWNED)
91447636 232
55e303ae
A
233/*
234 * EXTERNAL PROTOTYPES
235 */
236
237/* in ...bsd/kern/sys_generic.c */
b0d623f7
A
238extern int dofileread(vfs_context_t ctx, struct fileproc *fp,
239 user_addr_t bufp, user_size_t nbyte,
240 off_t offset, int flags, user_ssize_t *retval );
241extern int dofilewrite(vfs_context_t ctx, struct fileproc *fp,
242 user_addr_t bufp, user_size_t nbyte, off_t offset,
243 int flags, user_ssize_t *retval );
244#if DEBUG
245static uint32_t lio_contexts_alloced = 0;
246#endif /* DEBUG */
55e303ae
A
247
248/*
249 * aio external global variables.
250 */
b0d623f7 251extern int aio_max_requests; /* AIO_MAX - configurable */
55e303ae 252extern int aio_max_requests_per_process; /* AIO_PROCESS_MAX - configurable */
b0d623f7 253extern int aio_worker_threads; /* AIO_THREAD_COUNT - configurable */
55e303ae
A
254
255
256/*
257 * aio static variables.
258 */
b0d623f7
A
259static aio_anchor_cb aio_anchor;
260static lck_grp_t *aio_proc_lock_grp;
261static lck_grp_t *aio_entry_lock_grp;
262static lck_grp_t *aio_queue_lock_grp;
263static lck_attr_t *aio_lock_attr;
264static lck_grp_attr_t *aio_lock_grp_attr;
265static struct zone *aio_workq_zonep;
266static lck_mtx_t aio_entry_mtx;
267static lck_mtx_t aio_proc_mtx;
268
/*
 * Lock the AIO entry mutex.  The entry argument is unused because all
 * entries currently share the single global aio_entry_mtx.
 */
static void
aio_entry_lock(__unused aio_workq_entry *entryp)
{
	lck_mtx_lock(&aio_entry_mtx);
}
274
/* Spin-acquire the (shared, global) AIO entry mutex; entryp is unused. */
static void
aio_entry_lock_spin(__unused aio_workq_entry *entryp)
{
	lck_mtx_lock_spin(&aio_entry_mtx);
}
280
/* Release the (shared, global) AIO entry mutex; entryp is unused. */
static void
aio_entry_unlock(__unused aio_workq_entry *entryp)
{
	lck_mtx_unlock(&aio_entry_mtx);
}
286
/* Hash: map an entry to its work queue.  With AIO_NUM_WORK_QUEUES == 1
 * this always selects queue 0. */
static aio_workq_t
aio_entry_workq(__unused aio_workq_entry *entryp)
{
	return &aio_anchor.aio_async_workqs[0];
}
293
/* Return the mutex guarding an entry (currently the single global one). */
static lck_mtx_t*
aio_entry_mutex(__unused aio_workq_entry *entryp)
{
	return &aio_entry_mtx;
}
299
/*
 * Initialize one async work queue: empty entry list, zero count, a
 * fresh mutex, and a FIFO wait queue for worker threads to sleep on.
 */
static void
aio_workq_init(aio_workq_t wq)
{
	TAILQ_INIT(&wq->aioq_entries);
	wq->aioq_count = 0;
	lck_mtx_init(&wq->aioq_mtx, aio_queue_lock_grp, aio_lock_attr);
	wq->aioq_waitq = wait_queue_alloc(SYNC_POLICY_FIFO);
}
308
309
/*
 * Unlink an entry from a work queue.  Caller must hold the queue lock.
 * Can be passed a queue which is locked spin.
 *
 * tqe_prev == NULL is used as the "not on any workq" sentinel, both
 * checked on entry (panic if already off-queue) and stored on exit so
 * aio_entry_try_workq_remove() can test membership without the lock.
 */
static void
aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
{
	ASSERT_AIO_WORKQ_LOCK_OWNED(queue);

	if (entryp->aio_workq_link.tqe_prev == NULL) {
		panic("Trying to remove an entry from a work queue, but it is not on a queue\n");
	}

	TAILQ_REMOVE(&queue->aioq_entries, entryp, aio_workq_link);
	queue->aioq_count--;
	entryp->aio_workq_link.tqe_prev = NULL; /* Not on a workq */

	/* sanity: the count must never underflow */
	if (queue->aioq_count < 0) {
		panic("Negative count on a queue.\n");
	}
}
330
/*
 * Append an entry to the tail of a work queue.  Caller must hold the
 * queue lock.
 */
static void
aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
{
	ASSERT_AIO_WORKQ_LOCK_OWNED(queue);

	TAILQ_INSERT_TAIL(&queue->aioq_entries, entryp, aio_workq_link);
	/* NOTE(review): sanity check runs before the increment, so it
	 * detects corruption from earlier operations, not this one. */
	if (queue->aioq_count < 0) {
		panic("Negative count on a queue.\n");
	}
	queue->aioq_count++;
}
342
/* Acquire the per-proc AIO lock (the proc's p_mlock). */
static void
aio_proc_lock(proc_t procp)
{
	lck_mtx_lock(aio_proc_mutex(procp));
}
348
/* Spin-acquire the per-proc AIO lock (the proc's p_mlock). */
static void
aio_proc_lock_spin(proc_t procp)
{
	lck_mtx_lock_spin(aio_proc_mutex(procp));
}
354
/*
 * Move an entry from the proc's active queue to its done queue and
 * update the per-proc active count and the global done count.
 * Caller must hold the proc's AIO lock.
 */
static void
aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp)
{
	ASSERT_AIO_PROC_LOCK_OWNED(procp);

	TAILQ_REMOVE(&procp->p_aio_activeq, entryp, aio_proc_link );
	TAILQ_INSERT_TAIL( &procp->p_aio_doneq, entryp, aio_proc_link);
	procp->p_aio_active_count--;
	OSIncrementAtomic(&aio_anchor.aio_done_count);
}
365
366static void
367aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp)
368{
369 TAILQ_REMOVE(&procp->p_aio_doneq, entryp, aio_proc_link);
370 OSDecrementAtomic(&aio_anchor.aio_done_count);
371 aio_decrement_total_count();
372 procp->p_aio_total_count--;
373}
374
/* Release the per-proc AIO lock (the proc's p_mlock). */
static void
aio_proc_unlock(proc_t procp)
{
	lck_mtx_unlock(aio_proc_mutex(procp));
}
380
/* The proc's general mutex doubles as its AIO lock (see the sleep-channel
 * notes above: no new proc fields may be added for AIO). */
static lck_mtx_t*
aio_proc_mutex(proc_t procp)
{
	return &procp->p_mlock;
}
386
/*
 * Take a reference on an entry.  Caller must hold the entry lock.
 * Panics if the refcount has already gone negative (corruption).
 */
static void
aio_entry_ref_locked(aio_workq_entry *entryp)
{
	ASSERT_AIO_ENTRY_LOCK_OWNED(entryp);

	if (entryp->aio_refcount < 0) {
		panic("AIO workq entry with a negative refcount.\n");
	}
	entryp->aio_refcount++;
}
397
398
/*
 * Drop a reference on an entry.  Caller must hold the entry lock.
 * Does NOT free the entry even when the count hits zero -- freeing is
 * the caller's job (see aio_entry_unref(), which checks AIO_DO_FREE).
 * Panics if the refcount underflows.
 */
static void
aio_entry_unref_locked(aio_workq_entry *entryp)
{
	ASSERT_AIO_ENTRY_LOCK_OWNED(entryp);

	entryp->aio_refcount--;
	if (entryp->aio_refcount < 0) {
		panic("AIO workq entry with a negative refcount.\n");
	}
}
410
/* Take an entry reference, handling the locking internally. */
static void
aio_entry_ref(aio_workq_entry *entryp)
{
	aio_entry_lock_spin(entryp);
	aio_entry_ref_locked(entryp);
	aio_entry_unlock(entryp);
}
418static void
419aio_entry_unref(aio_workq_entry *entryp)
420{
421 aio_entry_lock_spin(entryp);
422 aio_entry_unref_locked(entryp);
423
424 if ((entryp->aio_refcount == 0) && ((entryp->flags & AIO_DO_FREE) != 0)) {
425 aio_entry_unlock(entryp);
426 aio_free_request(entryp);
427 } else {
428 aio_entry_unlock(entryp);
429 }
430
431 return;
432}
433
/*
 * Update an entry's state for cancellation, under the entry lock.
 * When the IO was actually cancelled, record ECANCELED/-1 as its
 * result and take a reference on behalf of the canceller (released
 * when completion processing finishes).  wait_for_completion is a
 * flag value (AIO_CLOSE_WAIT / AIO_EXIT_WAIT) OR'd into flags to
 * request special completion wakeups; disable_notification suppresses
 * the completion signal.
 */
static void
aio_entry_update_for_cancel(aio_workq_entry *entryp, boolean_t cancelled, int wait_for_completion, boolean_t disable_notification)
{
	aio_entry_lock_spin(entryp);

	if (cancelled) {
		aio_entry_ref_locked(entryp);
		entryp->errorval = ECANCELED;
		entryp->returnval = -1;
	}

	if ( wait_for_completion ) {
		entryp->flags |= wait_for_completion; /* flag for special completion processing */
	}

	if ( disable_notification ) {
		entryp->flags |= AIO_DISABLE; /* Don't want a signal */
	}

	aio_entry_unlock(entryp);
}
455
/*
 * Try to pull an entry off its work queue before a worker claims it.
 * Returns 1 if we removed it (the IO can be cancelled), 0 otherwise.
 *
 * Uses an unlocked peek at tqe_prev (NULL == not on a workq) followed
 * by a re-check under the queue lock, since a worker may dequeue the
 * entry between the peek and the lock acquisition.
 */
static int
aio_entry_try_workq_remove(aio_workq_entry *entryp)
{
	/* Can only be cancelled if it's still on a work queue */
	if (entryp->aio_workq_link.tqe_prev != NULL) {
		aio_workq_t queue;

		/* Will have to check again under the lock */
		queue = aio_entry_workq(entryp);
		aio_workq_lock_spin(queue);
		if (entryp->aio_workq_link.tqe_prev != NULL) {
			aio_workq_remove_entry_locked(queue, entryp);
			aio_workq_unlock(queue);
			return 1;
		} else {
			aio_workq_unlock(queue);
		}
	}

	return 0;
}
477
/* Spin-acquire a work queue's mutex. */
static void
aio_workq_lock_spin(aio_workq_t wq)
{
	lck_mtx_lock_spin(aio_workq_mutex(wq));
}
55e303ae 483
b0d623f7
A
/* Release a work queue's mutex. */
static void
aio_workq_unlock(aio_workq_t wq)
{
	lck_mtx_unlock(aio_workq_mutex(wq));
}
55e303ae 489
b0d623f7
A
/* Return a work queue's embedded mutex. */
static lck_mtx_t*
aio_workq_mutex(aio_workq_t wq)
{
	return &wq->aioq_mtx;
}
55e303ae
A
495
/*
 * aio_cancel - attempt to cancel one or more async IO requests currently
 * outstanding against file descriptor uap->fd.  If uap->aiocbp is not
 * NULL then only one specific IO is cancelled (if possible).  If uap->aiocbp
 * is NULL then all outstanding async IO request for the given file
 * descriptor are cancelled (if possible).
 *
 * *retval receives AIO_CANCELED / AIO_NOTCANCELED / AIO_ALLDONE on
 * success, or -1 on error; the function's return is the errno value.
 */
int
aio_cancel(proc_t p, struct aio_cancel_args *uap, int *retval )
{
	struct user_aiocb		my_aiocb;
	int				result;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START,
		      (int)p, (int)uap->aiocbp, 0, 0, 0 );

	/* quick check to see if there are any async IO requests queued up */
	if (aio_get_all_queues_count() < 1) {
		result = 0;
		*retval = AIO_ALLDONE;
		goto ExitRoutine;
	}

	*retval = -1;
	if ( uap->aiocbp != USER_ADDR_NULL ) {
		/* Copy in the user's aiocb, munging 32- or 64-bit layout into
		 * the kernel's common struct user_aiocb form. */
		if ( proc_is64bit(p) ) {
			struct user64_aiocb aiocb64;

			result = copyin( uap->aiocbp, &aiocb64, sizeof(aiocb64) );
			if (result == 0 )
				do_munge_aiocb_user64_to_user(&aiocb64, &my_aiocb);

		} else {
			struct user32_aiocb aiocb32;

			result = copyin( uap->aiocbp, &aiocb32, sizeof(aiocb32) );
			if ( result == 0 )
				do_munge_aiocb_user32_to_user( &aiocb32, &my_aiocb );
		}

		/* NOTE(review): copyin failure is reported as EAGAIN rather
		 * than the more conventional EFAULT -- preserved as-is. */
		if ( result != 0 ) {
			result = EAGAIN;
			goto ExitRoutine;
		}

		/* NOTE - POSIX standard says a mismatch between the file */
		/* descriptor passed in and the file descriptor embedded in */
		/* the aiocb causes unspecified results.  We return EBADF in */
		/* that situation. */
		if ( uap->fd != my_aiocb.aio_fildes ) {
			result = EBADF;
			goto ExitRoutine;
		}
	}

	aio_proc_lock(p);
	result = do_aio_cancel_locked( p, uap->fd, uap->aiocbp, 0, FALSE );
	ASSERT_AIO_PROC_LOCK_OWNED(p);
	aio_proc_unlock(p);

	/* -1 from do_aio_cancel_locked() means no matching request found */
	if ( result != -1 ) {
		*retval = result;
		result = 0;
		goto ExitRoutine;
	}

	result = EBADF;

ExitRoutine:
	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END,
		      (int)p, (int)uap->aiocbp, result, 0, 0 );

	return( result );

} /* aio_cancel */
571
572
/*
 * _aio_close - internal function used to clean up async IO requests for
 * a file descriptor that is closing.
 * THIS MAY BLOCK.
 */
__private_extern__ void
_aio_close(proc_t p, int fd )
{
	int			error;

	/* quick check to see if there are any async IO requests queued up */
	if (aio_get_all_queues_count() < 1) {
		return;
	}

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_START,
		      (int)p, fd, 0, 0, 0 );

	/* cancel all async IO requests on our todo queues for this file descriptor */
	aio_proc_lock(p);
	error = do_aio_cancel_locked( p, fd, 0, AIO_CLOSE_WAIT, FALSE );
	ASSERT_AIO_PROC_LOCK_OWNED(p);
	if ( error == AIO_NOTCANCELED ) {
		/*
		 * AIO_NOTCANCELED is returned when we find an aio request for this process
		 * and file descriptor on the active async IO queue.  Active requests cannot
		 * be cancelled so we must wait for them to complete.  We will get a special
		 * wake up call on our channel used to sleep for ALL active requests to
		 * complete.  This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
		 * when we must wait for all active aio requests.
		 */

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) | DBG_FUNC_NONE,
			      (int)p, fd, 0, 0, 0 );

		/* msleep drops and re-takes the proc mutex, so re-test the
		 * active count for this fd each time we wake. */
		while (aio_proc_active_requests_for_file(p, fd) > 0) {
			msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_close", 0 );
		}

	}

	aio_proc_unlock(p);

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_END,
		      (int)p, fd, 0, 0, 0 );

	return;

} /* _aio_close */
622
623
/*
 * aio_error - return the error status associated with the async IO
 * request referred to by uap->aiocbp.  The error status is the errno
 * value that would be set by the corresponding IO request (read, write,
 * fdatasync, or sync).  *retval gets EINPROGRESS for still-active IOs.
 */
int
aio_error(proc_t p, struct aio_error_args *uap, int *retval )
{
	aio_workq_entry		*entryp;
	int			error;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_START,
		      (int)p, (int)uap->aiocbp, 0, 0, 0 );

	/* see if there are any aios to check */
	/* NOTE(review): this early return skips the DBG_FUNC_END trace
	 * event emitted on every other exit path -- preserved as-is. */
	if (aio_get_all_queues_count() < 1) {
		return EINVAL;
	}

	aio_proc_lock(p);

	/* look for a match on our queue of async IO requests that have completed */
	TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
		if ( entryp->uaiocbp == uap->aiocbp ) {
			ASSERT_AIO_FROM_PROC(entryp, p);

			/* entry lock taken so errorval is read consistently */
			aio_entry_lock_spin(entryp);
			*retval = entryp->errorval;
			error = 0;
			aio_entry_unlock(entryp);
			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) | DBG_FUNC_NONE,
				      (int)p, (int)uap->aiocbp, *retval, 0, 0 );
			goto ExitRoutine;
		}
	}

	/* look for a match on our queue of active async IO requests */
	TAILQ_FOREACH( entryp, &p->p_aio_activeq, aio_proc_link) {
		if ( entryp->uaiocbp == uap->aiocbp ) {
			ASSERT_AIO_FROM_PROC(entryp, p);
			*retval = EINPROGRESS;
			error = 0;
			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) | DBG_FUNC_NONE,
				      (int)p, (int)uap->aiocbp, *retval, 0, 0 );
			goto ExitRoutine;
		}
	}

	/* no matching request on either queue */
	error = EINVAL;

ExitRoutine:
	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_END,
		      (int)p, (int)uap->aiocbp, error, 0, 0 );
	aio_proc_unlock(p);

	return( error );

} /* aio_error */
683
684
/*
 * aio_fsync - asynchronously force all IO operations associated
 * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
 * queued at the time of the call to the synchronized completion state.
 * NOTE - we do not support op O_DSYNC at this point since we do not support the
 * fdatasync() call.
 */
int
aio_fsync(proc_t p, struct aio_fsync_args *uap, int *retval )
{
	int			error;
	int			fsync_kind;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_START,
		      (int)p, (int)uap->aiocbp, uap->op, 0, 0 );

	*retval = 0;
	/* 0 := O_SYNC for binary backward compatibility with Panther */
	if (uap->op == O_SYNC || uap->op == 0)
		fsync_kind = AIO_FSYNC;
	else if ( uap->op == O_DSYNC )
		fsync_kind = AIO_DSYNC;
	else {
		*retval = -1;
		error = EINVAL;
		goto ExitRoutine;
	}

	/* queue the fsync as a regular async request; the worker thread
	 * performs the actual sync (see do_aio_fsync). */
	error = aio_queue_async_request( p, uap->aiocbp, fsync_kind );
	if ( error != 0 )
		*retval = -1;

ExitRoutine:
	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_END,
		      (int)p, (int)uap->aiocbp, error, 0, 0 );

	return( error );

} /* aio_fsync */
724
725
/* aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
 * file descriptor (uap->aiocbp->aio_fildes) into the buffer
 * (uap->aiocbp->aio_buf).  Returns 0 on successful queuing; on error
 * *retval is set to -1 and the errno value is returned.
 */
int
aio_read(proc_t p, struct aio_read_args *uap, int *retval )
{
	int			error;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_START,
		      (int)p, (int)uap->aiocbp, 0, 0, 0 );

	*retval = 0;

	error = aio_queue_async_request( p, uap->aiocbp, AIO_READ );
	if ( error != 0 )
		*retval = -1;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_END,
		      (int)p, (int)uap->aiocbp, error, 0, 0 );

	return( error );

} /* aio_read */
750
751
/*
 * aio_return - return the return status associated with the async IO
 * request referred to by uap->aiocbp.  The return status is the value
 * that would be returned by corresponding IO request (read, write,
 * fdatasync, or sync).  This is where we release kernel resources
 * held for async IO call associated with the given aiocb pointer.
 */
int
aio_return(proc_t p, struct aio_return_args *uap, user_ssize_t *retval )
{
	aio_workq_entry		*entryp;
	int			error;
	boolean_t		proc_lock_held = FALSE;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_START,
		      (int)p, (int)uap->aiocbp, 0, 0, 0 );

	/* See if there are any entries to check */
	if (aio_get_all_queues_count() < 1) {
		error = EINVAL;
		goto ExitRoutine;
	}

	aio_proc_lock(p);
	proc_lock_held = TRUE;
	*retval = 0;

	/* look for a match on our queue of async IO requests that have completed */
	TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
		ASSERT_AIO_FROM_PROC(entryp, p);
		if ( entryp->uaiocbp == uap->aiocbp ) {
			/* Done and valid for aio_return(), pull it off the list */
			aio_proc_remove_done_locked(p, entryp);

			/* Drop the proc lock, but keep the entry locked */
			aio_entry_lock(entryp);
			aio_proc_unlock(p);
			proc_lock_held = FALSE;

			*retval = entryp->returnval;
			error = 0;

			/* No references and off all lists, safe to free */
			if (entryp->aio_refcount == 0) {
				aio_entry_unlock(entryp);
				aio_free_request(entryp);
			}
			else {
				/* Whoever has the refcount will have to free it */
				entryp->flags |= AIO_DO_FREE;
				aio_entry_unlock(entryp);
			}


			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) | DBG_FUNC_NONE,
				      (int)p, (int)uap->aiocbp, *retval, 0, 0 );
			goto ExitRoutine;
		}
	}

	/* look for a match on our queue of active async IO requests */
	TAILQ_FOREACH( entryp, &p->p_aio_activeq, aio_proc_link) {
		ASSERT_AIO_FROM_PROC(entryp, p);
		if ( entryp->uaiocbp == uap->aiocbp ) {
			/* still running: POSIX says aio_return() here is invalid */
			error = EINPROGRESS;
			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) | DBG_FUNC_NONE,
				      (int)p, (int)uap->aiocbp, *retval, 0, 0 );
			goto ExitRoutine;
		}
	}

	error = EINVAL;

ExitRoutine:
	if (proc_lock_held)
		aio_proc_unlock(p);
	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_END,
		      (int)p, (int)uap->aiocbp, error, 0, 0 );

	return( error );

} /* aio_return */
834
835
/*
 * _aio_exec - internal function used to clean up async IO requests for
 * a process that is going away due to exec().  We cancel any async IOs
 * we can and wait for those already active.  We also disable signaling
 * for cancelled or active aio requests that complete.
 * This routine MAY block!
 *
 * Currently just delegates to _aio_exit() (the cleanup required for
 * exec and exit is identical), wrapped in its own kdebug trace pair.
 */
__private_extern__ void
_aio_exec(proc_t p )
{

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_START,
		      (int)p, 0, 0, 0, 0 );

	_aio_exit( p );

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_END,
		      (int)p, 0, 0, 0, 0 );

	return;

} /* _aio_exec */
858
859
/*
 * _aio_exit - internal function used to clean up async IO requests for
 * a process that is terminating (via exit() or exec() ).  We cancel any async IOs
 * we can and wait for those already active.  We also disable signaling
 * for cancelled or active aio requests that complete.  This routine MAY block!
 */
__private_extern__ void
_aio_exit(proc_t p )
{
	int			error;
	aio_workq_entry 	*entryp;


	/* quick check to see if there are any async IO requests queued up */
	if (aio_get_all_queues_count() < 1) {
		return;
	}

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_START,
		      (int)p, 0, 0, 0, 0 );

	aio_proc_lock(p);

	/*
	 * cancel async IO requests on the todo work queue and wait for those
	 * already active to complete.
	 */
	error = do_aio_cancel_locked( p, 0, 0, AIO_EXIT_WAIT, TRUE );
	ASSERT_AIO_PROC_LOCK_OWNED(p);
	if ( error == AIO_NOTCANCELED ) {
		/*
		 * AIO_NOTCANCELED is returned when we find an aio request for this process
		 * on the active async IO queue.  Active requests cannot be cancelled so we
		 * must wait for them to complete.  We will get a special wake up call on
		 * our channel used to sleep for ALL active requests to complete.  This sleep
		 * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
		 * active aio requests.
		 */

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) | DBG_FUNC_NONE,
			      (int)p, 0, 0, 0, 0 );

		/* msleep drops/retakes the proc lock; re-check after each wake */
		while (p->p_aio_active_count != 0) {
			msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_exit", 0 );
		}
	}

	/* sanity: after cancellation+waiting, nothing may still be active */
	if (p->p_aio_active_count != 0) {
		panic("Exiting process has %d active AIOs after cancellation has completed.\n", p->p_aio_active_count);
	}

	/* release all aio resources used by this process */
	entryp = TAILQ_FIRST( &p->p_aio_doneq );
	while ( entryp != NULL ) {
		ASSERT_AIO_FROM_PROC(entryp, p);
		aio_workq_entry		*next_entryp;

		next_entryp = TAILQ_NEXT( entryp, aio_proc_link);
		aio_proc_remove_done_locked(p, entryp);

		/* we cannot free requests that are still completing */
		aio_entry_lock_spin(entryp);
		if (entryp->aio_refcount == 0) {
			/* drop the proc lock before freeing; aio_free_request may block */
			aio_proc_unlock(p);
			aio_entry_unlock(entryp);
			aio_free_request(entryp);

			/* need to start over since aio_doneq may have been */
			/* changed while we were away.  */
			aio_proc_lock(p);
			entryp = TAILQ_FIRST( &p->p_aio_doneq );
			continue;
		}
		else {
			/* whoever has the reference will have to do the free */
			entryp->flags |= AIO_DO_FREE;
		}

		aio_entry_unlock(entryp);
		entryp = next_entryp;
	}

	aio_proc_unlock(p);

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END,
		      (int)p, 0, 0, 0, 0 );
	return;

} /* _aio_exit */
949
950
b0d623f7
A
951static boolean_t
952should_cancel(aio_workq_entry *entryp, user_addr_t aiocbp, int fd)
953{
954 if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
955 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
956 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
957 return TRUE;
958 }
959
960 return FALSE;
961}
962
55e303ae 963/*
b0d623f7 964 * do_aio_cancel_locked - cancel async IO requests (if possible). We get called by
55e303ae
A
965 * aio_cancel, close, and at exit.
966 * There are three modes of operation: 1) cancel all async IOs for a process -
967 * fd is 0 and aiocbp is NULL 2) cancel all async IOs for file descriptor - fd
968 * is > 0 and aiocbp is NULL 3) cancel one async IO associated with the given
969 * aiocbp.
970 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
971 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
972 * target async IO requests, and AIO_ALLDONE if all target async IO requests
973 * were already complete.
974 * WARNING - do not deference aiocbp in this routine, it may point to user
975 * land data that has not been copied in (when called from aio_cancel() )
b0d623f7
A
976 *
977 * Called with proc locked, and returns the same way.
55e303ae 978 */
55e303ae 979static int
b0d623f7
A
980do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp,
981 int wait_for_completion, boolean_t disable_notification )
55e303ae 982{
b0d623f7
A
983 ASSERT_AIO_PROC_LOCK_OWNED(p);
984
55e303ae
A
985 aio_workq_entry *entryp;
986 int result;
987
988 result = -1;
989
990 /* look for a match on our queue of async todo work. */
b0d623f7 991 entryp = TAILQ_FIRST(&p->p_aio_activeq);
55e303ae 992 while ( entryp != NULL ) {
b0d623f7 993 ASSERT_AIO_FROM_PROC(entryp, p);
55e303ae 994 aio_workq_entry *next_entryp;
55e303ae 995
b0d623f7
A
996 next_entryp = TAILQ_NEXT( entryp, aio_proc_link);
997 if (!should_cancel(entryp, aiocbp, fd)) {
998 entryp = next_entryp;
999 continue;
55e303ae 1000 }
b0d623f7
A
1001
1002 /* Can only be cancelled if it's still on a work queue */
1003 if (aio_entry_try_workq_remove(entryp) != 0) {
1004 /* Have removed from workq. Update entry state and take a ref */
1005 aio_entry_update_for_cancel(entryp, TRUE, 0, disable_notification);
1006
1007 /* Put on the proc done queue and update counts, then unlock the proc */
1008 aio_proc_move_done_locked(p, entryp);
1009 aio_proc_unlock(p);
1010
1011 /* Now it's officially cancelled. Do the completion */
1012 result = AIO_CANCELED;
1013 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) | DBG_FUNC_NONE,
1014 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
1015 do_aio_completion(entryp);
1016
1017 /* This will free if the aio_return() has already happened ... */
1018 aio_entry_unref(entryp);
1019 aio_proc_lock(p);
1020
1021 if ( aiocbp != USER_ADDR_NULL ) {
1022 return( result );
55e303ae 1023 }
55e303ae 1024
b0d623f7
A
1025 /*
1026 * Restart from the head of the proc active queue since it
1027 * may have been changed while we were away doing completion
1028 * processing.
1029 *
1030 * Note that if we found an uncancellable AIO before, we will
1031 * either find it again or discover that it's been completed,
1032 * so resetting the result will not cause us to return success
1033 * despite outstanding AIOs.
1034 */
1035 entryp = TAILQ_FIRST(&p->p_aio_activeq);
1036 result = -1; /* As if beginning anew */
1037 } else {
1038 /*
1039 * It's been taken off the active queue already, i.e. is in flight.
1040 * All we can do is ask for notification.
1041 */
55e303ae
A
1042 result = AIO_NOTCANCELED;
1043
1044 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE,
b0d623f7
A
1045 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
1046
1047 /* Mark for waiting and such; will not take a ref if "cancelled" arg is FALSE */
1048 aio_entry_update_for_cancel(entryp, FALSE, wait_for_completion, disable_notification);
55e303ae 1049
91447636 1050 if ( aiocbp != USER_ADDR_NULL ) {
55e303ae
A
1051 return( result );
1052 }
b0d623f7 1053 entryp = next_entryp;
55e303ae 1054 }
b0d623f7
A
1055 } /* while... */
1056
55e303ae
A
1057 /*
1058 * if we didn't find any matches on the todo or active queues then look for a
1059 * match on our queue of async IO requests that have completed and if found
1060 * return AIO_ALLDONE result.
b0d623f7
A
1061 *
1062 * Proc AIO lock is still held.
55e303ae
A
1063 */
1064 if ( result == -1 ) {
b0d623f7
A
1065 TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
1066 ASSERT_AIO_FROM_PROC(entryp, p);
1067 if (should_cancel(entryp, aiocbp, fd)) {
55e303ae 1068 result = AIO_ALLDONE;
55e303ae 1069 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE,
b0d623f7 1070 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
55e303ae 1071
91447636 1072 if ( aiocbp != USER_ADDR_NULL ) {
55e303ae
A
1073 return( result );
1074 }
1075 }
1076 }
1077 }
55e303ae
A
1078
1079 return( result );
1080
b0d623f7
A
1081}
1082 /* do_aio_cancel_locked */
55e303ae
A
1083
1084
1085/*
1086 * aio_suspend - suspend the calling thread until at least one of the async
1087 * IO operations referenced by uap->aiocblist has completed, until a signal
1088 * interrupts the function, or uap->timeoutp time interval (optional) has
1089 * passed.
1090 * Returns 0 if one or more async IOs have completed else -1 and errno is
1091 * set appropriately - EAGAIN if timeout elapses or EINTR if an interrupt
1092 * woke us up.
1093 */
2d21ac55
A
1094int
1095aio_suspend(proc_t p, struct aio_suspend_args *uap, int *retval )
1096{
1097 __pthread_testcancel(1);
1098 return(aio_suspend_nocancel(p, (struct aio_suspend_nocancel_args *)uap, retval));
1099}
1100
55e303ae
A
1101
1102int
2d21ac55 1103aio_suspend_nocancel(proc_t p, struct aio_suspend_nocancel_args *uap, int *retval )
55e303ae
A
1104{
1105 int error;
1106 int i, count;
1107 uint64_t abstime;
91447636 1108 struct user_timespec ts;
55e303ae 1109 aio_workq_entry *entryp;
91447636 1110 user_addr_t *aiocbpp;
55e303ae
A
1111
1112 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START,
1113 (int)p, uap->nent, 0, 0, 0 );
1114
1115 *retval = -1;
1116 abstime = 0;
1117 aiocbpp = NULL;
1118
b0d623f7 1119 count = aio_get_all_queues_count( );
55e303ae
A
1120 if ( count < 1 ) {
1121 error = EINVAL;
1122 goto ExitThisRoutine;
1123 }
1124
91447636 1125 if ( uap->nent < 1 || uap->nent > aio_max_requests_per_process ) {
55e303ae
A
1126 error = EINVAL;
1127 goto ExitThisRoutine;
1128 }
1129
91447636
A
1130 if ( uap->timeoutp != USER_ADDR_NULL ) {
1131 if ( proc_is64bit(p) ) {
b0d623f7
A
1132 struct user64_timespec temp;
1133 error = copyin( uap->timeoutp, &temp, sizeof(temp) );
1134 if ( error == 0 ) {
1135 ts.tv_sec = temp.tv_sec;
1136 ts.tv_nsec = temp.tv_nsec;
1137 }
91447636
A
1138 }
1139 else {
b0d623f7 1140 struct user32_timespec temp;
91447636
A
1141 error = copyin( uap->timeoutp, &temp, sizeof(temp) );
1142 if ( error == 0 ) {
1143 ts.tv_sec = temp.tv_sec;
1144 ts.tv_nsec = temp.tv_nsec;
1145 }
1146 }
55e303ae
A
1147 if ( error != 0 ) {
1148 error = EAGAIN;
1149 goto ExitThisRoutine;
1150 }
1151
2d21ac55 1152 if ( ts.tv_sec < 0 || ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000 ) {
55e303ae
A
1153 error = EINVAL;
1154 goto ExitThisRoutine;
1155 }
1156
1157 nanoseconds_to_absolutetime( (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
1158 &abstime );
1159 clock_absolutetime_interval_to_deadline( abstime, &abstime );
1160 }
1161
b0d623f7 1162 aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent);
55e303ae
A
1163 if ( aiocbpp == NULL ) {
1164 error = EAGAIN;
1165 goto ExitThisRoutine;
1166 }
1167
91447636 1168 /* check list of aio requests to see if any have completed */
2d21ac55 1169check_for_our_aiocbp:
b0d623f7 1170 aio_proc_lock_spin(p);
91447636
A
1171 for ( i = 0; i < uap->nent; i++ ) {
1172 user_addr_t aiocbp;
1173
55e303ae
A
1174 /* NULL elements are legal so check for 'em */
1175 aiocbp = *(aiocbpp + i);
91447636 1176 if ( aiocbp == USER_ADDR_NULL )
55e303ae 1177 continue;
91447636 1178
55e303ae 1179 /* return immediately if any aio request in the list is done */
b0d623f7
A
1180 TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
1181 ASSERT_AIO_FROM_PROC(entryp, p);
55e303ae 1182 if ( entryp->uaiocbp == aiocbp ) {
b0d623f7 1183 aio_proc_unlock(p);
55e303ae
A
1184 *retval = 0;
1185 error = 0;
55e303ae
A
1186 goto ExitThisRoutine;
1187 }
1188 }
55e303ae
A
1189 } /* for ( ; i < uap->nent; ) */
1190
1191 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE,
1192 (int)p, uap->nent, 0, 0, 0 );
1193
1194 /*
1195 * wait for an async IO to complete or a signal fires or timeout expires.
1196 * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
1197 * interrupts us. If an async IO completes before a signal fires or our
91447636 1198 * timeout expires, we get a wakeup call from aio_work_thread().
55e303ae 1199 */
91447636 1200
b0d623f7 1201 error = msleep1(&p->AIO_SUSPEND_SLEEP_CHAN, aio_proc_mutex(p), PCATCH | PWAIT | PDROP, "aio_suspend", abstime); /* XXX better priority? */
7e4a7d39 1202 if ( error == 0 ) {
2d21ac55
A
1203 /*
1204 * got our wakeup call from aio_work_thread().
1205 * Since we can get a wakeup on this channel from another thread in the
1206 * same process we head back up to make sure this is for the correct aiocbp.
1207 * If it is the correct aiocbp we will return from where we do the check
1208 * (see entryp->uaiocbp == aiocbp after check_for_our_aiocbp label)
1209 * else we will fall out and just sleep again.
1210 */
1211 goto check_for_our_aiocbp;
55e303ae 1212 }
7e4a7d39 1213 else if ( error == EWOULDBLOCK ) {
55e303ae
A
1214 /* our timeout expired */
1215 error = EAGAIN;
1216 }
1217 else {
1218 /* we were interrupted */
55e303ae
A
1219 error = EINTR;
1220 }
1221
1222ExitThisRoutine:
1223 if ( aiocbpp != NULL )
1224 FREE( aiocbpp, M_TEMP );
1225
1226 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_END,
1227 (int)p, uap->nent, error, 0, 0 );
1228
1229 return( error );
1230
1231} /* aio_suspend */
1232
1233
1234/* aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
1235 * file descriptor (uap->aiocbp->aio_fildes) from the buffer
1236 * (uap->aiocbp->aio_buf).
1237 */
1238
1239int
2d21ac55 1240aio_write(proc_t p, struct aio_write_args *uap, int *retval )
55e303ae
A
1241{
1242 int error;
1243
1244 *retval = 0;
1245
1246 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_START,
1247 (int)p, (int)uap->aiocbp, 0, 0, 0 );
1248
1249 error = aio_queue_async_request( p, uap->aiocbp, AIO_WRITE );
1250 if ( error != 0 )
1251 *retval = -1;
1252
1253 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_END,
1254 (int)p, (int)uap->aiocbp, error, 0, 0 );
1255
1256 return( error );
1257
1258} /* aio_write */
1259
1260
b0d623f7
A
1261static user_addr_t *
1262aio_copy_in_list(proc_t procp, user_addr_t aiocblist, int nent)
55e303ae 1263{
b0d623f7
A
1264 user_addr_t *aiocbpp;
1265 int i, result;
55e303ae 1266
b0d623f7
A
1267 /* we reserve enough space for largest possible pointer size */
1268 MALLOC( aiocbpp, user_addr_t *, (nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK );
1269 if ( aiocbpp == NULL )
1270 goto err;
1271
1272 /* copyin our aiocb pointers from list */
1273 result = copyin( aiocblist, aiocbpp,
1274 proc_is64bit(procp) ? (nent * sizeof(user64_addr_t))
1275 : (nent * sizeof(user32_addr_t)) );
1276 if ( result) {
1277 FREE( aiocbpp, M_TEMP );
1278 aiocbpp = NULL;
1279 goto err;
1280 }
1281
1282 /*
1283 * We depend on a list of user_addr_t's so we need to
1284 * munge and expand when these pointers came from a
1285 * 32-bit process
1286 */
1287 if ( !proc_is64bit(procp) ) {
1288 /* copy from last to first to deal with overlap */
1289 user32_addr_t *my_ptrp = ((user32_addr_t *)aiocbpp) + (nent - 1);
1290 user_addr_t *my_addrp = aiocbpp + (nent - 1);
1291
1292 for (i = 0; i < nent; i++, my_ptrp--, my_addrp--) {
1293 *my_addrp = (user_addr_t) (*my_ptrp);
1294 }
1295 }
1296
1297err:
1298 return (aiocbpp);
1299}
1300
1301
1302static int
1303aio_copy_in_sigev(proc_t procp, user_addr_t sigp, struct user_sigevent *sigev)
1304{
1305 int result = 0;
1306
1307 if (sigp == USER_ADDR_NULL)
1308 goto out;
1309
1310 /*
1311 * We need to munge aio_sigevent since it contains pointers.
1312 * Since we do not know if sigev_value is an int or a ptr we do
1313 * NOT cast the ptr to a user_addr_t. This means if we send
1314 * this info back to user space we need to remember sigev_value
1315 * was not expanded for the 32-bit case.
1316 *
1317 * Notes: This does NOT affect us since we don't support
1318 * sigev_value yet in the aio context.
1319 */
1320 if ( proc_is64bit(procp) ) {
1321 struct user64_sigevent sigevent64;
1322
1323 result = copyin( sigp, &sigevent64, sizeof(sigevent64) );
1324 if ( result == 0 ) {
1325 sigev->sigev_notify = sigevent64.sigev_notify;
1326 sigev->sigev_signo = sigevent64.sigev_signo;
1327 sigev->sigev_value.size_equivalent.sival_int = sigevent64.sigev_value.size_equivalent.sival_int;
1328 sigev->sigev_notify_function = sigevent64.sigev_notify_function;
1329 sigev->sigev_notify_attributes = sigevent64.sigev_notify_attributes;
1330 }
1331
1332 } else {
1333 struct user32_sigevent sigevent32;
1334
1335 result = copyin( sigp, &sigevent32, sizeof(sigevent32) );
1336 if ( result == 0 ) {
1337 sigev->sigev_notify = sigevent32.sigev_notify;
1338 sigev->sigev_signo = sigevent32.sigev_signo;
1339 sigev->sigev_value.size_equivalent.sival_int = sigevent32.sigev_value.sival_int;
1340 sigev->sigev_notify_function = CAST_USER_ADDR_T(sigevent32.sigev_notify_function);
1341 sigev->sigev_notify_attributes = CAST_USER_ADDR_T(sigevent32.sigev_notify_attributes);
1342 }
1343 }
1344
1345 if ( result != 0 ) {
1346 result = EAGAIN;
1347 }
1348
1349out:
1350 return (result);
1351}
1352
1353/*
1354 * aio_enqueue_work
1355 *
1356 * Queue up the entry on the aio asynchronous work queue in priority order
1357 * based on the relative priority of the request. We calculate the relative
1358 * priority using the nice value of the caller and the value
1359 *
1360 * Parameters: procp Process queueing the I/O
1361 * entryp The work queue entry being queued
1362 *
1363 * Returns: (void) No failure modes
1364 *
1365 * Notes: This function is used for both lio_listio and aio
1366 *
1367 * XXX: At some point, we may have to consider thread priority
1368 * rather than process priority, but we don't maintain the
1369 * adjusted priority for threads the POSIX way.
1370 *
1371 *
1372 * Called with proc locked.
1373 */
1374static void
1375aio_enqueue_work( proc_t procp, aio_workq_entry *entryp, int proc_locked)
1376{
1377#if 0
1378 aio_workq_entry *my_entryp; /* used for insertion sort */
1379#endif /* 0 */
1380 aio_workq_t queue = aio_entry_workq(entryp);
1381
1382 if (proc_locked == 0) {
1383 aio_proc_lock(procp);
1384 }
1385
1386 ASSERT_AIO_PROC_LOCK_OWNED(procp);
1387
1388 /* Onto proc queue */
1389 TAILQ_INSERT_TAIL(&procp->p_aio_activeq, entryp, aio_proc_link);
1390 procp->p_aio_active_count++;
1391 procp->p_aio_total_count++;
1392
1393 /* And work queue */
1394 aio_workq_lock_spin(queue);
1395 aio_workq_add_entry_locked(queue, entryp);
6d2010ae 1396 wait_queue_wakeup_one(queue->aioq_waitq, queue, THREAD_AWAKENED, -1);
b0d623f7
A
1397 aio_workq_unlock(queue);
1398
1399 if (proc_locked == 0) {
1400 aio_proc_unlock(procp);
1401 }
1402
1403#if 0
1404 /*
1405 * Procedure:
1406 *
1407 * (1) The nice value is in the range PRIO_MIN..PRIO_MAX [-20..20]
1408 * (2) The normalized nice value is in the range 0..((2 * NZERO) - 1)
1409 * which is [0..39], with 0 not being used. In nice values, the
1410 * lower the nice value, the higher the priority.
1411 * (3) The normalized scheduling prioritiy is the highest nice value
1412 * minus the current nice value. In I/O scheduling priority, the
1413 * higher the value the lower the priority, so it is the inverse
1414 * of the nice value (the higher the number, the higher the I/O
1415 * priority).
1416 * (4) From the normalized scheduling priority, we subtract the
1417 * request priority to get the request priority value number;
1418 * this means that requests are only capable of depressing their
1419 * priority relative to other requests,
1420 */
1421 entryp->priority = (((2 * NZERO) - 1) - procp->p_nice);
1422
1423 /* only premit depressing the priority */
1424 if (entryp->aiocb.aio_reqprio < 0)
1425 entryp->aiocb.aio_reqprio = 0;
1426 if (entryp->aiocb.aio_reqprio > 0) {
1427 entryp->priority -= entryp->aiocb.aio_reqprio;
1428 if (entryp->priority < 0)
1429 entryp->priority = 0;
1430 }
1431
1432 /* Insertion sort the entry; lowest ->priority to highest */
1433 TAILQ_FOREACH(my_entryp, &aio_anchor.aio_async_workq, aio_workq_link) {
1434 if ( entryp->priority <= my_entryp->priority) {
1435 TAILQ_INSERT_BEFORE(my_entryp, entryp, aio_workq_link);
1436 break;
1437 }
1438 }
1439 if (my_entryp == NULL)
1440 TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1441#endif /* 0 */
1442}
1443
1444
1445/*
1446 * lio_listio - initiate a list of IO requests. We process the list of
1447 * aiocbs either synchronously (mode == LIO_WAIT) or asynchronously
1448 * (mode == LIO_NOWAIT).
1449 *
1450 * The caller gets error and return status for each aiocb in the list
1451 * via aio_error and aio_return. We must keep completed requests until
1452 * released by the aio_return call.
1453 */
1454int
1455lio_listio(proc_t p, struct lio_listio_args *uap, int *retval )
1456{
1457 int i;
1458 int call_result;
1459 int result;
1460 int old_count;
1461 aio_workq_entry **entryp_listp;
1462 user_addr_t *aiocbpp;
1463 struct user_sigevent aiosigev;
1464 aio_lio_context *lio_context;
1465 boolean_t free_context = FALSE;
1466
1467 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START,
1468 (int)p, uap->nent, uap->mode, 0, 0 );
55e303ae
A
1469
1470 entryp_listp = NULL;
b0d623f7 1471 lio_context = NULL;
91447636 1472 aiocbpp = NULL;
55e303ae
A
1473 call_result = -1;
1474 *retval = -1;
1475 if ( !(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT) ) {
1476 call_result = EINVAL;
1477 goto ExitRoutine;
1478 }
1479
1480 if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
1481 call_result = EINVAL;
1482 goto ExitRoutine;
1483 }
55e303ae
A
1484
1485 /*
b0d623f7
A
1486 * allocate a list of aio_workq_entry pointers that we will use
1487 * to queue up all our requests at once while holding our lock.
55e303ae 1488 */
91447636 1489 MALLOC( entryp_listp, void *, (uap->nent * sizeof(aio_workq_entry *)), M_TEMP, M_WAITOK );
55e303ae
A
1490 if ( entryp_listp == NULL ) {
1491 call_result = EAGAIN;
1492 goto ExitRoutine;
1493 }
b0d623f7
A
1494
1495 MALLOC( lio_context, aio_lio_context*, sizeof(aio_lio_context), M_TEMP, M_WAITOK );
1496 if ( lio_context == NULL ) {
91447636
A
1497 call_result = EAGAIN;
1498 goto ExitRoutine;
1499 }
1500
b0d623f7
A
1501#if DEBUG
1502 OSIncrementAtomic(&lio_contexts_alloced);
1503#endif /* DEBUG */
1504
1505 bzero(lio_context, sizeof(aio_lio_context));
1506
1507 aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent);
1508 if ( aiocbpp == NULL ) {
91447636
A
1509 call_result = EAGAIN;
1510 goto ExitRoutine;
1511 }
b0d623f7
A
1512
1513 /*
1514 * Use sigevent passed in to lio_listio for each of our calls, but
1515 * only do completion notification after the last request completes.
1516 */
1517 bzero(&aiosigev, sizeof(aiosigev));
1518 /* Only copy in an sigev if the user supplied one */
1519 if (uap->sigp != USER_ADDR_NULL) {
1520 call_result = aio_copy_in_sigev(p, uap->sigp, &aiosigev);
1521 if ( call_result)
1522 goto ExitRoutine;
91447636
A
1523 }
1524
55e303ae 1525 /* process list of aio requests */
b0d623f7
A
1526 lio_context->io_issued = uap->nent;
1527 lio_context->io_waiter = uap->mode == LIO_WAIT ? 1 : 0; /* Should it be freed by last AIO */
55e303ae 1528 for ( i = 0; i < uap->nent; i++ ) {
91447636 1529 user_addr_t my_aiocbp;
b0d623f7 1530 aio_workq_entry *entryp;
55e303ae
A
1531
1532 *(entryp_listp + i) = NULL;
91447636 1533 my_aiocbp = *(aiocbpp + i);
55e303ae 1534
55e303ae 1535 /* NULL elements are legal so check for 'em */
b0d623f7
A
1536 if ( my_aiocbp == USER_ADDR_NULL ) {
1537 aio_proc_lock_spin(p);
1538 lio_context->io_issued--;
1539 aio_proc_unlock(p);
55e303ae 1540 continue;
b0d623f7 1541 }
55e303ae 1542
b0d623f7
A
1543 /*
1544 * We use lio_context to mark IO requests for delayed completion
1545 * processing which means we wait until all IO requests in the
1546 * group have completed before we either return to the caller
1547 * when mode is LIO_WAIT or signal user when mode is LIO_NOWAIT.
1548 *
1549 * We use the address of the lio_context for this, since it is
1550 * unique in the address space.
1551 */
1552 result = lio_create_entry( p, my_aiocbp, lio_context, (entryp_listp + i) );
55e303ae
A
1553 if ( result != 0 && call_result == -1 )
1554 call_result = result;
55e303ae
A
1555
1556 /* NULL elements are legal so check for 'em */
1557 entryp = *(entryp_listp + i);
b0d623f7
A
1558 if ( entryp == NULL ) {
1559 aio_proc_lock_spin(p);
1560 lio_context->io_issued--;
1561 aio_proc_unlock(p);
55e303ae 1562 continue;
b0d623f7
A
1563 }
1564
1565 if ( uap->mode == LIO_NOWAIT ) {
1566 /* Set signal hander, if any */
1567 entryp->aiocb.aio_sigevent = aiosigev;
1568 } else {
1569 /* flag that this thread blocks pending completion */
1570 entryp->flags |= AIO_LIO_NOTIFY;
1571 }
55e303ae
A
1572
1573 /* check our aio limits to throttle bad or rude user land behavior */
b0d623f7
A
1574 old_count = aio_increment_total_count();
1575
1576 aio_proc_lock_spin(p);
1577 if ( old_count >= aio_max_requests ||
55e303ae
A
1578 aio_get_process_count( entryp->procp ) >= aio_max_requests_per_process ||
1579 is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
55e303ae 1580
b0d623f7
A
1581 lio_context->io_issued--;
1582 aio_proc_unlock(p);
1583
1584 aio_decrement_total_count();
1585
91447636 1586 if ( call_result == -1 )
b0d623f7
A
1587 call_result = EAGAIN;
1588 aio_free_request(entryp);
1589 entryp_listp[i] = NULL;
55e303ae
A
1590 continue;
1591 }
1592
b0d623f7
A
1593 lck_mtx_convert_spin(aio_proc_mutex(p));
1594 aio_enqueue_work(p, entryp, 1);
1595 aio_proc_unlock(p);
1596
1597 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
1598 (int)p, (int)entryp->uaiocbp, 0, 0, 0 );
91447636 1599 }
55e303ae 1600
b0d623f7
A
1601 switch(uap->mode) {
1602 case LIO_WAIT:
1603 aio_proc_lock_spin(p);
1604 while (lio_context->io_completed < lio_context->io_issued) {
1605 result = msleep(lio_context, aio_proc_mutex(p), PCATCH | PRIBIO | PSPIN, "lio_listio", 0);
55e303ae 1606
b0d623f7
A
1607 /* If we were interrupted, fail out (even if all finished) */
1608 if (result != 0) {
1609 call_result = EINTR;
1610 lio_context->io_waiter = 0;
1611 break;
1612 }
1613 }
1614
1615 /* If all IOs have finished must free it */
1616 if (lio_context->io_completed == lio_context->io_issued) {
1617 free_context = TRUE;
1618 }
55e303ae 1619
b0d623f7
A
1620 aio_proc_unlock(p);
1621 break;
1622
1623 case LIO_NOWAIT:
1624 break;
1625 }
1626
55e303ae
A
1627 /* call_result == -1 means we had no trouble queueing up requests */
1628 if ( call_result == -1 ) {
1629 call_result = 0;
1630 *retval = 0;
1631 }
1632
1633ExitRoutine:
1634 if ( entryp_listp != NULL )
1635 FREE( entryp_listp, M_TEMP );
91447636
A
1636 if ( aiocbpp != NULL )
1637 FREE( aiocbpp, M_TEMP );
b0d623f7
A
1638 if ((lio_context != NULL) && ((lio_context->io_issued == 0) || (free_context == TRUE))) {
1639 free_lio_context(lio_context);
1640 }
1641
55e303ae
A
1642 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END,
1643 (int)p, call_result, 0, 0, 0 );
1644
1645 return( call_result );
1646
1647} /* lio_listio */
1648
1649
1650/*
1651 * aio worker thread. this is where all the real work gets done.
1652 * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
1653 * after new work is queued up.
1654 */
55e303ae
A
1655static void
1656aio_work_thread( void )
1657{
1658 aio_workq_entry *entryp;
b0d623f7
A
1659 int error;
1660 vm_map_t currentmap;
1661 vm_map_t oldmap = VM_MAP_NULL;
1662 task_t oldaiotask = TASK_NULL;
1663 struct uthread *uthreadp = NULL;
55e303ae
A
1664
1665 for( ;; ) {
b0d623f7
A
1666 /*
1667 * returns with the entry ref'ed.
1668 * sleeps until work is available.
1669 */
1670 entryp = aio_get_some_work();
1671
1672 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START,
1673 (int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0 );
1674
1675 /*
1676 * Assume the target's address space identity for the duration
1677 * of the IO. Note: don't need to have the entryp locked,
1678 * because the proc and map don't change until it's freed.
1679 */
1680 currentmap = get_task_map( (current_proc())->task );
1681 if ( currentmap != entryp->aio_map ) {
1682 uthreadp = (struct uthread *) get_bsdthread_info(current_thread());
1683 oldaiotask = uthreadp->uu_aio_task;
1684 uthreadp->uu_aio_task = entryp->procp->task;
1685 oldmap = vm_map_switch( entryp->aio_map );
1686 }
1687
1688 if ( (entryp->flags & AIO_READ) != 0 ) {
1689 error = do_aio_read( entryp );
1690 }
1691 else if ( (entryp->flags & AIO_WRITE) != 0 ) {
1692 error = do_aio_write( entryp );
1693 }
1694 else if ( (entryp->flags & (AIO_FSYNC | AIO_DSYNC)) != 0 ) {
1695 error = do_aio_fsync( entryp );
1696 }
55e303ae 1697 else {
b0d623f7
A
1698 printf( "%s - unknown aio request - flags 0x%02X \n",
1699 __FUNCTION__, entryp->flags );
1700 error = EINVAL;
1701 }
91447636 1702
b0d623f7
A
1703 /* Restore old map */
1704 if ( currentmap != entryp->aio_map ) {
1705 (void) vm_map_switch( oldmap );
1706 uthreadp->uu_aio_task = oldaiotask;
1707 }
55e303ae 1708
b0d623f7
A
1709 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END,
1710 (int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval,
1711 entryp->returnval, 0 );
1712
1713
1714 /* XXX COUNTS */
1715 aio_entry_lock_spin(entryp);
1716 entryp->errorval = error;
1717 aio_entry_unlock(entryp);
1718
1719 /* we're done with the IO request so pop it off the active queue and */
1720 /* push it on the done queue */
1721 aio_proc_lock(entryp->procp);
1722 aio_proc_move_done_locked(entryp->procp, entryp);
1723 aio_proc_unlock(entryp->procp);
1724
1725 OSDecrementAtomic(&aio_anchor.aio_inflight_count);
1726
1727 /* remove our reference to the user land map. */
1728 if ( VM_MAP_NULL != entryp->aio_map ) {
1729 vm_map_t my_map;
1730
1731 my_map = entryp->aio_map;
1732 entryp->aio_map = VM_MAP_NULL;
1733 vm_map_deallocate( my_map );
55e303ae 1734 }
b0d623f7
A
1735
1736 /* Provide notifications */
1737 do_aio_completion( entryp );
1738
1739 /* Will free if needed */
1740 aio_entry_unref(entryp);
1741
55e303ae
A
1742 } /* for ( ;; ) */
1743
1744 /* NOT REACHED */
1745
1746} /* aio_work_thread */
1747
1748
1749/*
1750 * aio_get_some_work - get the next async IO request that is ready to be executed.
1751 * aio_fsync complicates matters a bit since we cannot do the fsync until all async
1752 * IO requests at the time the aio_fsync call came in have completed.
91447636 1753 * NOTE - AIO_LOCK must be held by caller
55e303ae 1754 */
55e303ae
A
1755static aio_workq_entry *
1756aio_get_some_work( void )
1757{
b0d623f7
A
1758 aio_workq_entry *entryp = NULL;
1759 aio_workq_t queue = NULL;
1760
1761 /* Just one queue for the moment. In the future there will be many. */
1762 queue = &aio_anchor.aio_async_workqs[0];
1763 aio_workq_lock_spin(queue);
1764 if (queue->aioq_count == 0) {
1765 goto nowork;
1766 }
1767
1768 /*
1769 * Hold the queue lock.
1770 *
1771 * pop some work off the work queue and add to our active queue
1772 * Always start with the queue lock held.
1773 */
1774 for(;;) {
1775 /*
1776 * Pull of of work queue. Once it's off, it can't be cancelled,
1777 * so we can take our ref once we drop the queue lock.
1778 */
1779 entryp = TAILQ_FIRST(&queue->aioq_entries);
55e303ae 1780
b0d623f7
A
1781 /*
1782 * If there's no work or only fsyncs that need delay, go to sleep
1783 * and then start anew from aio_work_thread
1784 */
1785 if (entryp == NULL) {
1786 goto nowork;
1787 }
1788
1789 aio_workq_remove_entry_locked(queue, entryp);
1790
1791 aio_workq_unlock(queue);
1792
1793 /*
1794 * Check if it's an fsync that must be delayed. No need to lock the entry;
1795 * that flag would have been set at initialization.
1796 */
55e303ae 1797 if ( (entryp->flags & AIO_FSYNC) != 0 ) {
b0d623f7
A
1798 /*
1799 * Check for unfinished operations on the same file
1800 * in this proc's queue.
1801 */
1802 aio_proc_lock_spin(entryp->procp);
55e303ae 1803 if ( aio_delay_fsync_request( entryp ) ) {
b0d623f7 1804 /* It needs to be delayed. Put it back on the end of the work queue */
55e303ae
A
1805 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) | DBG_FUNC_NONE,
1806 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
b0d623f7
A
1807
1808 aio_proc_unlock(entryp->procp);
1809
1810 aio_workq_lock_spin(queue);
1811 aio_workq_add_entry_locked(queue, entryp);
55e303ae 1812 continue;
b0d623f7
A
1813 }
1814 aio_proc_unlock(entryp->procp);
55e303ae 1815 }
b0d623f7 1816
55e303ae
A
1817 break;
1818 }
b0d623f7
A
1819
1820 aio_entry_ref(entryp);
1821
1822 OSIncrementAtomic(&aio_anchor.aio_inflight_count);
55e303ae 1823 return( entryp );
55e303ae 1824
b0d623f7
A
1825nowork:
1826 /* We will wake up when someone enqueues something */
1827 wait_queue_assert_wait(queue->aioq_waitq, queue, THREAD_UNINT, 0);
1828 aio_workq_unlock(queue);
1829 thread_block( (thread_continue_t)aio_work_thread );
1830
1831 // notreached
1832 return NULL;
1833}
55e303ae
A
1834
1835/*
b0d623f7
A
1836 * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed.
1837 * A big, simple hammer: only send it off if it's the most recently filed IO which has
1838 * not been completed.
55e303ae
A
1839 */
1840static boolean_t
1841aio_delay_fsync_request( aio_workq_entry *entryp )
1842{
b0d623f7
A
1843 if (entryp == TAILQ_FIRST(&entryp->procp->p_aio_activeq)) {
1844 return FALSE;
55e303ae
A
1845 }
1846
b0d623f7 1847 return TRUE;
55e303ae
A
1848} /* aio_delay_fsync_request */
1849
b0d623f7
A
1850static aio_workq_entry *
1851aio_create_queue_entry(proc_t procp, user_addr_t aiocbp, void *group_tag, int kindOfIO)
55e303ae 1852{
b0d623f7
A
1853 aio_workq_entry *entryp;
1854 int result = 0;
55e303ae
A
1855
1856 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1857 if ( entryp == NULL ) {
1858 result = EAGAIN;
1859 goto error_exit;
1860 }
91447636 1861
55e303ae
A
1862 bzero( entryp, sizeof(*entryp) );
1863
1864 /* fill in the rest of the aio_workq_entry */
1865 entryp->procp = procp;
1866 entryp->uaiocbp = aiocbp;
b0d623f7 1867 entryp->flags |= kindOfIO;
55e303ae
A
1868 entryp->group_tag = group_tag;
1869 entryp->aio_map = VM_MAP_NULL;
b0d623f7 1870 entryp->aio_refcount = 0;
91447636 1871
b0d623f7
A
1872 if ( proc_is64bit(procp) ) {
1873 struct user64_aiocb aiocb64;
1874
1875 result = copyin( aiocbp, &aiocb64, sizeof(aiocb64) );
1876 if (result == 0 )
1877 do_munge_aiocb_user64_to_user(&aiocb64, &entryp->aiocb);
1878
1879 } else {
1880 struct user32_aiocb aiocb32;
1881
91447636
A
1882 result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
1883 if ( result == 0 )
b0d623f7 1884 do_munge_aiocb_user32_to_user( &aiocb32, &entryp->aiocb );
55e303ae
A
1885 }
1886
b0d623f7
A
1887 if ( result != 0 ) {
1888 result = EAGAIN;
55e303ae 1889 goto error_exit;
b0d623f7 1890 }
55e303ae
A
1891
1892 /* get a reference to the user land map in order to keep it around */
1893 entryp->aio_map = get_task_map( procp->task );
1894 vm_map_reference( entryp->aio_map );
b0d623f7
A
1895
1896 /* do some more validation on the aiocb and embedded file descriptor */
1897 result = aio_validate( entryp );
39236c6e
A
1898 if ( result != 0 )
1899 goto error_exit_with_ref;
1900
1901 /* get a reference on the current_thread, which is passed in vfs_context. */
1902 entryp->thread = current_thread();
1903 thread_reference( entryp->thread );
1904 return ( entryp );
b0d623f7 1905
39236c6e
A
1906error_exit_with_ref:
1907 if ( VM_MAP_NULL != entryp->aio_map ) {
1908 vm_map_deallocate( entryp->aio_map );
1909 }
55e303ae 1910error_exit:
b0d623f7 1911 if ( result && entryp != NULL ) {
91447636 1912 zfree( aio_workq_zonep, entryp );
b0d623f7
A
1913 entryp = NULL;
1914 }
1915
1916 return ( entryp );
1917}
55e303ae
A
1918
1919
/*
 * aio_queue_async_request - queue up an async IO request on our work queue then
 * wake up one of our worker threads to do the actual work.  We get a reference
 * to our caller's user land map in order to keep it around while we are
 * processing the request.
 *
 * Returns 0 on success, or EAGAIN when a limit is hit (system-wide max,
 * per-process max, allocation failure, or aiocb already queued).
 */
static int
aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO )
{
	aio_workq_entry	*entryp;
	int		result;
	int		old_count;

	/*
	 * Reserve a slot against the system-wide limit up front; every error
	 * path below funnels through error_noalloc to give it back.
	 */
	old_count = aio_increment_total_count();
	if (old_count >= aio_max_requests) {
		result = EAGAIN;
		goto error_noalloc;
	}

	entryp = aio_create_queue_entry( procp, aiocbp, 0, kindOfIO);
	if ( entryp == NULL ) {
		result = EAGAIN;
		goto error_noalloc;
	}


	aio_proc_lock_spin(procp);

	/* the same aiocb must not be in flight twice for this process */
	if ( is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
		result = EAGAIN;
		goto error_exit;
	}

	/* check our aio limits to throttle bad or rude user land behavior */
	if (aio_get_process_count( procp ) >= aio_max_requests_per_process) {
		printf("aio_queue_async_request(): too many in flight for proc: %d.\n", procp->p_aio_total_count);
		result = EAGAIN;
		goto error_exit;
	}

	/* Add the IO to proc and work queues, wake up threads as appropriate */
	lck_mtx_convert_spin(aio_proc_mutex(procp));	/* enqueue/wakeup needs a full mutex, not spin */
	aio_enqueue_work(procp, entryp, 1);

	aio_proc_unlock(procp);

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
		      (int)procp, (int)aiocbp, 0, 0, 0 );

	return( 0 );

error_exit:
	/*
	 * This entry has not been queued up so no worries about
	 * unlocked state and aio_map
	 */
	aio_proc_unlock(procp);
	aio_free_request(entryp);

error_noalloc:
	aio_decrement_total_count();

	return( result );

}  /* aio_queue_async_request */
91447636 1985
b0d623f7
A
1986
1987/*
1988 * lio_create_entry
1989 *
1990 * Allocate an aio_workq_entry and fill it in. If all goes well return 0
1991 * and pass the aio_workq_entry pointer back to our caller.
1992 *
1993 * Parameters: procp The process makign the request
1994 * aiocbp The aio context buffer pointer
1995 * group_tag The group tag used to indicate a
1996 * group of operations has completed
1997 * entrypp Pointer to the pointer to receive the
1998 * address of the created aio_workq_entry
1999 *
2000 * Returns: 0 Successfully created
2001 * EAGAIN Try again (usually resource shortage)
2002 *
2003 *
2004 * Notes: We get a reference to our caller's user land map in order
2005 * to keep it around while we are processing the request.
2006 *
2007 * lio_listio calls behave differently at completion they do
2008 * completion notification when all async IO requests have
2009 * completed. We use group_tag to tag IO requests that behave
2010 * in the delay notification manner.
2011 *
2012 * All synchronous operations are considered to not have a
2013 * signal routine associated with them (sigp == USER_ADDR_NULL).
2014 */
2015static int
2016lio_create_entry(proc_t procp, user_addr_t aiocbp, void *group_tag,
2017 aio_workq_entry **entrypp )
2018{
2019 aio_workq_entry *entryp;
2020 int result;
2021
2022 entryp = aio_create_queue_entry( procp, aiocbp, group_tag, AIO_LIO);
2023 if ( entryp == NULL ) {
2024 result = EAGAIN;
55e303ae
A
2025 goto error_exit;
2026 }
2027
b0d623f7
A
2028 /*
2029 * Look for lio_listio LIO_NOP requests and ignore them; this is
2030 * not really an error, but we need to free our aio_workq_entry.
2031 */
55e303ae
A
2032 if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
2033 result = 0;
2034 goto error_exit;
2035 }
2036
55e303ae
A
2037 *entrypp = entryp;
2038 return( 0 );
2039
2040error_exit:
b0d623f7
A
2041
2042 if ( entryp != NULL ) {
2043 /*
2044 * This entry has not been queued up so no worries about
2045 * unlocked state and aio_map
2046 */
2047 aio_free_request(entryp);
2048 }
55e303ae
A
2049
2050 return( result );
2051
b0d623f7 2052} /* lio_create_entry */
55e303ae
A
2053
2054
2055/*
2056 * aio_free_request - remove our reference on the user land map and
b0d623f7
A
2057 * free the work queue entry resources. The entry is off all lists
2058 * and has zero refcount, so no one can have a pointer to it.
55e303ae
A
2059 */
2060
2061static int
b0d623f7 2062aio_free_request(aio_workq_entry *entryp)
55e303ae
A
2063{
2064 /* remove our reference to the user land map. */
b0d623f7
A
2065 if ( VM_MAP_NULL != entryp->aio_map) {
2066 vm_map_deallocate(entryp->aio_map);
55e303ae 2067 }
b0d623f7 2068
39236c6e
A
2069 /* remove our reference to thread which enqueued the request */
2070 if ( NULL != entryp->thread ) {
2071 thread_deallocate( entryp->thread );
2072 }
2073
b0d623f7
A
2074 entryp->aio_refcount = -1; /* A bit of poisoning in case of bad refcounting. */
2075
91447636 2076 zfree( aio_workq_zonep, entryp );
55e303ae
A
2077
2078 return( 0 );
2079
2080} /* aio_free_request */
2081
2082
b0d623f7
A
/*
 * aio_validate
 *
 * validate the aiocb passed in by one of the aio syscalls.
 *
 * Returns:	0	aiocb and its file descriptor look usable
 *		EINVAL	bad lio opcode, IO parameters, or sigevent
 *		EBADF	fd invalid or lacks the needed read/write access
 *		ESPIPE	fd does not refer to a vnode-backed file
 *
 * Side effect: for AIO_LIO entries, translates aio_lio_opcode into
 * the AIO_READ/AIO_WRITE flag bits used by the rest of this file.
 */
static int
aio_validate( aio_workq_entry *entryp )
{
	struct fileproc		*fp;
	int			flag;
	int			result;

	result = 0;

	if ( (entryp->flags & AIO_LIO) != 0 ) {
		if ( entryp->aiocb.aio_lio_opcode == LIO_READ )
			entryp->flags |= AIO_READ;
		else if ( entryp->aiocb.aio_lio_opcode == LIO_WRITE )
			entryp->flags |= AIO_WRITE;
		else if ( entryp->aiocb.aio_lio_opcode == LIO_NOP )
			return( 0 );
		else
			return( EINVAL );
	}

	/* fsync/dsync, like writes, require the fd be open for writing */
	flag = FREAD;
	if ( (entryp->flags & (AIO_WRITE | AIO_FSYNC | AIO_DSYNC)) != 0 ) {
		flag = FWRITE;
	}

	if ( (entryp->flags & (AIO_READ | AIO_WRITE)) != 0 ) {
		if ( entryp->aiocb.aio_nbytes > INT_MAX ||
		     entryp->aiocb.aio_buf == USER_ADDR_NULL ||
		     entryp->aiocb.aio_offset < 0 )
			return( EINVAL );
	}

	/*
	 * validate aiocb.aio_sigevent.  at this point we only support
	 * sigev_notify equal to SIGEV_SIGNAL or SIGEV_NONE.  this means
	 * sigev_value, sigev_notify_function, and sigev_notify_attributes
	 * are ignored, since SIGEV_THREAD is unsupported.  This is consistent
	 * with no [RTS] (RealTime Signal) option group support.
	 */
	switch ( entryp->aiocb.aio_sigevent.sigev_notify ) {
	case SIGEV_SIGNAL:
	    {
		int		signum;

		/* make sure we have a valid signal number */
		signum = entryp->aiocb.aio_sigevent.sigev_signo;
		if ( signum <= 0 || signum >= NSIG ||
		     signum == SIGKILL || signum == SIGSTOP )
			return (EINVAL);
	    }
	    break;

	case SIGEV_NONE:
		break;

	case SIGEV_THREAD:
		/* Unsupported [RTS] */
		/* deliberate fallthrough: treated the same as any bad value */

	default:
		return (EINVAL);
	}

	/* validate the file descriptor and that the file was opened
	 * for the appropriate read / write access.
	 */
	proc_fdlock(entryp->procp);

	result = fp_lookup( entryp->procp, entryp->aiocb.aio_fildes, &fp , 1);
	if ( result == 0 ) {
		if ( (fp->f_fglob->fg_flag & flag) == 0 ) {
			/* we don't have read or write access */
			result = EBADF;
		}
		else if ( FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_VNODE ) {
			/* this is not a file */
			result = ESPIPE;
		} else
			fp->f_flags |= FP_AIOISSUED;	/* mark the fp as having had AIO issued against it */

		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp , 1);
	}
	else {
		result = EBADF;
	}

	proc_fdunlock(entryp->procp);

	return( result );

} /* aio_validate */
2178
b0d623f7
A
2179static int
2180aio_increment_total_count()
2181{
2182 return OSIncrementAtomic(&aio_anchor.aio_total_count);
2183}
2184
2185static int
2186aio_decrement_total_count()
2187{
2188 int old = OSDecrementAtomic(&aio_anchor.aio_total_count);
2189 if (old <= 0) {
2190 panic("Negative total AIO count!\n");
2191 }
55e303ae 2192
b0d623f7
A
2193 return old;
2194}
55e303ae
A
2195
/*
 * aio_get_process_count - return the total number of async IO requests
 * currently attributed to the given process (reads p_aio_total_count).
 */
static int
aio_get_process_count(proc_t procp )
{
	return procp->p_aio_total_count;

} /* aio_get_process_count */
2202
55e303ae
A
/*
 * aio_get_all_queues_count - return the system-wide total of outstanding
 * async IO requests (reads aio_anchor.aio_total_count; no lock taken here).
 */
static int
aio_get_all_queues_count( void )
{
	return aio_anchor.aio_total_count;

} /* aio_get_all_queues_count */
2209
2210
/*
 * do_aio_completion.  Handle async IO completion.
 *
 * Runs after an entry's IO has finished and the entry has been moved off
 * the proc's active queue.  Responsibilities, in order:
 *   1. account the completion against any lio_listio() group context and
 *      wake a LIO_WAIT waiter when the whole group is done;
 *   2. deliver the SIGEV_SIGNAL completion signal if requested;
 *   3. wake threads blocked in exit/exec (AIO_EXIT_WAIT) or close
 *      (AIO_CLOSE_WAIT) once no relevant requests remain active;
 *   4. wake any aio_suspend() sleepers;
 *   5. free the lio context if this was the last IO and nobody waits on it.
 */
static void
do_aio_completion( aio_workq_entry *entryp )
{

	boolean_t		lastLioCompleted = FALSE;
	aio_lio_context	*lio_context = NULL;
	int waiter = 0;

	lio_context = (aio_lio_context *)entryp->group_tag;

	if (lio_context != NULL) {

		aio_proc_lock_spin(entryp->procp);

		/* Account for this I/O completing. */
		lio_context->io_completed++;

		/* Are we done with this lio context? */
		if (lio_context->io_issued == lio_context->io_completed) {
			lastLioCompleted = TRUE;
		}

		waiter = lio_context->io_waiter;

		/* explicit wakeup of lio_listio() waiting in LIO_WAIT */
		if ((entryp->flags & AIO_LIO_NOTIFY) && (lastLioCompleted) && (waiter != 0)) {
			/* wake up the waiter */
			wakeup(lio_context);
		}

		aio_proc_unlock(entryp->procp);
	}

	if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
		 (entryp->flags & AIO_DISABLE) == 0 ) {

		boolean_t	performSignal = FALSE;
		if (lio_context == NULL) {
			/* standalone request: always signal */
			performSignal = TRUE;
		}
		else {
			/*
			 * If this was the last request in the group and a signal
			 * is desired, send one.
			 */
			performSignal = lastLioCompleted;
		}

		if (performSignal) {

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) | DBG_FUNC_NONE,
				 (int)entryp->procp, (int)entryp->uaiocbp,
				 entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 );

			psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );
		}
	}

	if ((entryp->flags & AIO_EXIT_WAIT) && (entryp->flags & AIO_CLOSE_WAIT)) {
		panic("Close and exit flags set at the same time\n");
	}

	/*
	 * need to handle case where a process is trying to exit, exec, or
	 * close and is currently waiting for active aio requests to complete.
	 * If AIO_CLEANUP_WAIT is set then we need to look to see if there are any
	 * other requests in the active queue for this process.  If there are
	 * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel.
	 * If there are some still active then do nothing - we only want to
	 * wakeup when all active aio requests for the process are complete.
	 *
	 * Don't need to lock the entry or proc to check the cleanup flag.  It can only be
	 * set for cancellation, while the entryp is still on a proc list; now it's
	 * off, so that flag is already set if it's going to be.
	 */
	if ( (entryp->flags & AIO_EXIT_WAIT) != 0 ) {
		int		active_requests;

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
			      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

		aio_proc_lock_spin(entryp->procp);
		active_requests = aio_active_requests_for_process( entryp->procp );
		if ( active_requests < 1 ) {
			/*
			 * no active aio requests for this process, continue exiting.  In this
			 * case, there should be no one else waiting on the proc in AIO...
			 */
			wakeup_one((caddr_t)&entryp->procp->AIO_CLEANUP_SLEEP_CHAN);
			aio_proc_unlock(entryp->procp);

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
				      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
		} else {
			aio_proc_unlock(entryp->procp);
		}
	}

	if ( (entryp->flags & AIO_CLOSE_WAIT) != 0 ) {
		int		active_requests;

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
			      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

		aio_proc_lock_spin(entryp->procp);
		active_requests = aio_proc_active_requests_for_file( entryp->procp, entryp->aiocb.aio_fildes);
		if ( active_requests < 1 ) {
			/* Can't wakeup_one(); multiple closes might be in progress. */
			wakeup(&entryp->procp->AIO_CLEANUP_SLEEP_CHAN);
			aio_proc_unlock(entryp->procp);

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
				      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
		} else {
			aio_proc_unlock(entryp->procp);
		}
	}
	/*
	 * A thread in aio_suspend() wants to know about completed IOs.  If it checked
	 * the done list before we moved our AIO there, then it already asserted its wait,
	 * and we can wake it up without holding the lock.  If it checked the list after
	 * we did our move, then it already has seen the AIO that we moved.  Hence, we
	 * can do our wakeup without holding the lock.
	 */
	wakeup( (caddr_t) &entryp->procp->AIO_SUSPEND_SLEEP_CHAN );
	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE,
		      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

	/*
	 * free the LIO context if the last lio completed and no thread is
	 * waiting
	 */
	if (lastLioCompleted && (waiter == 0))
		free_lio_context (lio_context);


} /* do_aio_completion */
55e303ae
A
2351
2352
/*
 * do_aio_read - perform the read for an AIO_READ work queue entry.
 *
 * Looks up the entry's file descriptor in its owning process, verifies
 * the file is open for reading, then issues a positioned read via
 * dofileread().  The byte count (or -1) lands in entryp->returnval;
 * the function's return is the errno-style status.
 */
static int
do_aio_read( aio_workq_entry *entryp )
{
	struct fileproc		*fp;
	int			error;
	struct vfs_context	context;

	if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp , 0)) )
		return(error);
	if ( (fp->f_fglob->fg_flag & FREAD) == 0 ) {
		/* fd not open for reading: drop our fp reference and bail */
		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
		return(EBADF);
	}

	/* issue the IO in the context of the thread that queued the request */
	context.vc_thread = entryp->thread;	/* XXX */
	context.vc_ucred = fp->f_fglob->fg_cred;

	/* FOF_OFFSET: read at aio_offset without moving the fd's file position */
	error = dofileread(&context, fp,
			   entryp->aiocb.aio_buf,
			   entryp->aiocb.aio_nbytes,
			   entryp->aiocb.aio_offset, FOF_OFFSET,
			   &entryp->returnval);
	fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);

	return( error );

} /* do_aio_read */
2383
2384
/*
 * do_aio_write - perform the write for an AIO_WRITE work queue entry.
 *
 * Looks up the entry's file descriptor, verifies write access, then
 * issues the write via dofilewrite().  For O_APPEND files the supplied
 * aio_offset is ignored (FOF_OFFSET not set) so the write appends.
 * The byte count (or -1) lands in entryp->returnval; the function's
 * return is the errno-style status.
 */
static int
do_aio_write( aio_workq_entry *entryp )
{
	struct fileproc		*fp;
	int			error, flags;
	struct vfs_context	context;

	if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp , 0)) )
		return(error);
	if ( (fp->f_fglob->fg_flag & FWRITE) == 0 ) {
		/* fd not open for writing: drop our fp reference and bail */
		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
		return(EBADF);
	}

	flags = FOF_PCRED;
	if ( (fp->f_fglob->fg_flag & O_APPEND) == 0 ) {
		flags |= FOF_OFFSET;
	}

	/* issue the IO in the context of the thread that queued the request */
	context.vc_thread = entryp->thread;	/* XXX */
	context.vc_ucred = fp->f_fglob->fg_cred;

	/* NB: tell dofilewrite the offset, and to use the proc cred */
	error = dofilewrite(&context,
			    fp,
			    entryp->aiocb.aio_buf,
			    entryp->aiocb.aio_nbytes,
			    entryp->aiocb.aio_offset,
			    flags,
			    &entryp->returnval);

	/* fp_drop_written additionally records that data was written through this fd */
	if (entryp->returnval)
		fp_drop_written(entryp->procp, entryp->aiocb.aio_fildes, fp);
	else
		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);

	return( error );

} /* do_aio_write */
2427
2428
/*
 * aio_active_requests_for_process - return number of active async IO
 * requests for the given process (reads p_aio_active_count).
 */
static int
aio_active_requests_for_process(proc_t procp )
{
	return( procp->p_aio_active_count );

} /* aio_active_requests_for_process */
2439
2440/*
2441 * Called with the proc locked.
2442 */
2443static int
2444aio_proc_active_requests_for_file(proc_t procp, int fd)
2445{
2446 int count = 0;
2447 aio_workq_entry *entryp;
2448 TAILQ_FOREACH(entryp, &procp->p_aio_activeq, aio_proc_link) {
2449 if (entryp->aiocb.aio_fildes == fd) {
2450 count++;
2451 }
2452 }
55e303ae 2453
b0d623f7 2454 return count;
55e303ae
A
2455} /* aio_active_requests_for_process */
2456
2457
b0d623f7 2458
55e303ae
A
/*
 * do_aio_fsync - perform the sync for an AIO_FSYNC or AIO_DSYNC work
 * queue entry by calling VNOP_FSYNC on the descriptor's vnode.
 * Sets entryp->returnval to -1 on any failure; returns errno-style status.
 */
static int
do_aio_fsync( aio_workq_entry *entryp )
{
	struct vfs_context 	context;
	struct vnode 		*vp;
	struct fileproc		*fp;
	int			sync_flag;
	int			error;

	/*
	 * We are never called unless either AIO_FSYNC or AIO_DSYNC are set.
	 *
	 * If AIO_DSYNC is set, we can tell the lower layers that it is OK
	 * to mark for update the metadata not strictly necessary for data
	 * retrieval, rather than forcing it to disk.
	 *
	 * If AIO_FSYNC is set, we have to also wait for metadata not really
	 * necessary to data retrieval are committed to stable storage (e.g.
	 * atime, mtime, ctime, etc.).
	 *
	 * Metadata necessary for data retrieval must be committed to stable
	 * storage in either case (file length, etc.).
	 */
	if (entryp->flags & AIO_FSYNC)
		sync_flag = MNT_WAIT;
	else
		sync_flag = MNT_DWAIT;

	error = fp_getfvp( entryp->procp, entryp->aiocb.aio_fildes, &fp, &vp);
	if ( error == 0 ) {
		/* take an iocount on the vnode before using it */
		if ( (error = vnode_getwithref(vp)) ) {
			fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
			entryp->returnval = -1;
			return(error);
		}
		context.vc_thread = current_thread();
		context.vc_ucred = fp->f_fglob->fg_cred;

		error = VNOP_FSYNC( vp, sync_flag, &context);

		/* release the iocount, then our fp reference */
		(void)vnode_put(vp);

		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
	}
	if ( error != 0 )
		entryp->returnval = -1;

	return( error );

} /* do_aio_fsync */
2512
2513
2514/*
2515 * is_already_queued - runs through our queues to see if the given
2516 * aiocbp / process is there. Returns TRUE if there is a match
2517 * on any of our aio queues.
b0d623f7
A
2518 *
2519 * Called with proc aio lock held (can be held spin)
55e303ae 2520 */
55e303ae 2521static boolean_t
2d21ac55 2522is_already_queued(proc_t procp,
91447636 2523 user_addr_t aiocbp )
55e303ae
A
2524{
2525 aio_workq_entry *entryp;
2526 boolean_t result;
b0d623f7 2527
55e303ae
A
2528 result = FALSE;
2529
2530 /* look for matches on our queue of async IO requests that have completed */
b0d623f7 2531 TAILQ_FOREACH( entryp, &procp->p_aio_doneq, aio_proc_link ) {
55e303ae
A
2532 if ( aiocbp == entryp->uaiocbp ) {
2533 result = TRUE;
2534 goto ExitThisRoutine;
2535 }
2536 }
2537
2538 /* look for matches on our queue of active async IO requests */
b0d623f7 2539 TAILQ_FOREACH( entryp, &procp->p_aio_activeq, aio_proc_link ) {
55e303ae
A
2540 if ( aiocbp == entryp->uaiocbp ) {
2541 result = TRUE;
2542 goto ExitThisRoutine;
2543 }
2544 }
2545
55e303ae
A
2546ExitThisRoutine:
2547 return( result );
2548
2549} /* is_already_queued */
2550
2551
b0d623f7
A
/*
 * free_lio_context - release the storage backing a lio_listio() group
 * context.  Callers ensure no in-flight IO still references it.
 */
static void
free_lio_context(aio_lio_context* context)
{

#if DEBUG
	/* keep the debug-only allocation counter balanced */
	OSDecrementAtomic(&lio_contexts_alloced);
#endif /* DEBUG */

	FREE( context, M_TEMP );

} /* free_lio_context */
2563
2564
55e303ae
A
2565/*
2566 * aio initialization
2567 */
2568__private_extern__ void
2569aio_init( void )
2570{
2571 int i;
2572
91447636 2573 aio_lock_grp_attr = lck_grp_attr_alloc_init();
b0d623f7
A
2574 aio_proc_lock_grp = lck_grp_alloc_init("aio_proc", aio_lock_grp_attr);;
2575 aio_entry_lock_grp = lck_grp_alloc_init("aio_entry", aio_lock_grp_attr);;
2576 aio_queue_lock_grp = lck_grp_alloc_init("aio_queue", aio_lock_grp_attr);;
91447636 2577 aio_lock_attr = lck_attr_alloc_init();
91447636 2578
b0d623f7
A
2579 lck_mtx_init(&aio_entry_mtx, aio_entry_lock_grp, aio_lock_attr);
2580 lck_mtx_init(&aio_proc_mtx, aio_proc_lock_grp, aio_lock_attr);
55e303ae 2581
b0d623f7 2582 aio_anchor.aio_inflight_count = 0;
55e303ae 2583 aio_anchor.aio_done_count = 0;
b0d623f7
A
2584 aio_anchor.aio_total_count = 0;
2585 aio_anchor.aio_num_workqs = AIO_NUM_WORK_QUEUES;
2586
2587 for (i = 0; i < AIO_NUM_WORK_QUEUES; i++) {
2588 aio_workq_init(&aio_anchor.aio_async_workqs[i]);
2589 }
2590
55e303ae
A
2591
2592 i = sizeof( aio_workq_entry );
2593 aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );
2594
2595 _aio_create_worker_threads( aio_worker_threads );
55e303ae
A
2596
2597} /* aio_init */
2598
2599
2600/*
2601 * aio worker threads created here.
2602 */
2603__private_extern__ void
2604_aio_create_worker_threads( int num )
2605{
2606 int i;
2607
2608 /* create some worker threads to handle the async IO requests */
2609 for ( i = 0; i < num; i++ ) {
2610 thread_t myThread;
2611
b0d623f7 2612 if ( KERN_SUCCESS != kernel_thread_start((thread_continue_t)aio_work_thread, NULL, &myThread) ) {
55e303ae
A
2613 printf( "%s - failed to create a work thread \n", __FUNCTION__ );
2614 }
b0d623f7
A
2615 else
2616 thread_deallocate(myThread);
55e303ae
A
2617 }
2618
2619 return;
2620
2621} /* _aio_create_worker_threads */
2622
2623/*
2624 * Return the current activation utask
2625 */
2626task_t
2627get_aiotask(void)
2628{
91447636
A
2629 return ((struct uthread *)get_bsdthread_info(current_thread()))->uu_aio_task;
2630}
2631
2632
/*
 * In the case of an aiocb from a
 * 32-bit process we need to expand some longs and pointers to the correct
 * sizes in order to let downstream code always work on the same type of
 * aiocb (in our case that is a user_aiocb).
 * Straight field-by-field copy; user pointers are widened with
 * CAST_USER_ADDR_T.
 */
static void
do_munge_aiocb_user32_to_user( struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
{
	the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
	the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
	the_user_aiocbp->aio_buf = CAST_USER_ADDR_T(my_aiocbp->aio_buf);
	the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
	the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
	the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;

	/* special case here.  since we do not know if sigev_value is an */
	/* int or a ptr we do NOT cast the ptr to a user_addr_t.   This  */
	/* means if we send this info back to user space we need to remember */
	/* sigev_value was not expanded for the 32-bit case.  */
	/* NOTE - this does NOT affect us since we don't support sigev_value */
	/* yet in the aio context.  */
	//LP64
	the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
	the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
	the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
		my_aiocbp->aio_sigevent.sigev_value.sival_int;
	the_user_aiocbp->aio_sigevent.sigev_notify_function =
		CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function);
	the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
		CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_attributes);
}
b0d623f7
A
2665
/* Similar for 64-bit user process, so that we don't need to satisfy
 * the alignment constraints of the original user64_aiocb.
 * Fields are already the right width, so this is a plain copy with
 * no pointer widening.
 */
static void
do_munge_aiocb_user64_to_user( struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
{
	the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
	the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
	the_user_aiocbp->aio_buf = my_aiocbp->aio_buf;
	the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
	the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
	the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;

	the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
	the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
	the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
		my_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int;
	the_user_aiocbp->aio_sigevent.sigev_notify_function =
		my_aiocbp->aio_sigevent.sigev_notify_function;
	the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
		my_aiocbp->aio_sigevent.sigev_notify_attributes;
}