apple/xnu xnu-6153.41.3 - bsd/kern/kern_aio.c
1 /*
2 * Copyright (c) 2003-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29
30 /*
31 * todo:
32 * 1) ramesh is looking into how to replace taking a reference on
33  *    the user's map (vm_map_reference()), since it is believed that
34  *    it would not hold the process for us.
35 * 2) david is looking into a way for us to set the priority of the
36 * worker threads to match that of the user's thread when the
37 * async IO was queued.
38 */
39
40
41 /*
42 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
43 */
44
45 #include <sys/systm.h>
46 #include <sys/fcntl.h>
47 #include <sys/file_internal.h>
48 #include <sys/filedesc.h>
49 #include <sys/kernel.h>
50 #include <sys/vnode_internal.h>
51 #include <sys/malloc.h>
52 #include <sys/mount_internal.h>
53 #include <sys/param.h>
54 #include <sys/proc_internal.h>
55 #include <sys/sysctl.h>
56 #include <sys/unistd.h>
57 #include <sys/user.h>
58
59 #include <sys/aio_kern.h>
60 #include <sys/sysproto.h>
61
62 #include <machine/limits.h>
63
64 #include <mach/mach_types.h>
65 #include <kern/kern_types.h>
66 #include <kern/waitq.h>
67 #include <kern/zalloc.h>
68 #include <kern/task.h>
69 #include <kern/sched_prim.h>
70
71 #include <vm/vm_map.h>
72
73 #include <libkern/OSAtomic.h>
74
75 #include <sys/kdebug.h>
76 #define AIO_work_queued 1
77 #define AIO_worker_wake 2
78 #define AIO_completion_sig 3
79 #define AIO_completion_cleanup_wait 4
80 #define AIO_completion_cleanup_wake 5
81 #define AIO_completion_suspend_wake 6
82 #define AIO_fsync_delay 7
83 #define AIO_cancel 10
84 #define AIO_cancel_async_workq 11
85 #define AIO_cancel_sync_workq 12
86 #define AIO_cancel_activeq 13
87 #define AIO_cancel_doneq 14
88 #define AIO_fsync 20
89 #define AIO_read 30
90 #define AIO_write 40
91 #define AIO_listio 50
92 #define AIO_error 60
93 #define AIO_error_val 61
94 #define AIO_error_activeq 62
95 #define AIO_error_workq 63
96 #define AIO_return 70
97 #define AIO_return_val 71
98 #define AIO_return_activeq 72
99 #define AIO_return_workq 73
100 #define AIO_exec 80
101 #define AIO_exit 90
102 #define AIO_exit_sleep 91
103 #define AIO_close 100
104 #define AIO_close_sleep 101
105 #define AIO_suspend 110
106 #define AIO_suspend_sleep 111
107 #define AIO_worker_thread 120
108
109 #if 0
110 #undef KERNEL_DEBUG
111 #define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
112 #endif
113
114 /*
115 * aio requests queue up on the aio_async_workq or lio_sync_workq (for
116 * lio_listio LIO_WAIT). Requests then move to the per process aio_activeq
117  * (proc.aio_activeq) when one of our worker threads starts the IO.
118 * And finally, requests move to the per process aio_doneq (proc.aio_doneq)
119 * when the IO request completes. The request remains on aio_doneq until
120  * the user process calls aio_return or the process exits; either way, that is our
121 * trigger to release aio resources.
122 */
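/*
 * Illustrative user-space sketch (standard <aio.h> usage, not code from this
 * implementation): a single request walks the queues described above.
 * aio_read() places the entry on the async work queue, a worker thread moves
 * it to the per-process active queue while the IO runs, and aio_return()
 * reaps it from the done queue, which is what releases the kernel resources.
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	static ssize_t
 *	read_async(int fd, void *buf, size_t len, off_t off)
 *	{
 *		struct aiocb cb;
 *
 *		memset(&cb, 0, sizeof(cb));
 *		cb.aio_fildes = fd;
 *		cb.aio_buf    = buf;
 *		cb.aio_nbytes = len;
 *		cb.aio_offset = off;
 *
 *		if (aio_read(&cb) != 0)
 *			return -1;
 *		while (aio_error(&cb) == EINPROGRESS)
 *			usleep(1000);
 *		return aio_return(&cb);
 *	}
 */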
123 typedef struct aio_workq {
124 TAILQ_HEAD(, aio_workq_entry) aioq_entries;
125 int aioq_count;
126 lck_mtx_t aioq_mtx;
127 struct waitq aioq_waitq;
128 } *aio_workq_t;
129
130 #define AIO_NUM_WORK_QUEUES 1
131 struct aio_anchor_cb {
132 volatile int32_t aio_inflight_count; /* entries that have been taken from a workq */
133 volatile int32_t aio_done_count; /* entries on all done queues (proc.aio_doneq) */
134 volatile int32_t aio_total_count; /* total extant entries */
135
136 /* Hash table of queues here */
137 int aio_num_workqs;
138 struct aio_workq aio_async_workqs[AIO_NUM_WORK_QUEUES];
139 };
140 typedef struct aio_anchor_cb aio_anchor_cb;
141
142 struct aio_lio_context {
143 int io_waiter;
144 int io_issued;
145 int io_completed;
146 };
147 typedef struct aio_lio_context aio_lio_context;
148
149
150 /*
151 * Notes on aio sleep / wake channels.
152  * We currently pick a couple of fields within the proc structure that give us
153  * sleep channels that do not currently collide with any other kernel routines.
154 * At this time, for binary compatibility reasons, we cannot create new proc fields.
155 */
156 #define AIO_SUSPEND_SLEEP_CHAN p_aio_active_count
157 #define AIO_CLEANUP_SLEEP_CHAN p_aio_total_count
158
159 #define ASSERT_AIO_FROM_PROC(aiop, theproc) \
160 if ((aiop)->procp != (theproc)) { \
161 panic("AIO on a proc list that does not belong to that proc.\n"); \
162 }
163
164 /*
165 * LOCAL PROTOTYPES
166 */
167 static void aio_proc_lock(proc_t procp);
168 static void aio_proc_lock_spin(proc_t procp);
169 static void aio_proc_unlock(proc_t procp);
170 static lck_mtx_t* aio_proc_mutex(proc_t procp);
171 static void aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp);
172 static void aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp);
173 static int aio_get_process_count(proc_t procp );
174 static int aio_active_requests_for_process(proc_t procp );
175 static int aio_proc_active_requests_for_file(proc_t procp, int fd);
176 static boolean_t is_already_queued(proc_t procp, user_addr_t aiocbp );
177 static boolean_t should_cancel(aio_workq_entry *entryp, user_addr_t aiocbp, int fd);
178
179 static void aio_entry_lock(aio_workq_entry *entryp);
180 static void aio_entry_lock_spin(aio_workq_entry *entryp);
181 static aio_workq_t aio_entry_workq(aio_workq_entry *entryp);
182 static lck_mtx_t* aio_entry_mutex(__unused aio_workq_entry *entryp);
183 static void aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
184 static void aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
185 static void aio_entry_ref_locked(aio_workq_entry *entryp);
186 static void aio_entry_unref_locked(aio_workq_entry *entryp);
187 static void aio_entry_ref(aio_workq_entry *entryp);
188 static void aio_entry_unref(aio_workq_entry *entryp);
189 static void aio_entry_update_for_cancel(aio_workq_entry *entryp, boolean_t cancelled,
190 int wait_for_completion, boolean_t disable_notification);
191 static int aio_entry_try_workq_remove(aio_workq_entry *entryp);
192 static boolean_t aio_delay_fsync_request( aio_workq_entry *entryp );
193 static int aio_free_request(aio_workq_entry *entryp);
194
195 static void aio_workq_init(aio_workq_t wq);
196 static void aio_workq_lock_spin(aio_workq_t wq);
197 static void aio_workq_unlock(aio_workq_t wq);
198 static lck_mtx_t* aio_workq_mutex(aio_workq_t wq);
199
200 static void aio_work_thread( void );
201 static aio_workq_entry *aio_get_some_work( void );
202
203 static int aio_get_all_queues_count( void );
204 static int aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO );
205 static int aio_validate( aio_workq_entry *entryp );
206 static int aio_increment_total_count(void);
207 static int aio_decrement_total_count(void);
208
209 static int do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp, int wait_for_completion, boolean_t disable_notification );
210 static void do_aio_completion( aio_workq_entry *entryp );
211 static int do_aio_fsync( aio_workq_entry *entryp );
212 static int do_aio_read( aio_workq_entry *entryp );
213 static int do_aio_write( aio_workq_entry *entryp );
214 static void do_munge_aiocb_user32_to_user( struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
215 static void do_munge_aiocb_user64_to_user( struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
216 static int lio_create_entry(proc_t procp,
217 user_addr_t aiocbp,
218 void *group_tag,
219 aio_workq_entry **entrypp );
220 static aio_workq_entry *aio_create_queue_entry(proc_t procp,
221 user_addr_t aiocbp,
222 void *group_tag,
223 int kindOfIO);
224 static user_addr_t *aio_copy_in_list(proc_t procp, user_addr_t aiocblist, int nent);
225 static void free_lio_context(aio_lio_context* context);
226 static void aio_enqueue_work( proc_t procp, aio_workq_entry *entryp, int proc_locked);
227
228 #define ASSERT_AIO_PROC_LOCK_OWNED(p) lck_mtx_assert(aio_proc_mutex((p)), LCK_MTX_ASSERT_OWNED)
229 #define ASSERT_AIO_WORKQ_LOCK_OWNED(q) lck_mtx_assert(aio_workq_mutex((q)), LCK_MTX_ASSERT_OWNED)
230 #define ASSERT_AIO_ENTRY_LOCK_OWNED(e) lck_mtx_assert(aio_entry_mutex((e)), LCK_MTX_ASSERT_OWNED)
231
232 /*
233 * EXTERNAL PROTOTYPES
234 */
235
236 /* in ...bsd/kern/sys_generic.c */
237 extern int dofileread(vfs_context_t ctx, struct fileproc *fp,
238 user_addr_t bufp, user_size_t nbyte,
239 off_t offset, int flags, user_ssize_t *retval );
240 extern int dofilewrite(vfs_context_t ctx, struct fileproc *fp,
241 user_addr_t bufp, user_size_t nbyte, off_t offset,
242 int flags, user_ssize_t *retval );
243 #if DEBUG
244 static uint32_t lio_contexts_alloced = 0;
245 #endif /* DEBUG */
246
247 /*
248 * aio external global variables.
249 */
250 extern int aio_max_requests; /* AIO_MAX - configurable */
251 extern int aio_max_requests_per_process; /* AIO_PROCESS_MAX - configurable */
252 extern int aio_worker_threads; /* AIO_THREAD_COUNT - configurable */
253
254
255 /*
256 * aio static variables.
257 */
258 static aio_anchor_cb aio_anchor;
259 static lck_grp_t *aio_proc_lock_grp;
260 static lck_grp_t *aio_entry_lock_grp;
261 static lck_grp_t *aio_queue_lock_grp;
262 static lck_attr_t *aio_lock_attr;
263 static lck_grp_attr_t *aio_lock_grp_attr;
264 static struct zone *aio_workq_zonep;
265 static lck_mtx_t aio_entry_mtx;
266 static lck_mtx_t aio_proc_mtx;
267
268 static void
269 aio_entry_lock(__unused aio_workq_entry *entryp)
270 {
271 lck_mtx_lock(&aio_entry_mtx);
272 }
273
274 static void
275 aio_entry_lock_spin(__unused aio_workq_entry *entryp)
276 {
277 lck_mtx_lock_spin(&aio_entry_mtx);
278 }
279
280 static void
281 aio_entry_unlock(__unused aio_workq_entry *entryp)
282 {
283 lck_mtx_unlock(&aio_entry_mtx);
284 }
285
286 /* Hash */
287 static aio_workq_t
288 aio_entry_workq(__unused aio_workq_entry *entryp)
289 {
290 return &aio_anchor.aio_async_workqs[0];
291 }
292
293 static lck_mtx_t*
294 aio_entry_mutex(__unused aio_workq_entry *entryp)
295 {
296 return &aio_entry_mtx;
297 }
298
299 static void
300 aio_workq_init(aio_workq_t wq)
301 {
302 TAILQ_INIT(&wq->aioq_entries);
303 wq->aioq_count = 0;
304 lck_mtx_init(&wq->aioq_mtx, aio_queue_lock_grp, aio_lock_attr);
305 waitq_init(&wq->aioq_waitq, SYNC_POLICY_FIFO);
306 }
307
308
309 /*
310 * Can be passed a queue which is locked spin.
311 */
312 static void
313 aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
314 {
315 ASSERT_AIO_WORKQ_LOCK_OWNED(queue);
316
317 if (entryp->aio_workq_link.tqe_prev == NULL) {
318 panic("Trying to remove an entry from a work queue, but it is not on a queue\n");
319 }
320
321 TAILQ_REMOVE(&queue->aioq_entries, entryp, aio_workq_link);
322 queue->aioq_count--;
323 entryp->aio_workq_link.tqe_prev = NULL; /* Not on a workq */
324
325 if (queue->aioq_count < 0) {
326 panic("Negative count on a queue.\n");
327 }
328 }
329
330 static void
331 aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
332 {
333 ASSERT_AIO_WORKQ_LOCK_OWNED(queue);
334
335 TAILQ_INSERT_TAIL(&queue->aioq_entries, entryp, aio_workq_link);
336 if (queue->aioq_count < 0) {
337 panic("Negative count on a queue.\n");
338 }
339 queue->aioq_count++;
340 }
341
342 static void
343 aio_proc_lock(proc_t procp)
344 {
345 lck_mtx_lock(aio_proc_mutex(procp));
346 }
347
348 static void
349 aio_proc_lock_spin(proc_t procp)
350 {
351 lck_mtx_lock_spin(aio_proc_mutex(procp));
352 }
353
354 static void
355 aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp)
356 {
357 ASSERT_AIO_PROC_LOCK_OWNED(procp);
358
359 TAILQ_REMOVE(&procp->p_aio_activeq, entryp, aio_proc_link );
360 TAILQ_INSERT_TAIL( &procp->p_aio_doneq, entryp, aio_proc_link);
361 procp->p_aio_active_count--;
362 OSIncrementAtomic(&aio_anchor.aio_done_count);
363 }
364
365 static void
366 aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp)
367 {
368 TAILQ_REMOVE(&procp->p_aio_doneq, entryp, aio_proc_link);
369 OSDecrementAtomic(&aio_anchor.aio_done_count);
370 aio_decrement_total_count();
371 procp->p_aio_total_count--;
372 }
373
374 static void
375 aio_proc_unlock(proc_t procp)
376 {
377 lck_mtx_unlock(aio_proc_mutex(procp));
378 }
379
380 static lck_mtx_t*
381 aio_proc_mutex(proc_t procp)
382 {
383 return &procp->p_mlock;
384 }
385
386 static void
387 aio_entry_ref_locked(aio_workq_entry *entryp)
388 {
389 ASSERT_AIO_ENTRY_LOCK_OWNED(entryp);
390
391 if (entryp->aio_refcount < 0) {
392 panic("AIO workq entry with a negative refcount.\n");
393 }
394 entryp->aio_refcount++;
395 }
396
397
398 /* Drop a reference; aio_entry_unref() does the free when appropriate */
399 static void
400 aio_entry_unref_locked(aio_workq_entry *entryp)
401 {
402 ASSERT_AIO_ENTRY_LOCK_OWNED(entryp);
403
404 entryp->aio_refcount--;
405 if (entryp->aio_refcount < 0) {
406 panic("AIO workq entry with a negative refcount.\n");
407 }
408 }
409
410 static void
411 aio_entry_ref(aio_workq_entry *entryp)
412 {
413 aio_entry_lock_spin(entryp);
414 aio_entry_ref_locked(entryp);
415 aio_entry_unlock(entryp);
416 }
417 static void
418 aio_entry_unref(aio_workq_entry *entryp)
419 {
420 aio_entry_lock_spin(entryp);
421 aio_entry_unref_locked(entryp);
422
423 if ((entryp->aio_refcount == 0) && ((entryp->flags & AIO_DO_FREE) != 0)) {
424 aio_entry_unlock(entryp);
425 aio_free_request(entryp);
426 } else {
427 aio_entry_unlock(entryp);
428 }
429
430 return;
431 }
432
433 static void
434 aio_entry_update_for_cancel(aio_workq_entry *entryp, boolean_t cancelled, int wait_for_completion, boolean_t disable_notification)
435 {
436 aio_entry_lock_spin(entryp);
437
438 if (cancelled) {
439 aio_entry_ref_locked(entryp);
440 entryp->errorval = ECANCELED;
441 entryp->returnval = -1;
442 }
443
444 if (wait_for_completion) {
445 entryp->flags |= wait_for_completion; /* flag for special completion processing */
446 }
447
448 if (disable_notification) {
449 entryp->flags |= AIO_DISABLE; /* Don't want a signal */
450 }
451
452 aio_entry_unlock(entryp);
453 }
454
455 static int
456 aio_entry_try_workq_remove(aio_workq_entry *entryp)
457 {
458 /* Can only be cancelled if it's still on a work queue */
459 if (entryp->aio_workq_link.tqe_prev != NULL) {
460 aio_workq_t queue;
461
462 /* Will have to check again under the lock */
463 queue = aio_entry_workq(entryp);
464 aio_workq_lock_spin(queue);
465 if (entryp->aio_workq_link.tqe_prev != NULL) {
466 aio_workq_remove_entry_locked(queue, entryp);
467 aio_workq_unlock(queue);
468 return 1;
469 } else {
470 aio_workq_unlock(queue);
471 }
472 }
473
474 return 0;
475 }
476
477 static void
478 aio_workq_lock_spin(aio_workq_t wq)
479 {
480 lck_mtx_lock_spin(aio_workq_mutex(wq));
481 }
482
483 static void
484 aio_workq_unlock(aio_workq_t wq)
485 {
486 lck_mtx_unlock(aio_workq_mutex(wq));
487 }
488
489 static lck_mtx_t*
490 aio_workq_mutex(aio_workq_t wq)
491 {
492 return &wq->aioq_mtx;
493 }
494
495 /*
496 * aio_cancel - attempt to cancel one or more async IO requests currently
497 * outstanding against file descriptor uap->fd. If uap->aiocbp is not
498 * NULL then only one specific IO is cancelled (if possible). If uap->aiocbp
499 * is NULL then all outstanding async IO request for the given file
500 * descriptor are cancelled (if possible).
501 */
502 int
503 aio_cancel(proc_t p, struct aio_cancel_args *uap, int *retval )
504 {
505 struct user_aiocb my_aiocb;
506 int result;
507
508 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START,
509 (int)p, (int)uap->aiocbp, 0, 0, 0 );
510
511 /* quick check to see if there are any async IO requests queued up */
512 if (aio_get_all_queues_count() < 1) {
513 result = 0;
514 *retval = AIO_ALLDONE;
515 goto ExitRoutine;
516 }
517
518 *retval = -1;
519 if (uap->aiocbp != USER_ADDR_NULL) {
520 if (proc_is64bit(p)) {
521 struct user64_aiocb aiocb64;
522
523 result = copyin( uap->aiocbp, &aiocb64, sizeof(aiocb64));
524 if (result == 0) {
525 do_munge_aiocb_user64_to_user(&aiocb64, &my_aiocb);
526 }
527 } else {
528 struct user32_aiocb aiocb32;
529
530 result = copyin( uap->aiocbp, &aiocb32, sizeof(aiocb32));
531 if (result == 0) {
532 do_munge_aiocb_user32_to_user( &aiocb32, &my_aiocb );
533 }
534 }
535
536 if (result != 0) {
537 result = EAGAIN;
538 goto ExitRoutine;
539 }
540
541 /* NOTE - POSIX standard says a mismatch between the file */
542 /* descriptor passed in and the file descriptor embedded in */
543 /* the aiocb causes unspecified results. We return EBADF in */
544 /* that situation. */
545 if (uap->fd != my_aiocb.aio_fildes) {
546 result = EBADF;
547 goto ExitRoutine;
548 }
549 }
550
551 aio_proc_lock(p);
552 result = do_aio_cancel_locked( p, uap->fd, uap->aiocbp, 0, FALSE );
553 ASSERT_AIO_PROC_LOCK_OWNED(p);
554 aio_proc_unlock(p);
555
556 if (result != -1) {
557 *retval = result;
558 result = 0;
559 goto ExitRoutine;
560 }
561
562 result = EBADF;
563
564 ExitRoutine:
565 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END,
566 (int)p, (int)uap->aiocbp, result, 0, 0 );
567
568 return result;
569 } /* aio_cancel */
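/*
 * Illustrative user-space sketch (standard <aio.h> usage, not code from this
 * implementation): interpreting the aio_cancel() results described above.
 * AIO_CANCELED and AIO_ALLDONE both leave an entry on the done queue that
 * still has to be reaped with aio_return(); AIO_NOTCANCELED means the IO is
 * in flight and the caller must wait for it to finish.
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *	#include <stdio.h>
 *
 *	static void
 *	cancel_and_reap(int fd, struct aiocb *cb)
 *	{
 *		switch (aio_cancel(fd, cb)) {
 *		case AIO_NOTCANCELED:
 *			while (aio_error(cb) == EINPROGRESS)
 *				;
 *			(void)aio_return(cb);
 *			break;
 *		case AIO_CANCELED:
 *		case AIO_ALLDONE:
 *			(void)aio_return(cb);
 *			break;
 *		default:
 *			perror("aio_cancel");
 *			break;
 *		}
 *	}
 */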
570
571
572 /*
573 * _aio_close - internal function used to clean up async IO requests for
574 * a file descriptor that is closing.
575 * THIS MAY BLOCK.
576 */
577 __private_extern__ void
578 _aio_close(proc_t p, int fd )
579 {
580 int error;
581
582 /* quick check to see if there are any async IO requests queued up */
583 if (aio_get_all_queues_count() < 1) {
584 return;
585 }
586
587 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_START,
588 (int)p, fd, 0, 0, 0 );
589
590 /* cancel all async IO requests on our todo queues for this file descriptor */
591 aio_proc_lock(p);
592 error = do_aio_cancel_locked( p, fd, 0, AIO_CLOSE_WAIT, FALSE );
593 ASSERT_AIO_PROC_LOCK_OWNED(p);
594 if (error == AIO_NOTCANCELED) {
595 /*
596 * AIO_NOTCANCELED is returned when we find an aio request for this process
597 * and file descriptor on the active async IO queue. Active requests cannot
598 * be cancelled so we must wait for them to complete. We will get a special
599 * wake up call on our channel used to sleep for ALL active requests to
600 * complete. This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
601 * when we must wait for all active aio requests.
602 */
603
604 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) | DBG_FUNC_NONE,
605 (int)p, fd, 0, 0, 0 );
606
607 while (aio_proc_active_requests_for_file(p, fd) > 0) {
608 msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_close", 0 );
609 }
610 }
611
612 aio_proc_unlock(p);
613
614 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_END,
615 (int)p, fd, 0, 0, 0 );
616
617 return;
618 } /* _aio_close */
619
620
621 /*
622 * aio_error - return the error status associated with the async IO
623 * request referred to by uap->aiocbp. The error status is the errno
624  * value that would be set by the corresponding IO request (read, write,
625 * fdatasync, or sync).
626 */
627 int
628 aio_error(proc_t p, struct aio_error_args *uap, int *retval )
629 {
630 aio_workq_entry *entryp;
631 int error;
632
633 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_START,
634 (int)p, (int)uap->aiocbp, 0, 0, 0 );
635
636 /* see if there are any aios to check */
637 if (aio_get_all_queues_count() < 1) {
638 return EINVAL;
639 }
640
641 aio_proc_lock(p);
642
643 /* look for a match on our queue of async IO requests that have completed */
644 TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
645 if (entryp->uaiocbp == uap->aiocbp) {
646 ASSERT_AIO_FROM_PROC(entryp, p);
647
648 aio_entry_lock_spin(entryp);
649 *retval = entryp->errorval;
650 error = 0;
651 aio_entry_unlock(entryp);
652 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) | DBG_FUNC_NONE,
653 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
654 goto ExitRoutine;
655 }
656 }
657
658 /* look for a match on our queue of active async IO requests */
659 TAILQ_FOREACH( entryp, &p->p_aio_activeq, aio_proc_link) {
660 if (entryp->uaiocbp == uap->aiocbp) {
661 ASSERT_AIO_FROM_PROC(entryp, p);
662 *retval = EINPROGRESS;
663 error = 0;
664 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) | DBG_FUNC_NONE,
665 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
666 goto ExitRoutine;
667 }
668 }
669
670 error = EINVAL;
671
672 ExitRoutine:
673 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_END,
674 (int)p, (int)uap->aiocbp, error, 0, 0 );
675 aio_proc_unlock(p);
676
677 return error;
678 } /* aio_error */
679
680
681 /*
682 * aio_fsync - asynchronously force all IO operations associated
683 * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
684 * queued at the time of the call to the synchronized completion state.
685 * NOTE - we do not support op O_DSYNC at this point since we do not support the
686 * fdatasync() call.
687 */
688 int
689 aio_fsync(proc_t p, struct aio_fsync_args *uap, int *retval )
690 {
691 int error;
692 int fsync_kind;
693
694 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_START,
695 (int)p, (int)uap->aiocbp, uap->op, 0, 0 );
696
697 *retval = 0;
698 /* 0 := O_SYNC for binary backward compatibility with Panther */
699 if (uap->op == O_SYNC || uap->op == 0) {
700 fsync_kind = AIO_FSYNC;
701 } else if (uap->op == O_DSYNC) {
702 fsync_kind = AIO_DSYNC;
703 } else {
704 *retval = -1;
705 error = EINVAL;
706 goto ExitRoutine;
707 }
708
709 error = aio_queue_async_request( p, uap->aiocbp, fsync_kind );
710 if (error != 0) {
711 *retval = -1;
712 }
713
714 ExitRoutine:
715 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_END,
716 (int)p, (int)uap->aiocbp, error, 0, 0 );
717
718 return error;
719 } /* aio_fsync */
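/*
 * Illustrative user-space sketch (standard <aio.h> usage, not code from this
 * implementation): queueing an asynchronous flush of everything already
 * issued on a descriptor. The sketch sticks to O_SYNC, in line with the
 * comment above about limited O_DSYNC/fdatasync support.
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *	#include <fcntl.h>
 *	#include <string.h>
 *
 *	static int
 *	flush_async(int fd, struct aiocb *cb)
 *	{
 *		memset(cb, 0, sizeof(*cb));
 *		cb->aio_fildes = fd;
 *		if (aio_fsync(O_SYNC, cb) != 0)
 *			return -1;
 *		while (aio_error(cb) == EINPROGRESS)
 *			;
 *		return (int)aio_return(cb);
 *	}
 */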
720
721
722 /* aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
723 * file descriptor (uap->aiocbp->aio_fildes) into the buffer
724 * (uap->aiocbp->aio_buf).
725 */
726 int
727 aio_read(proc_t p, struct aio_read_args *uap, int *retval )
728 {
729 int error;
730
731 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_START,
732 (int)p, (int)uap->aiocbp, 0, 0, 0 );
733
734 *retval = 0;
735
736 error = aio_queue_async_request( p, uap->aiocbp, AIO_READ );
737 if (error != 0) {
738 *retval = -1;
739 }
740
741 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_END,
742 (int)p, (int)uap->aiocbp, error, 0, 0 );
743
744 return error;
745 } /* aio_read */
746
747
748 /*
749 * aio_return - return the return status associated with the async IO
750 * request referred to by uap->aiocbp. The return status is the value
751  * that would be returned by the corresponding IO request (read, write,
752 * fdatasync, or sync). This is where we release kernel resources
753 * held for async IO call associated with the given aiocb pointer.
754 */
755 int
756 aio_return(proc_t p, struct aio_return_args *uap, user_ssize_t *retval )
757 {
758 aio_workq_entry *entryp;
759 int error;
760 boolean_t proc_lock_held = FALSE;
761
762 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_START,
763 (int)p, (int)uap->aiocbp, 0, 0, 0 );
764
765 /* See if there are any entries to check */
766 if (aio_get_all_queues_count() < 1) {
767 error = EINVAL;
768 goto ExitRoutine;
769 }
770
771 aio_proc_lock(p);
772 proc_lock_held = TRUE;
773 *retval = 0;
774
775 /* look for a match on our queue of async IO requests that have completed */
776 TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
777 ASSERT_AIO_FROM_PROC(entryp, p);
778 if (entryp->uaiocbp == uap->aiocbp) {
779 /* Done and valid for aio_return(), pull it off the list */
780 aio_proc_remove_done_locked(p, entryp);
781
782 /* Drop the proc lock, but keep the entry locked */
783 aio_entry_lock(entryp);
784 aio_proc_unlock(p);
785 proc_lock_held = FALSE;
786
787 *retval = entryp->returnval;
788 error = 0;
789
790 /* No references and off all lists, safe to free */
791 if (entryp->aio_refcount == 0) {
792 aio_entry_unlock(entryp);
793 aio_free_request(entryp);
794 } else {
795 /* Whoever has the refcount will have to free it */
796 entryp->flags |= AIO_DO_FREE;
797 aio_entry_unlock(entryp);
798 }
799
800
801 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) | DBG_FUNC_NONE,
802 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
803 goto ExitRoutine;
804 }
805 }
806
807 /* look for a match on our queue of active async IO requests */
808 TAILQ_FOREACH( entryp, &p->p_aio_activeq, aio_proc_link) {
809 ASSERT_AIO_FROM_PROC(entryp, p);
810 if (entryp->uaiocbp == uap->aiocbp) {
811 error = EINPROGRESS;
812 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) | DBG_FUNC_NONE,
813 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
814 goto ExitRoutine;
815 }
816 }
817
818 error = EINVAL;
819
820 ExitRoutine:
821 if (proc_lock_held) {
822 aio_proc_unlock(p);
823 }
824 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_END,
825 (int)p, (int)uap->aiocbp, error, 0, 0 );
826
827 return error;
828 } /* aio_return */
829
830
831 /*
832 * _aio_exec - internal function used to clean up async IO requests for
833 * a process that is going away due to exec(). We cancel any async IOs
834 * we can and wait for those already active. We also disable signaling
835 * for cancelled or active aio requests that complete.
836 * This routine MAY block!
837 */
838 __private_extern__ void
839 _aio_exec(proc_t p )
840 {
841 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_START,
842 (int)p, 0, 0, 0, 0 );
843
844 _aio_exit( p );
845
846 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_END,
847 (int)p, 0, 0, 0, 0 );
848
849 return;
850 } /* _aio_exec */
851
852
853 /*
854 * _aio_exit - internal function used to clean up async IO requests for
855 * a process that is terminating (via exit() or exec() ). We cancel any async IOs
856 * we can and wait for those already active. We also disable signaling
857 * for cancelled or active aio requests that complete. This routine MAY block!
858 */
859 __private_extern__ void
860 _aio_exit(proc_t p )
861 {
862 int error;
863 aio_workq_entry *entryp;
864
865
866 /* quick check to see if there are any async IO requests queued up */
867 if (aio_get_all_queues_count() < 1) {
868 return;
869 }
870
871 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_START,
872 (int)p, 0, 0, 0, 0 );
873
874 aio_proc_lock(p);
875
876 /*
877 * cancel async IO requests on the todo work queue and wait for those
878 * already active to complete.
879 */
880 error = do_aio_cancel_locked( p, 0, 0, AIO_EXIT_WAIT, TRUE );
881 ASSERT_AIO_PROC_LOCK_OWNED(p);
882 if (error == AIO_NOTCANCELED) {
883 /*
884 * AIO_NOTCANCELED is returned when we find an aio request for this process
885 * on the active async IO queue. Active requests cannot be cancelled so we
886 * must wait for them to complete. We will get a special wake up call on
887 * our channel used to sleep for ALL active requests to complete. This sleep
888 * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
889 * active aio requests.
890 */
891
892 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) | DBG_FUNC_NONE,
893 (int)p, 0, 0, 0, 0 );
894
895 while (p->p_aio_active_count != 0) {
896 msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_exit", 0 );
897 }
898 }
899
900 if (p->p_aio_active_count != 0) {
901 panic("Exiting process has %d active AIOs after cancellation has completed.\n", p->p_aio_active_count);
902 }
903
904 /* release all aio resources used by this process */
905 entryp = TAILQ_FIRST( &p->p_aio_doneq );
906 while (entryp != NULL) {
907 ASSERT_AIO_FROM_PROC(entryp, p);
908 aio_workq_entry *next_entryp;
909
910 next_entryp = TAILQ_NEXT( entryp, aio_proc_link);
911 aio_proc_remove_done_locked(p, entryp);
912
913 /* we cannot free requests that are still completing */
914 aio_entry_lock_spin(entryp);
915 if (entryp->aio_refcount == 0) {
916 aio_proc_unlock(p);
917 aio_entry_unlock(entryp);
918 aio_free_request(entryp);
919
920 /* need to start over since aio_doneq may have been */
921 /* changed while we were away. */
922 aio_proc_lock(p);
923 entryp = TAILQ_FIRST( &p->p_aio_doneq );
924 continue;
925 } else {
926 /* whoever has the reference will have to do the free */
927 entryp->flags |= AIO_DO_FREE;
928 }
929
930 aio_entry_unlock(entryp);
931 entryp = next_entryp;
932 }
933
934 aio_proc_unlock(p);
935
936 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END,
937 (int)p, 0, 0, 0, 0 );
938 return;
939 } /* _aio_exit */
940
941
942 static boolean_t
943 should_cancel(aio_workq_entry *entryp, user_addr_t aiocbp, int fd)
944 {
945 if ((aiocbp == USER_ADDR_NULL && fd == 0) ||
946 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
947 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes)) {
948 return TRUE;
949 }
950
951 return FALSE;
952 }
953
954 /*
955 * do_aio_cancel_locked - cancel async IO requests (if possible). We get called by
956 * aio_cancel, close, and at exit.
957 * There are three modes of operation: 1) cancel all async IOs for a process -
958 * fd is 0 and aiocbp is NULL 2) cancel all async IOs for file descriptor - fd
959 * is > 0 and aiocbp is NULL 3) cancel one async IO associated with the given
960 * aiocbp.
961 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
962 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
963 * target async IO requests, and AIO_ALLDONE if all target async IO requests
964 * were already complete.
965  * WARNING - do not dereference aiocbp in this routine; it may point to user
966 * land data that has not been copied in (when called from aio_cancel() )
967 *
968 * Called with proc locked, and returns the same way.
969 */
970 static int
971 do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp,
972 int wait_for_completion, boolean_t disable_notification )
973 {
974 ASSERT_AIO_PROC_LOCK_OWNED(p);
975
976 aio_workq_entry *entryp;
977 int result;
978
979 result = -1;
980
981 /* look for a match on our queue of async todo work. */
982 entryp = TAILQ_FIRST(&p->p_aio_activeq);
983 while (entryp != NULL) {
984 ASSERT_AIO_FROM_PROC(entryp, p);
985 aio_workq_entry *next_entryp;
986
987 next_entryp = TAILQ_NEXT( entryp, aio_proc_link);
988 if (!should_cancel(entryp, aiocbp, fd)) {
989 entryp = next_entryp;
990 continue;
991 }
992
993 /* Can only be cancelled if it's still on a work queue */
994 if (aio_entry_try_workq_remove(entryp) != 0) {
995 /* Have removed from workq. Update entry state and take a ref */
996 aio_entry_update_for_cancel(entryp, TRUE, 0, disable_notification);
997
998 /* Put on the proc done queue and update counts, then unlock the proc */
999 aio_proc_move_done_locked(p, entryp);
1000 aio_proc_unlock(p);
1001
1002 /* Now it's officially cancelled. Do the completion */
1003 result = AIO_CANCELED;
1004 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) | DBG_FUNC_NONE,
1005 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
1006 do_aio_completion(entryp);
1007
1008 /* This will free if the aio_return() has already happened ... */
1009 aio_entry_unref(entryp);
1010 aio_proc_lock(p);
1011
1012 if (aiocbp != USER_ADDR_NULL) {
1013 return result;
1014 }
1015
1016 /*
1017 * Restart from the head of the proc active queue since it
1018 * may have been changed while we were away doing completion
1019 * processing.
1020 *
1021 * Note that if we found an uncancellable AIO before, we will
1022 * either find it again or discover that it's been completed,
1023 * so resetting the result will not cause us to return success
1024 * despite outstanding AIOs.
1025 */
1026 entryp = TAILQ_FIRST(&p->p_aio_activeq);
1027 result = -1; /* As if beginning anew */
1028 } else {
1029 /*
1030 * It's been taken off the active queue already, i.e. is in flight.
1031 * All we can do is ask for notification.
1032 */
1033 result = AIO_NOTCANCELED;
1034
1035 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE,
1036 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
1037
1038 /* Mark for waiting and such; will not take a ref if "cancelled" arg is FALSE */
1039 aio_entry_update_for_cancel(entryp, FALSE, wait_for_completion, disable_notification);
1040
1041 if (aiocbp != USER_ADDR_NULL) {
1042 return result;
1043 }
1044 entryp = next_entryp;
1045 }
1046 } /* while... */
1047
1048 /*
1049 * if we didn't find any matches on the todo or active queues then look for a
1050 * match on our queue of async IO requests that have completed and if found
1051 * return AIO_ALLDONE result.
1052 *
1053 * Proc AIO lock is still held.
1054 */
1055 if (result == -1) {
1056 TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
1057 ASSERT_AIO_FROM_PROC(entryp, p);
1058 if (should_cancel(entryp, aiocbp, fd)) {
1059 result = AIO_ALLDONE;
1060 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE,
1061 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
1062
1063 if (aiocbp != USER_ADDR_NULL) {
1064 return result;
1065 }
1066 }
1067 }
1068 }
1069
1070 return result;
1071 }
1072 /* do_aio_cancel_locked */
1073
1074
1075 /*
1076 * aio_suspend - suspend the calling thread until at least one of the async
1077 * IO operations referenced by uap->aiocblist has completed, until a signal
1078 * interrupts the function, or uap->timeoutp time interval (optional) has
1079 * passed.
1080 * Returns 0 if one or more async IOs have completed else -1 and errno is
1081 * set appropriately - EAGAIN if timeout elapses or EINTR if an interrupt
1082 * woke us up.
1083 */
1084 int
1085 aio_suspend(proc_t p, struct aio_suspend_args *uap, int *retval )
1086 {
1087 __pthread_testcancel(1);
1088 return aio_suspend_nocancel(p, (struct aio_suspend_nocancel_args *)uap, retval);
1089 }
1090
1091
1092 int
1093 aio_suspend_nocancel(proc_t p, struct aio_suspend_nocancel_args *uap, int *retval )
1094 {
1095 int error;
1096 int i, count;
1097 uint64_t abstime;
1098 struct user_timespec ts;
1099 aio_workq_entry *entryp;
1100 user_addr_t *aiocbpp;
1101
1102 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START,
1103 (int)p, uap->nent, 0, 0, 0 );
1104
1105 *retval = -1;
1106 abstime = 0;
1107 aiocbpp = NULL;
1108
1109 count = aio_get_all_queues_count();
1110 if (count < 1) {
1111 error = EINVAL;
1112 goto ExitThisRoutine;
1113 }
1114
1115 if (uap->nent < 1 || uap->nent > aio_max_requests_per_process) {
1116 error = EINVAL;
1117 goto ExitThisRoutine;
1118 }
1119
1120 if (uap->timeoutp != USER_ADDR_NULL) {
1121 if (proc_is64bit(p)) {
1122 struct user64_timespec temp;
1123 error = copyin( uap->timeoutp, &temp, sizeof(temp));
1124 if (error == 0) {
1125 ts.tv_sec = temp.tv_sec;
1126 ts.tv_nsec = temp.tv_nsec;
1127 }
1128 } else {
1129 struct user32_timespec temp;
1130 error = copyin( uap->timeoutp, &temp, sizeof(temp));
1131 if (error == 0) {
1132 ts.tv_sec = temp.tv_sec;
1133 ts.tv_nsec = temp.tv_nsec;
1134 }
1135 }
1136 if (error != 0) {
1137 error = EAGAIN;
1138 goto ExitThisRoutine;
1139 }
1140
1141 if (ts.tv_sec < 0 || ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000) {
1142 error = EINVAL;
1143 goto ExitThisRoutine;
1144 }
1145
1146 nanoseconds_to_absolutetime((uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
1147 &abstime );
1148 clock_absolutetime_interval_to_deadline( abstime, &abstime );
1149 }
1150
1151 aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent);
1152 if (aiocbpp == NULL) {
1153 error = EAGAIN;
1154 goto ExitThisRoutine;
1155 }
1156
1157 /* check list of aio requests to see if any have completed */
1158 check_for_our_aiocbp:
1159 aio_proc_lock_spin(p);
1160 for (i = 0; i < uap->nent; i++) {
1161 user_addr_t aiocbp;
1162
1163 /* NULL elements are legal so check for 'em */
1164 aiocbp = *(aiocbpp + i);
1165 if (aiocbp == USER_ADDR_NULL) {
1166 continue;
1167 }
1168
1169 /* return immediately if any aio request in the list is done */
1170 TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
1171 ASSERT_AIO_FROM_PROC(entryp, p);
1172 if (entryp->uaiocbp == aiocbp) {
1173 aio_proc_unlock(p);
1174 *retval = 0;
1175 error = 0;
1176 goto ExitThisRoutine;
1177 }
1178 }
1179 } /* for ( ; i < uap->nent; ) */
1180
1181 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE,
1182 (int)p, uap->nent, 0, 0, 0 );
1183
1184 /*
1185 * wait for an async IO to complete or a signal fires or timeout expires.
1186 * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
1187 * interrupts us. If an async IO completes before a signal fires or our
1188 * timeout expires, we get a wakeup call from aio_work_thread().
1189 */
1190
1191 error = msleep1(&p->AIO_SUSPEND_SLEEP_CHAN, aio_proc_mutex(p), PCATCH | PWAIT | PDROP, "aio_suspend", abstime); /* XXX better priority? */
1192 if (error == 0) {
1193 /*
1194 * got our wakeup call from aio_work_thread().
1195 * Since we can get a wakeup on this channel from another thread in the
1196 * same process we head back up to make sure this is for the correct aiocbp.
1197 * If it is the correct aiocbp we will return from where we do the check
1198 * (see entryp->uaiocbp == aiocbp after check_for_our_aiocbp label)
1199 * else we will fall out and just sleep again.
1200 */
1201 goto check_for_our_aiocbp;
1202 } else if (error == EWOULDBLOCK) {
1203 /* our timeout expired */
1204 error = EAGAIN;
1205 } else {
1206 /* we were interrupted */
1207 error = EINTR;
1208 }
1209
1210 ExitThisRoutine:
1211 if (aiocbpp != NULL) {
1212 FREE( aiocbpp, M_TEMP );
1213 }
1214
1215 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_END,
1216 (int)p, uap->nent, error, 0, 0 );
1217
1218 return error;
1219 } /* aio_suspend */
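/*
 * Illustrative user-space sketch (standard <aio.h> usage, not code from this
 * implementation): waiting up to one second for any request in a list to
 * complete, distinguishing the EAGAIN (timeout) and EINTR (signal) cases
 * described above. The helper returns 0 when something completed, 1 on
 * timeout, and -1 on interrupt or error.
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *	#include <time.h>
 *
 *	static int
 *	wait_for_any(const struct aiocb *const list[], int nent)
 *	{
 *		struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 *
 *		if (aio_suspend(list, nent, &ts) == 0)
 *			return 0;
 *		return (errno == EAGAIN) ? 1 : -1;
 *	}
 */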
1220
1221
1222 /* aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
1223 * file descriptor (uap->aiocbp->aio_fildes) from the buffer
1224 * (uap->aiocbp->aio_buf).
1225 */
1226
1227 int
1228 aio_write(proc_t p, struct aio_write_args *uap, int *retval )
1229 {
1230 int error;
1231
1232 *retval = 0;
1233
1234 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_START,
1235 (int)p, (int)uap->aiocbp, 0, 0, 0 );
1236
1237 error = aio_queue_async_request( p, uap->aiocbp, AIO_WRITE );
1238 if (error != 0) {
1239 *retval = -1;
1240 }
1241
1242 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_END,
1243 (int)p, (int)uap->aiocbp, error, 0, 0 );
1244
1245 return error;
1246 } /* aio_write */
1247
1248
1249 static user_addr_t *
1250 aio_copy_in_list(proc_t procp, user_addr_t aiocblist, int nent)
1251 {
1252 user_addr_t *aiocbpp;
1253 int i, result;
1254
1255 /* we reserve enough space for largest possible pointer size */
1256 MALLOC( aiocbpp, user_addr_t *, (nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK );
1257 if (aiocbpp == NULL) {
1258 goto err;
1259 }
1260
1261 /* copyin our aiocb pointers from list */
1262 result = copyin( aiocblist, aiocbpp,
1263 proc_is64bit(procp) ? (nent * sizeof(user64_addr_t))
1264 : (nent * sizeof(user32_addr_t)));
1265 if (result) {
1266 FREE( aiocbpp, M_TEMP );
1267 aiocbpp = NULL;
1268 goto err;
1269 }
1270
1271 /*
1272 * We depend on a list of user_addr_t's so we need to
1273 * munge and expand when these pointers came from a
1274 * 32-bit process
1275 */
1276 if (!proc_is64bit(procp)) {
1277 /* copy from last to first to deal with overlap */
1278 user32_addr_t *my_ptrp = ((user32_addr_t *)aiocbpp) + (nent - 1);
1279 user_addr_t *my_addrp = aiocbpp + (nent - 1);
1280
1281 for (i = 0; i < nent; i++, my_ptrp--, my_addrp--) {
1282 *my_addrp = (user_addr_t) (*my_ptrp);
1283 }
1284 }
1285
1286 err:
1287 return aiocbpp;
1288 }
1289
1290
1291 static int
1292 aio_copy_in_sigev(proc_t procp, user_addr_t sigp, struct user_sigevent *sigev)
1293 {
1294 int result = 0;
1295
1296 if (sigp == USER_ADDR_NULL) {
1297 goto out;
1298 }
1299
1300 /*
1301 * We need to munge aio_sigevent since it contains pointers.
1302 * Since we do not know if sigev_value is an int or a ptr we do
1303 * NOT cast the ptr to a user_addr_t. This means if we send
1304 * this info back to user space we need to remember sigev_value
1305 * was not expanded for the 32-bit case.
1306 *
1307 * Notes: This does NOT affect us since we don't support
1308 * sigev_value yet in the aio context.
1309 */
1310 if (proc_is64bit(procp)) {
1311 struct user64_sigevent sigevent64;
1312
1313 result = copyin( sigp, &sigevent64, sizeof(sigevent64));
1314 if (result == 0) {
1315 sigev->sigev_notify = sigevent64.sigev_notify;
1316 sigev->sigev_signo = sigevent64.sigev_signo;
1317 sigev->sigev_value.size_equivalent.sival_int = sigevent64.sigev_value.size_equivalent.sival_int;
1318 sigev->sigev_notify_function = sigevent64.sigev_notify_function;
1319 sigev->sigev_notify_attributes = sigevent64.sigev_notify_attributes;
1320 }
1321 } else {
1322 struct user32_sigevent sigevent32;
1323
1324 result = copyin( sigp, &sigevent32, sizeof(sigevent32));
1325 if (result == 0) {
1326 sigev->sigev_notify = sigevent32.sigev_notify;
1327 sigev->sigev_signo = sigevent32.sigev_signo;
1328 sigev->sigev_value.size_equivalent.sival_int = sigevent32.sigev_value.sival_int;
1329 sigev->sigev_notify_function = CAST_USER_ADDR_T(sigevent32.sigev_notify_function);
1330 sigev->sigev_notify_attributes = CAST_USER_ADDR_T(sigevent32.sigev_notify_attributes);
1331 }
1332 }
1333
1334 if (result != 0) {
1335 result = EAGAIN;
1336 }
1337
1338 out:
1339 return result;
1340 }
1341
1342 /*
1343 * validate user_sigevent. at this point we only support
1344 * sigev_notify equal to SIGEV_SIGNAL or SIGEV_NONE. this means
1345 * sigev_value, sigev_notify_function, and sigev_notify_attributes
1346 * are ignored, since SIGEV_THREAD is unsupported. This is consistent
1347  * with no [RTS] (Realtime Signals) option group support.
1348 */
1349 static int
1350 aio_sigev_validate( const struct user_sigevent *sigev )
1351 {
1352 switch (sigev->sigev_notify) {
1353 case SIGEV_SIGNAL:
1354 {
1355 int signum;
1356
1357 /* make sure we have a valid signal number */
1358 signum = sigev->sigev_signo;
1359 if (signum <= 0 || signum >= NSIG ||
1360 signum == SIGKILL || signum == SIGSTOP) {
1361 return EINVAL;
1362 }
1363 }
1364 break;
1365
1366 case SIGEV_NONE:
1367 break;
1368
1369 case SIGEV_THREAD:
1370 /* Unsupported [RTS] */
1371
1372 default:
1373 return EINVAL;
1374 }
1375
1376 return 0;
1377 }
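/*
 * Illustrative user-space sketch (standard <aio.h>/<signal.h> usage, not code
 * from this implementation): the only notification setups that pass the
 * validation above are SIGEV_NONE and SIGEV_SIGNAL with an ordinary,
 * catchable signal (e.g. SIGUSR1); SIGEV_THREAD is rejected.
 *
 *	#include <aio.h>
 *	#include <signal.h>
 *	#include <string.h>
 *
 *	static void
 *	request_signal_on_completion(struct aiocb *cb, int signo)
 *	{
 *		memset(&cb->aio_sigevent, 0, sizeof(cb->aio_sigevent));
 *		cb->aio_sigevent.sigev_notify = SIGEV_SIGNAL;
 *		cb->aio_sigevent.sigev_signo  = signo;
 *	}
 */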
1378
1379
1380 /*
1381 * aio_enqueue_work
1382 *
1383 * Queue up the entry on the aio asynchronous work queue in priority order
1384 * based on the relative priority of the request. We calculate the relative
1385 * priority using the nice value of the caller and the value
1386 *
1387 * Parameters: procp Process queueing the I/O
1388 * entryp The work queue entry being queued
1389 *
1390 * Returns: (void) No failure modes
1391 *
1392 * Notes: This function is used for both lio_listio and aio
1393 *
1394 * XXX: At some point, we may have to consider thread priority
1395 * rather than process priority, but we don't maintain the
1396 * adjusted priority for threads the POSIX way.
1397 *
1398 *
1399 * Called with proc locked.
1400 */
1401 static void
1402 aio_enqueue_work( proc_t procp, aio_workq_entry *entryp, int proc_locked)
1403 {
1404 #if 0
1405 aio_workq_entry *my_entryp; /* used for insertion sort */
1406 #endif /* 0 */
1407 aio_workq_t queue = aio_entry_workq(entryp);
1408
1409 if (proc_locked == 0) {
1410 aio_proc_lock(procp);
1411 }
1412
1413 ASSERT_AIO_PROC_LOCK_OWNED(procp);
1414
1415 /* Onto proc queue */
1416 TAILQ_INSERT_TAIL(&procp->p_aio_activeq, entryp, aio_proc_link);
1417 procp->p_aio_active_count++;
1418 procp->p_aio_total_count++;
1419
1420 /* And work queue */
1421 aio_workq_lock_spin(queue);
1422 aio_workq_add_entry_locked(queue, entryp);
1423 waitq_wakeup64_one(&queue->aioq_waitq, CAST_EVENT64_T(queue),
1424 THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
1425 aio_workq_unlock(queue);
1426
1427 if (proc_locked == 0) {
1428 aio_proc_unlock(procp);
1429 }
1430
1431 #if 0
1432 /*
1433 * Procedure:
1434 *
1435 * (1) The nice value is in the range PRIO_MIN..PRIO_MAX [-20..20]
1436 * (2) The normalized nice value is in the range 0..((2 * NZERO) - 1)
1437 * which is [0..39], with 0 not being used. In nice values, the
1438 * lower the nice value, the higher the priority.
1439  *	(3)	The normalized scheduling priority is the highest nice value
1440 * minus the current nice value. In I/O scheduling priority, the
1441 * higher the value the lower the priority, so it is the inverse
1442 * of the nice value (the higher the number, the higher the I/O
1443 * priority).
1444 * (4) From the normalized scheduling priority, we subtract the
1445  *		request priority to get the effective request priority;
1446  *		this means that requests are only capable of depressing their
1447  *		priority relative to other requests.
1448 */
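	/*
	 * Worked example of the scheme above (illustrative only, assuming the
	 * usual NZERO == 20): a caller at the default nice value p_nice == 0
	 * starts at priority ((2 * 20) - 1) - 0 == 39; an aio_reqprio of 10
	 * then depresses that to 29, and a negative aio_reqprio is clamped to
	 * 0, so a request can never raise its own priority.
	 */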
1449 entryp->priority = (((2 * NZERO) - 1) - procp->p_nice);
1450
1451	/* only permit depressing the priority */
1452 if (entryp->aiocb.aio_reqprio < 0) {
1453 entryp->aiocb.aio_reqprio = 0;
1454 }
1455 if (entryp->aiocb.aio_reqprio > 0) {
1456 entryp->priority -= entryp->aiocb.aio_reqprio;
1457 if (entryp->priority < 0) {
1458 entryp->priority = 0;
1459 }
1460 }
1461
1462 /* Insertion sort the entry; lowest ->priority to highest */
1463 TAILQ_FOREACH(my_entryp, &aio_anchor.aio_async_workq, aio_workq_link) {
1464 if (entryp->priority <= my_entryp->priority) {
1465 TAILQ_INSERT_BEFORE(my_entryp, entryp, aio_workq_link);
1466 break;
1467 }
1468 }
1469 if (my_entryp == NULL) {
1470 TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1471 }
1472 #endif /* 0 */
1473 }
1474
1475
1476 /*
1477 * lio_listio - initiate a list of IO requests. We process the list of
1478 * aiocbs either synchronously (mode == LIO_WAIT) or asynchronously
1479 * (mode == LIO_NOWAIT).
1480 *
1481 * The caller gets error and return status for each aiocb in the list
1482 * via aio_error and aio_return. We must keep completed requests until
1483 * released by the aio_return call.
1484 */
1485 int
1486 lio_listio(proc_t p, struct lio_listio_args *uap, int *retval )
1487 {
1488 int i;
1489 int call_result;
1490 int result;
1491 int old_count;
1492 aio_workq_entry **entryp_listp;
1493 user_addr_t *aiocbpp;
1494 struct user_sigevent aiosigev;
1495 aio_lio_context *lio_context;
1496 boolean_t free_context = FALSE;
1497 uint32_t *paio_offset;
1498 uint32_t *paio_nbytes;
1499
1500 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START,
1501 (int)p, uap->nent, uap->mode, 0, 0 );
1502
1503 entryp_listp = NULL;
1504 lio_context = NULL;
1505 aiocbpp = NULL;
1506 call_result = -1;
1507 *retval = -1;
1508 if (!(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT)) {
1509 call_result = EINVAL;
1510 goto ExitRoutine;
1511 }
1512
1513 if (uap->nent < 1 || uap->nent > AIO_LISTIO_MAX) {
1514 call_result = EINVAL;
1515 goto ExitRoutine;
1516 }
1517
1518 /*
1519 * allocate a list of aio_workq_entry pointers that we will use
1520 * to queue up all our requests at once while holding our lock.
1521 */
1522 MALLOC( entryp_listp, void *, (uap->nent * sizeof(aio_workq_entry *)), M_TEMP, M_WAITOK );
1523 if (entryp_listp == NULL) {
1524 call_result = EAGAIN;
1525 goto ExitRoutine;
1526 }
1527
1528 MALLOC( lio_context, aio_lio_context*, sizeof(aio_lio_context), M_TEMP, M_WAITOK );
1529 if (lio_context == NULL) {
1530 call_result = EAGAIN;
1531 goto ExitRoutine;
1532 }
1533
1534 #if DEBUG
1535 OSIncrementAtomic(&lio_contexts_alloced);
1536 #endif /* DEBUG */
1537
1538 free_context = TRUE;
1539 bzero(lio_context, sizeof(aio_lio_context));
1540
1541 aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent);
1542 if (aiocbpp == NULL) {
1543 call_result = EAGAIN;
1544 goto ExitRoutine;
1545 }
1546
1547 /*
1548 * Use sigevent passed in to lio_listio for each of our calls, but
1549 * only do completion notification after the last request completes.
1550 */
1551 bzero(&aiosigev, sizeof(aiosigev));
1552 /* Only copy in an sigev if the user supplied one */
1553 if (uap->sigp != USER_ADDR_NULL) {
1554 call_result = aio_copy_in_sigev(p, uap->sigp, &aiosigev);
1555 if (call_result) {
1556 goto ExitRoutine;
1557 }
1558 call_result = aio_sigev_validate(&aiosigev);
1559 if (call_result) {
1560 goto ExitRoutine;
1561 }
1562 }
1563
1564 /* process list of aio requests */
1565 free_context = FALSE;
1566 lio_context->io_issued = uap->nent;
1567 lio_context->io_waiter = uap->mode == LIO_WAIT ? 1 : 0; /* Should it be freed by last AIO */
1568 for (i = 0; i < uap->nent; i++) {
1569 user_addr_t my_aiocbp;
1570 aio_workq_entry *entryp;
1571
1572 *(entryp_listp + i) = NULL;
1573 my_aiocbp = *(aiocbpp + i);
1574
1575 /* NULL elements are legal so check for 'em */
1576 if (my_aiocbp == USER_ADDR_NULL) {
1577 aio_proc_lock_spin(p);
1578 lio_context->io_issued--;
1579 aio_proc_unlock(p);
1580 continue;
1581 }
1582
1583 /*
1584 * We use lio_context to mark IO requests for delayed completion
1585 * processing which means we wait until all IO requests in the
1586 * group have completed before we either return to the caller
1587 * when mode is LIO_WAIT or signal user when mode is LIO_NOWAIT.
1588 *
1589 * We use the address of the lio_context for this, since it is
1590 * unique in the address space.
1591 */
1592 result = lio_create_entry( p, my_aiocbp, lio_context, (entryp_listp + i));
1593 if (result != 0 && call_result == -1) {
1594 call_result = result;
1595 }
1596
1597 /* NULL elements are legal so check for 'em */
1598 entryp = *(entryp_listp + i);
1599 if (entryp == NULL) {
1600 aio_proc_lock_spin(p);
1601 lio_context->io_issued--;
1602 aio_proc_unlock(p);
1603 continue;
1604 }
1605
1606 if (uap->mode == LIO_NOWAIT) {
1607			/* Set signal handler, if any */
1608 entryp->aiocb.aio_sigevent = aiosigev;
1609 } else {
1610 /* flag that this thread blocks pending completion */
1611 entryp->flags |= AIO_LIO_NOTIFY;
1612 }
1613
1614 /* check our aio limits to throttle bad or rude user land behavior */
1615 old_count = aio_increment_total_count();
1616
1617 aio_proc_lock_spin(p);
1618 if (old_count >= aio_max_requests ||
1619 aio_get_process_count( entryp->procp ) >= aio_max_requests_per_process ||
1620 is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE) {
1621 lio_context->io_issued--;
1622 aio_proc_unlock(p);
1623
1624 aio_decrement_total_count();
1625
1626 if (call_result == -1) {
1627 call_result = EAGAIN;
1628 }
1629 aio_free_request(entryp);
1630 entryp_listp[i] = NULL;
1631 continue;
1632 }
1633
1634 lck_mtx_convert_spin(aio_proc_mutex(p));
1635 aio_enqueue_work(p, entryp, 1);
1636 aio_proc_unlock(p);
1637
1638 KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_START,
1639 (int)p, (int)entryp->uaiocbp, entryp->flags, entryp->aiocb.aio_fildes, 0 );
1640 paio_offset = (uint32_t*) &entryp->aiocb.aio_offset;
1641 paio_nbytes = (uint32_t*) &entryp->aiocb.aio_nbytes;
1642 KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_END,
1643 paio_offset[0], (sizeof(entryp->aiocb.aio_offset) == sizeof(uint64_t) ? paio_offset[1] : 0),
1644 paio_nbytes[0], (sizeof(entryp->aiocb.aio_nbytes) == sizeof(uint64_t) ? paio_nbytes[1] : 0),
1645 0 );
1646 }
1647
1648 aio_proc_lock_spin(p);
1649 switch (uap->mode) {
1650 case LIO_WAIT:
1651 while (lio_context->io_completed < lio_context->io_issued) {
1652 result = msleep(lio_context, aio_proc_mutex(p), PCATCH | PRIBIO | PSPIN, "lio_listio", 0);
1653
1654 /* If we were interrupted, fail out (even if all finished) */
1655 if (result != 0) {
1656 call_result = EINTR;
1657 lio_context->io_waiter = 0;
1658 break;
1659 }
1660 }
1661
1662 /* If all IOs have finished must free it */
1663 if (lio_context->io_completed == lio_context->io_issued) {
1664 free_context = TRUE;
1665 }
1666
1667 break;
1668
1669 case LIO_NOWAIT:
1670 /* If no IOs were issued must free it (rdar://problem/45717887) */
1671 if (lio_context->io_issued == 0) {
1672 free_context = TRUE;
1673 }
1674 break;
1675 }
1676 aio_proc_unlock(p);
1677
1678 /* call_result == -1 means we had no trouble queueing up requests */
1679 if (call_result == -1) {
1680 call_result = 0;
1681 *retval = 0;
1682 }
1683
1684 ExitRoutine:
1685 if (entryp_listp != NULL) {
1686 FREE( entryp_listp, M_TEMP );
1687 }
1688 if (aiocbpp != NULL) {
1689 FREE( aiocbpp, M_TEMP );
1690 }
1691 if (free_context) {
1692 free_lio_context(lio_context);
1693 }
1694
1695 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END,
1696 (int)p, call_result, 0, 0, 0 );
1697
1698 return call_result;
1699 } /* lio_listio */
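/*
 * Illustrative user-space sketch (standard <aio.h> usage, not code from this
 * implementation): submitting a small batch synchronously with LIO_WAIT.
 * Per-request status still comes from aio_error()/aio_return() once
 * lio_listio() returns, as the comment above notes.
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *	#include <string.h>
 *
 *	static int
 *	write_two_blocks(int fd, char *a, char *b, size_t len)
 *	{
 *		struct aiocb cb[2];
 *		struct aiocb *list[2] = { &cb[0], &cb[1] };
 *		int i;
 *
 *		memset(cb, 0, sizeof(cb));
 *		for (i = 0; i < 2; i++) {
 *			cb[i].aio_fildes     = fd;
 *			cb[i].aio_buf        = (i == 0) ? a : b;
 *			cb[i].aio_nbytes     = len;
 *			cb[i].aio_offset     = (off_t)i * (off_t)len;
 *			cb[i].aio_lio_opcode = LIO_WRITE;
 *		}
 *		if (lio_listio(LIO_WAIT, list, 2, NULL) != 0)
 *			return -1;
 *		for (i = 0; i < 2; i++)
 *			if (aio_return(&cb[i]) < 0)
 *				return -1;
 *		return 0;
 *	}
 */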
1700
1701
1702 /*
1703 * aio worker thread. this is where all the real work gets done.
1704  * we get a wakeup on the work queue's waitq (aioq_waitq) after new
1705  * work is queued up.
1706 */
1707 __attribute__((noreturn))
1708 static void
1709 aio_work_thread(void)
1710 {
1711 aio_workq_entry *entryp;
1712 int error;
1713 vm_map_t currentmap;
1714 vm_map_t oldmap = VM_MAP_NULL;
1715 task_t oldaiotask = TASK_NULL;
1716 struct uthread *uthreadp = NULL;
1717
1718 for (;;) {
1719 /*
1720 * returns with the entry ref'ed.
1721 * sleeps until work is available.
1722 */
1723 entryp = aio_get_some_work();
1724
1725 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START,
1726 (int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0 );
1727
1728 /*
1729 * Assume the target's address space identity for the duration
1730 * of the IO. Note: don't need to have the entryp locked,
1731 * because the proc and map don't change until it's freed.
1732 */
1733 currentmap = get_task_map((current_proc())->task );
1734 if (currentmap != entryp->aio_map) {
1735 uthreadp = (struct uthread *) get_bsdthread_info(current_thread());
1736 oldaiotask = uthreadp->uu_aio_task;
1737 uthreadp->uu_aio_task = entryp->procp->task;
1738 oldmap = vm_map_switch( entryp->aio_map );
1739 }
1740
1741 if ((entryp->flags & AIO_READ) != 0) {
1742 error = do_aio_read( entryp );
1743 } else if ((entryp->flags & AIO_WRITE) != 0) {
1744 error = do_aio_write( entryp );
1745 } else if ((entryp->flags & (AIO_FSYNC | AIO_DSYNC)) != 0) {
1746 error = do_aio_fsync( entryp );
1747 } else {
1748 printf( "%s - unknown aio request - flags 0x%02X \n",
1749 __FUNCTION__, entryp->flags );
1750 error = EINVAL;
1751 }
1752
1753 /* Restore old map */
1754 if (currentmap != entryp->aio_map) {
1755 (void) vm_map_switch( oldmap );
1756 uthreadp->uu_aio_task = oldaiotask;
1757 }
1758
1759 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END,
1760 (int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval,
1761 entryp->returnval, 0 );
1762
1763
1764 /* XXX COUNTS */
1765 aio_entry_lock_spin(entryp);
1766 entryp->errorval = error;
1767 aio_entry_unlock(entryp);
1768
1769 /* we're done with the IO request so pop it off the active queue and */
1770 /* push it on the done queue */
1771 aio_proc_lock(entryp->procp);
1772 aio_proc_move_done_locked(entryp->procp, entryp);
1773 aio_proc_unlock(entryp->procp);
1774
1775 OSDecrementAtomic(&aio_anchor.aio_inflight_count);
1776
1777 /* remove our reference to the user land map. */
1778 if (VM_MAP_NULL != entryp->aio_map) {
1779 vm_map_t my_map;
1780
1781 my_map = entryp->aio_map;
1782 entryp->aio_map = VM_MAP_NULL;
1783 vm_map_deallocate( my_map );
1784 }
1785
1786 /* Provide notifications */
1787 do_aio_completion( entryp );
1788
1789 /* Will free if needed */
1790 aio_entry_unref(entryp);
1791 } /* for ( ;; ) */
1792
1793 /* NOT REACHED */
1794 } /* aio_work_thread */
1795
1796
1797 /*
1798 * aio_get_some_work - get the next async IO request that is ready to be executed.
1799 * aio_fsync complicates matters a bit since we cannot do the fsync until all async
1800 * IO requests at the time the aio_fsync call came in have completed.
1801  * NOTE - no locks are held on entry; the work queue lock is taken and dropped internally.
1802 */
1803 static aio_workq_entry *
1804 aio_get_some_work( void )
1805 {
1806 aio_workq_entry *entryp = NULL;
1807 aio_workq_t queue = NULL;
1808
1809 /* Just one queue for the moment. In the future there will be many. */
1810 queue = &aio_anchor.aio_async_workqs[0];
1811 aio_workq_lock_spin(queue);
1812 if (queue->aioq_count == 0) {
1813 goto nowork;
1814 }
1815
1816 /*
1817 * Hold the queue lock.
1818 *
1819 * Pop some work off the work queue; always start each pass
1820 * with the queue lock held.
1821 */
1822 for (;;) {
1823 /*
1824 * Pull off of the work queue. Once it's off, it can't be cancelled,
1825 * so we can take our ref once we drop the queue lock.
1826 */
1827 entryp = TAILQ_FIRST(&queue->aioq_entries);
1828
1829 /*
1830 * If there's no work or only fsyncs that need delay, go to sleep
1831 * and then start anew from aio_work_thread
1832 */
1833 if (entryp == NULL) {
1834 goto nowork;
1835 }
1836
1837 aio_workq_remove_entry_locked(queue, entryp);
1838
1839 aio_workq_unlock(queue);
1840
1841 /*
1842 * Check if it's an fsync that must be delayed. No need to lock the entry;
1843 * that flag would have been set at initialization.
1844 */
1845 if ((entryp->flags & AIO_FSYNC) != 0) {
1846 /*
1847 * Check for unfinished operations on the same file
1848 * in this proc's queue.
1849 */
1850 aio_proc_lock_spin(entryp->procp);
1851 if (aio_delay_fsync_request( entryp )) {
1852 /* It needs to be delayed. Put it back on the end of the work queue */
1853 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) | DBG_FUNC_NONE,
1854 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1855
1856 aio_proc_unlock(entryp->procp);
1857
1858 aio_workq_lock_spin(queue);
1859 aio_workq_add_entry_locked(queue, entryp);
1860 continue;
1861 }
1862 aio_proc_unlock(entryp->procp);
1863 }
1864
1865 break;
1866 }
1867
1868 aio_entry_ref(entryp);
1869
1870 OSIncrementAtomic(&aio_anchor.aio_inflight_count);
1871 return entryp;
1872
1873 nowork:
1874 /* We will wake up when someone enqueues something */
1875 waitq_assert_wait64(&queue->aioq_waitq, CAST_EVENT64_T(queue), THREAD_UNINT, 0);
1876 aio_workq_unlock(queue);
1877 thread_block((thread_continue_t)aio_work_thread );
1878
1879 // notreached
1880 return NULL;
1881 }
1882
1883 /*
1884 * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed.
1885 * A big, simple hammer: only send it off if it's the most recently filed IO which has
1886 * not been completed.
1887 */
1888 static boolean_t
1889 aio_delay_fsync_request( aio_workq_entry *entryp )
1890 {
1891 if (entryp == TAILQ_FIRST(&entryp->procp->p_aio_activeq)) {
1892 return FALSE;
1893 }
1894
1895 return TRUE;
1896 } /* aio_delay_fsync_request */
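
/*
 * Illustrative only (not part of the kernel source): from userland, the
 * delay policy above means an aio_fsync() queued after outstanding IO for
 * the process will be requeued until that earlier IO has completed. A
 * minimal sketch; the caller is assumed to supply a filled-in write aiocb.
 */
#if 0   /* userland example, not kernel code */
#include <aio.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

int
ordered_fsync_example(int fd, struct aiocb *write_cb)
{
        struct aiocb sync_cb = { 0 };
        sync_cb.aio_fildes = fd;

        if (aio_write(write_cb) != 0) {         /* outstanding write ...   */
                return -1;
        }
        if (aio_fsync(O_SYNC, &sync_cb) != 0) { /* ... then the fsync      */
                return -1;
        }
        /* The kernel keeps requeueing the fsync entry until the write
         * (and any older IO for this process) has completed. */
        while (aio_error(&sync_cb) == EINPROGRESS) {
                usleep(1000);
        }
        return (int)aio_return(&sync_cb);
}
#endif
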
1897
1898 static aio_workq_entry *
1899 aio_create_queue_entry(proc_t procp, user_addr_t aiocbp, void *group_tag, int kindOfIO)
1900 {
1901 aio_workq_entry *entryp;
1902 int result = 0;
1903
1904 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1905 if (entryp == NULL) {
1906 result = EAGAIN;
1907 goto error_exit;
1908 }
1909
1910 bzero( entryp, sizeof(*entryp));
1911
1912 /* fill in the rest of the aio_workq_entry */
1913 entryp->procp = procp;
1914 entryp->uaiocbp = aiocbp;
1915 entryp->flags |= kindOfIO;
1916 entryp->group_tag = group_tag;
1917 entryp->aio_map = VM_MAP_NULL;
1918 entryp->aio_refcount = 0;
1919
1920 if (proc_is64bit(procp)) {
1921 struct user64_aiocb aiocb64;
1922
1923 result = copyin( aiocbp, &aiocb64, sizeof(aiocb64));
1924 if (result == 0) {
1925 do_munge_aiocb_user64_to_user(&aiocb64, &entryp->aiocb);
1926 }
1927 } else {
1928 struct user32_aiocb aiocb32;
1929
1930 result = copyin( aiocbp, &aiocb32, sizeof(aiocb32));
1931 if (result == 0) {
1932 do_munge_aiocb_user32_to_user( &aiocb32, &entryp->aiocb );
1933 }
1934 }
1935
1936 if (result != 0) {
1937 result = EAGAIN;
1938 goto error_exit;
1939 }
1940
1941 /* get a reference to the user land map in order to keep it around */
1942 entryp->aio_map = get_task_map( procp->task );
1943 vm_map_reference( entryp->aio_map );
1944
1945 /* do some more validation on the aiocb and embedded file descriptor */
1946 result = aio_validate( entryp );
1947 if (result != 0) {
1948 goto error_exit_with_ref;
1949 }
1950
1951 /* get a reference on the current_thread, which is passed in vfs_context. */
1952 entryp->thread = current_thread();
1953 thread_reference( entryp->thread );
1954 return entryp;
1955
1956 error_exit_with_ref:
1957 if (VM_MAP_NULL != entryp->aio_map) {
1958 vm_map_deallocate( entryp->aio_map );
1959 }
1960 error_exit:
1961 if (result && entryp != NULL) {
1962 zfree( aio_workq_zonep, entryp );
1963 entryp = NULL;
1964 }
1965
1966 return entryp;
1967 }
1968
1969
1970 /*
1971 * aio_queue_async_request - queue up an async IO request on our work queue then
1972 * wake up one of our worker threads to do the actual work. We get a reference
1973 * to our caller's user land map in order to keep it around while we are
1974 * processing the request.
1975 */
1976 static int
1977 aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO )
1978 {
1979 aio_workq_entry *entryp;
1980 int result;
1981 int old_count;
1982 uint32_t *paio_offset;
1983 uint32_t *paio_nbytes;
1984
1985 old_count = aio_increment_total_count();
1986 if (old_count >= aio_max_requests) {
1987 result = EAGAIN;
1988 goto error_noalloc;
1989 }
1990
1991 entryp = aio_create_queue_entry( procp, aiocbp, 0, kindOfIO);
1992 if (entryp == NULL) {
1993 result = EAGAIN;
1994 goto error_noalloc;
1995 }
1996
1997
1998 aio_proc_lock_spin(procp);
1999
2000 if (is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE) {
2001 result = EAGAIN;
2002 goto error_exit;
2003 }
2004
2005 /* check our aio limits to throttle bad or rude user land behavior */
2006 if (aio_get_process_count( procp ) >= aio_max_requests_per_process) {
2007 printf("aio_queue_async_request(): too many in flight for proc: %d.\n", procp->p_aio_total_count);
2008 result = EAGAIN;
2009 goto error_exit;
2010 }
2011
2012 /* Add the IO to proc and work queues, wake up threads as appropriate */
2013 lck_mtx_convert_spin(aio_proc_mutex(procp));
2014 aio_enqueue_work(procp, entryp, 1);
2015
2016 aio_proc_unlock(procp);
2017
2018 paio_offset = (uint32_t*) &entryp->aiocb.aio_offset;
2019 paio_nbytes = (uint32_t*) &entryp->aiocb.aio_nbytes;
2020 KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_START,
2021 (int)procp, (int)aiocbp, entryp->flags, entryp->aiocb.aio_fildes, 0 );
2022 KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_END,
2023 paio_offset[0], (sizeof(entryp->aiocb.aio_offset) == sizeof(uint64_t) ? paio_offset[1] : 0),
2024 paio_nbytes[0], (sizeof(entryp->aiocb.aio_nbytes) == sizeof(uint64_t) ? paio_nbytes[1] : 0),
2025 0 );
2026
2027 return 0;
2028
2029 error_exit:
2030 /*
2031 * This entry has not been queued up so no worries about
2032 * unlocked state and aio_map
2033 */
2034 aio_proc_unlock(procp);
2035 aio_free_request(entryp);
2036
2037 error_noalloc:
2038 aio_decrement_total_count();
2039
2040 return result;
2041 } /* aio_queue_async_request */
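
/*
 * Illustrative only (not part of the kernel source): a minimal userland
 * sketch of handling the EAGAIN that the queueing path above returns when
 * the global or per-process AIO limits are exceeded (or when the same
 * aiocb is submitted twice). Falling back to a synchronous pwrite() is one
 * reasonable strategy; retrying after reaping completions is another.
 */
#if 0   /* userland example, not kernel code */
#include <aio.h>
#include <errno.h>
#include <unistd.h>

ssize_t
submit_or_fallback(int fd, struct aiocb *cb)
{
        if (aio_write(cb) == 0) {
                return 0;                       /* queued asynchronously */
        }
        if (errno == EAGAIN) {
                /* Too many requests in flight; do it synchronously instead. */
                return pwrite(fd, (const void *)cb->aio_buf,
                    cb->aio_nbytes, cb->aio_offset);
        }
        return -1;                              /* some other failure */
}
#endif
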
2042
2043
2044 /*
2045 * lio_create_entry
2046 *
2047 * Allocate an aio_workq_entry and fill it in. If all goes well return 0
2048 * and pass the aio_workq_entry pointer back to our caller.
2049 *
2050 * Parameters: procp The process making the request
2051 * aiocbp The aio context buffer pointer
2052 * group_tag The group tag used to indicate a
2053 * group of operations has completed
2054 * entrypp Pointer to the pointer to receive the
2055 * address of the created aio_workq_entry
2056 *
2057 * Returns: 0 Successfully created
2058 * EAGAIN Try again (usually resource shortage)
2059 *
2060 *
2061 * Notes: We get a reference to our caller's user land map in order
2062 * to keep it around while we are processing the request.
2063 *
2064 * lio_listio calls behave differently at completion: they defer
2065 * completion notification until all async IO requests in the group
2066 * have completed. We use group_tag to tag IO requests that behave
2067 * in this delayed-notification manner.
2068 *
2069 * All synchronous operations are considered to not have a
2070 * signal routine associated with them (sigp == USER_ADDR_NULL).
2071 */
2072 static int
2073 lio_create_entry(proc_t procp, user_addr_t aiocbp, void *group_tag,
2074 aio_workq_entry **entrypp )
2075 {
2076 aio_workq_entry *entryp;
2077 int result;
2078
2079 entryp = aio_create_queue_entry( procp, aiocbp, group_tag, AIO_LIO);
2080 if (entryp == NULL) {
2081 result = EAGAIN;
2082 goto error_exit;
2083 }
2084
2085 /*
2086 * Look for lio_listio LIO_NOP requests and ignore them; this is
2087 * not really an error, but we need to free our aio_workq_entry.
2088 */
2089 if (entryp->aiocb.aio_lio_opcode == LIO_NOP) {
2090 result = 0;
2091 goto error_exit;
2092 }
2093
2094 *entrypp = entryp;
2095 return 0;
2096
2097 error_exit:
2098
2099 if (entryp != NULL) {
2100 /*
2101 * This entry has not been queued up so no worries about
2102 * unlocked state and aio_map
2103 */
2104 aio_free_request(entryp);
2105 }
2106
2107 return result;
2108 } /* lio_create_entry */
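
/*
 * Illustrative only (not part of the kernel source): LIO_NOP entries are
 * accepted by lio_listio() but never queued (see the check above). The
 * sketch below submits a list whose middle slot is a placeholder; the
 * read/write aiocbs are assumed to be filled in by the caller.
 */
#if 0   /* userland example, not kernel code */
#include <aio.h>
#include <stddef.h>

int
lio_nop_example(struct aiocb *read_cb, struct aiocb *write_cb)
{
        struct aiocb nop = { 0 };
        nop.aio_lio_opcode = LIO_NOP;   /* counted in the list, but ignored */

        read_cb->aio_lio_opcode = LIO_READ;
        write_cb->aio_lio_opcode = LIO_WRITE;

        struct aiocb *list[3] = { read_cb, &nop, write_cb };

        /* LIO_NOWAIT returns immediately; completions are reaped later
         * with aio_error()/aio_return() on the non-NOP entries. */
        return lio_listio(LIO_NOWAIT, list, 3, NULL);
}
#endif
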
2109
2110
2111 /*
2112 * aio_free_request - remove our reference on the user land map and
2113 * free the work queue entry resources. The entry is off all lists
2114 * and has zero refcount, so no one can have a pointer to it.
2115 */
2116
2117 static int
2118 aio_free_request(aio_workq_entry *entryp)
2119 {
2120 /* remove our reference to the user land map. */
2121 if (VM_MAP_NULL != entryp->aio_map) {
2122 vm_map_deallocate(entryp->aio_map);
2123 }
2124
2125 /* remove our reference to thread which enqueued the request */
2126 if (NULL != entryp->thread) {
2127 thread_deallocate( entryp->thread );
2128 }
2129
2130 entryp->aio_refcount = -1; /* A bit of poisoning in case of bad refcounting. */
2131
2132 zfree( aio_workq_zonep, entryp );
2133
2134 return 0;
2135 } /* aio_free_request */
2136
2137
2138 /*
2139 * aio_validate
2140 *
2141 * validate the aiocb passed in by one of the aio syscalls.
2142 */
2143 static int
2144 aio_validate( aio_workq_entry *entryp )
2145 {
2146 struct fileproc *fp;
2147 int flag;
2148 int result;
2149
2150 result = 0;
2151
2152 if ((entryp->flags & AIO_LIO) != 0) {
2153 if (entryp->aiocb.aio_lio_opcode == LIO_READ) {
2154 entryp->flags |= AIO_READ;
2155 } else if (entryp->aiocb.aio_lio_opcode == LIO_WRITE) {
2156 entryp->flags |= AIO_WRITE;
2157 } else if (entryp->aiocb.aio_lio_opcode == LIO_NOP) {
2158 return 0;
2159 } else {
2160 return EINVAL;
2161 }
2162 }
2163
2164 flag = FREAD;
2165 if ((entryp->flags & (AIO_WRITE | AIO_FSYNC | AIO_DSYNC)) != 0) {
2166 flag = FWRITE;
2167 }
2168
2169 if ((entryp->flags & (AIO_READ | AIO_WRITE)) != 0) {
2170 if (entryp->aiocb.aio_nbytes > INT_MAX ||
2171 entryp->aiocb.aio_buf == USER_ADDR_NULL ||
2172 entryp->aiocb.aio_offset < 0) {
2173 return EINVAL;
2174 }
2175 }
2176
2177 result = aio_sigev_validate(&entryp->aiocb.aio_sigevent);
2178 if (result) {
2179 return result;
2180 }
2181
2182 /* validate the file descriptor and that the file was opened
2183 * for the appropriate read / write access.
2184 */
2185 proc_fdlock(entryp->procp);
2186
2187 result = fp_lookup( entryp->procp, entryp->aiocb.aio_fildes, &fp, 1);
2188 if (result == 0) {
2189 if ((fp->f_fglob->fg_flag & flag) == 0) {
2190 /* we don't have read or write access */
2191 result = EBADF;
2192 } else if (FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_VNODE) {
2193 /* this is not a file */
2194 result = ESPIPE;
2195 } else {
2196 fp->f_flags |= FP_AIOISSUED;
2197 }
2198
2199 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 1);
2200 } else {
2201 result = EBADF;
2202 }
2203
2204 proc_fdunlock(entryp->procp);
2205
2206 return result;
2207 } /* aio_validate */
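
/*
 * Illustrative only (not part of the kernel source): userland requests
 * that aio_validate() above rejects. A minimal sketch; the libc wrappers
 * report the error values through errno, and "/etc/hosts" is only used
 * as a handy read-only file.
 */
#if 0   /* userland example, not kernel code */
#include <aio.h>
#include <fcntl.h>
#include <unistd.h>

void
validate_examples(void)
{
        static char buf[16];
        struct aiocb cb = { 0 };
        int pipefd[2];

        /* EINVAL: no buffer / negative offset. */
        cb.aio_fildes = STDOUT_FILENO;
        cb.aio_buf = NULL;
        cb.aio_nbytes = sizeof(buf);
        cb.aio_offset = -1;
        (void)aio_write(&cb);           /* fails, errno == EINVAL */

        /* EBADF: writing to a descriptor opened read-only. */
        int rofd = open("/etc/hosts", O_RDONLY);
        cb.aio_fildes = rofd;
        cb.aio_buf = buf;
        cb.aio_offset = 0;
        (void)aio_write(&cb);           /* fails, errno == EBADF */

        /* ESPIPE: AIO is only supported on vnode-backed descriptors. */
        (void)pipe(pipefd);
        cb.aio_fildes = pipefd[1];
        (void)aio_write(&cb);           /* fails, errno == ESPIPE */

        close(rofd);
        close(pipefd[0]);
        close(pipefd[1]);
}
#endif
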
2208
2209 static int
2210 aio_increment_total_count()
2211 {
2212 return OSIncrementAtomic(&aio_anchor.aio_total_count);
2213 }
2214
2215 static int
2216 aio_decrement_total_count()
2217 {
2218 int old = OSDecrementAtomic(&aio_anchor.aio_total_count);
2219 if (old <= 0) {
2220 panic("Negative total AIO count!\n");
2221 }
2222
2223 return old;
2224 }
2225
2226 static int
2227 aio_get_process_count(proc_t procp )
2228 {
2229 return procp->p_aio_total_count;
2230 } /* aio_get_process_count */
2231
2232 static int
2233 aio_get_all_queues_count( void )
2234 {
2235 return aio_anchor.aio_total_count;
2236 } /* aio_get_all_queues_count */
2237
2238
2239 /*
2240 * do_aio_completion. Handle async IO completion.
2241 */
2242 static void
2243 do_aio_completion( aio_workq_entry *entryp )
2244 {
2245 boolean_t lastLioCompleted = FALSE;
2246 aio_lio_context *lio_context = NULL;
2247 int waiter = 0;
2248
2249 lio_context = (aio_lio_context *)entryp->group_tag;
2250
2251 if (lio_context != NULL) {
2252 aio_proc_lock_spin(entryp->procp);
2253
2254 /* Account for this I/O completing. */
2255 lio_context->io_completed++;
2256
2257 /* Are we done with this lio context? */
2258 if (lio_context->io_issued == lio_context->io_completed) {
2259 lastLioCompleted = TRUE;
2260 }
2261
2262 waiter = lio_context->io_waiter;
2263
2264 /* explicit wakeup of lio_listio() waiting in LIO_WAIT */
2265 if ((entryp->flags & AIO_LIO_NOTIFY) && (lastLioCompleted) && (waiter != 0)) {
2266 /* wake up the waiter */
2267 wakeup(lio_context);
2268 }
2269
2270 aio_proc_unlock(entryp->procp);
2271 }
2272
2273 if (entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
2274 (entryp->flags & AIO_DISABLE) == 0) {
2275 boolean_t performSignal = FALSE;
2276 if (lio_context == NULL) {
2277 performSignal = TRUE;
2278 } else {
2279 /*
2280 * If this was the last request in the group and a signal
2281 * is desired, send one.
2282 */
2283 performSignal = lastLioCompleted;
2284 }
2285
2286 if (performSignal) {
2287 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) | DBG_FUNC_NONE,
2288 (int)entryp->procp, (int)entryp->uaiocbp,
2289 entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 );
2290
2291 psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );
2292 }
2293 }
2294
2295 if ((entryp->flags & AIO_EXIT_WAIT) && (entryp->flags & AIO_CLOSE_WAIT)) {
2296 panic("Close and exit flags set at the same time\n");
2297 }
2298
2299 /*
2300 * need to handle case where a process is trying to exit, exec, or
2301 * close and is currently waiting for active aio requests to complete.
2302 * If AIO_CLEANUP_WAIT is set then we need to look to see if there are any
2303 * other requests in the active queue for this process. If there are
2304 * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel.
2305 * If there are some still active then do nothing - we only want to
2306 * wakeup when all active aio requests for the process are complete.
2307 *
2308 * Don't need to lock the entry or proc to check the cleanup flag. It can only be
2309 * set for cancellation, while the entryp is still on a proc list; now it's
2310 * off, so that flag is already set if it's going to be.
2311 */
2312 if ((entryp->flags & AIO_EXIT_WAIT) != 0) {
2313 int active_requests;
2314
2315 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
2316 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
2317
2318 aio_proc_lock_spin(entryp->procp);
2319 active_requests = aio_active_requests_for_process( entryp->procp );
2320 if (active_requests < 1) {
2321 /*
2322 * no active aio requests for this process, continue exiting. In this
2323 * case, there should be no one else waiting on the proc in AIO...
2324 */
2325 wakeup_one((caddr_t)&entryp->procp->AIO_CLEANUP_SLEEP_CHAN);
2326 aio_proc_unlock(entryp->procp);
2327
2328 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
2329 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
2330 } else {
2331 aio_proc_unlock(entryp->procp);
2332 }
2333 }
2334
2335 if ((entryp->flags & AIO_CLOSE_WAIT) != 0) {
2336 int active_requests;
2337
2338 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
2339 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
2340
2341 aio_proc_lock_spin(entryp->procp);
2342 active_requests = aio_proc_active_requests_for_file( entryp->procp, entryp->aiocb.aio_fildes);
2343 if (active_requests < 1) {
2344 /* Can't wakeup_one(); multiple closes might be in progress. */
2345 wakeup(&entryp->procp->AIO_CLEANUP_SLEEP_CHAN);
2346 aio_proc_unlock(entryp->procp);
2347
2348 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
2349 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
2350 } else {
2351 aio_proc_unlock(entryp->procp);
2352 }
2353 }
2354 /*
2355 * A thread in aio_suspend() wants to know about completed IOs. If it checked
2356 * the done list before we moved our AIO there, then it already asserted its wait,
2357 * and we can wake it up without holding the lock. If it checked the list after
2358 * we did our move, then it has already seen the AIO that we moved. Either way, we
2359 * can do our wakeup without holding the lock.
2360 */
2361 wakeup((caddr_t) &entryp->procp->AIO_SUSPEND_SLEEP_CHAN );
2362 KERNEL_DEBUG((BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE,
2363 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
2364
2365 /*
2366 * free the LIO context if the last lio completed and no thread is
2367 * waiting
2368 */
2369 if (lastLioCompleted && (waiter == 0)) {
2370 free_lio_context(lio_context);
2371 }
2372 } /* do_aio_completion */
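
/*
 * Illustrative only (not part of the kernel source): a minimal userland
 * sketch of the SIGEV_SIGNAL notification that do_aio_completion() above
 * delivers via psignal(). Assumes SIGUSR1 is free for the example and
 * that "/etc/hosts" is readable.
 */
#if 0   /* userland example, not kernel code */
#include <aio.h>
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static volatile sig_atomic_t aio_done;

static void
on_aio_signal(int signo)
{
        (void)signo;
        aio_done = 1;                   /* async-signal-safe: just set a flag */
}

int
signal_notify_example(void)
{
        static char buf[512];
        struct sigaction sa;
        sigset_t block, oldmask;

        memset(&sa, 0, sizeof(sa));
        sa.sa_handler = on_aio_signal;
        sigaction(SIGUSR1, &sa, NULL);

        /* Block SIGUSR1 so the completion signal can't race the wait below. */
        sigemptyset(&block);
        sigaddset(&block, SIGUSR1);
        sigprocmask(SIG_BLOCK, &block, &oldmask);

        int fd = open("/etc/hosts", O_RDONLY);
        if (fd < 0) {
                return -1;
        }

        struct aiocb cb = { 0 };
        cb.aio_fildes = fd;
        cb.aio_buf = buf;
        cb.aio_nbytes = sizeof(buf);
        cb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
        cb.aio_sigevent.sigev_signo = SIGUSR1;

        if (aio_read(&cb) != 0) {
                close(fd);
                return -1;
        }

        while (!aio_done) {
                sigsuspend(&oldmask);   /* atomically unblock SIGUSR1 and wait */
        }
        sigprocmask(SIG_SETMASK, &oldmask, NULL);

        printf("read %zd bytes\n", aio_return(&cb));
        close(fd);
        return 0;
}
#endif
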
2373
2374
2375 /*
2376 * do_aio_read
2377 */
2378 static int
2379 do_aio_read( aio_workq_entry *entryp )
2380 {
2381 struct fileproc *fp;
2382 int error;
2383 struct vfs_context context;
2384
2385 if ((error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp, 0))) {
2386 return error;
2387 }
2388 if ((fp->f_fglob->fg_flag & FREAD) == 0) {
2389 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2390 return EBADF;
2391 }
2392
2393 context.vc_thread = entryp->thread; /* XXX */
2394 context.vc_ucred = fp->f_fglob->fg_cred;
2395
2396 error = dofileread(&context, fp,
2397 entryp->aiocb.aio_buf,
2398 entryp->aiocb.aio_nbytes,
2399 entryp->aiocb.aio_offset, FOF_OFFSET,
2400 &entryp->returnval);
2401 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2402
2403 return error;
2404 } /* do_aio_read */
2405
2406
2407 /*
2408 * do_aio_write
2409 */
2410 static int
2411 do_aio_write( aio_workq_entry *entryp )
2412 {
2413 struct fileproc *fp;
2414 int error, flags;
2415 struct vfs_context context;
2416
2417 if ((error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp, 0))) {
2418 return error;
2419 }
2420 if ((fp->f_fglob->fg_flag & FWRITE) == 0) {
2421 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2422 return EBADF;
2423 }
2424
2425 flags = FOF_PCRED;
2426 if ((fp->f_fglob->fg_flag & O_APPEND) == 0) {
2427 flags |= FOF_OFFSET;
2428 }
2429
2430 context.vc_thread = entryp->thread; /* XXX */
2431 context.vc_ucred = fp->f_fglob->fg_cred;
2432
2433 /* NB: tell dofilewrite the offset, and to use the proc cred */
2434 error = dofilewrite(&context,
2435 fp,
2436 entryp->aiocb.aio_buf,
2437 entryp->aiocb.aio_nbytes,
2438 entryp->aiocb.aio_offset,
2439 flags,
2440 &entryp->returnval);
2441
2442 if (entryp->returnval) {
2443 fp_drop_written(entryp->procp, entryp->aiocb.aio_fildes, fp);
2444 } else {
2445 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2446 }
2447
2448 return error;
2449 } /* do_aio_write */
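
/*
 * Illustrative only (not part of the kernel source): a minimal userland
 * sketch of the O_APPEND behavior in do_aio_write() above. When the
 * descriptor was opened with O_APPEND, FOF_OFFSET is not passed down, so
 * aio_offset is ignored and the write lands at end of file. The path
 * argument is supplied by the caller.
 */
#if 0   /* userland example, not kernel code */
#include <aio.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

int
append_write_example(const char *path)
{
        static char msg[] = "appended record\n";
        int fd = open(path, O_WRONLY | O_CREAT | O_APPEND, 0644);
        if (fd < 0) {
                return -1;
        }

        struct aiocb cb = { 0 };
        cb.aio_fildes = fd;
        cb.aio_buf = msg;
        cb.aio_nbytes = sizeof(msg) - 1;
        cb.aio_offset = 0;                     /* ignored for O_APPEND fds */

        if (aio_write(&cb) != 0) {
                close(fd);
                return -1;
        }
        while (aio_error(&cb) == EINPROGRESS) {
                usleep(1000);
        }
        close(fd);
        return (int)aio_return(&cb);
}
#endif
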
2450
2451
2452 /*
2453 * aio_active_requests_for_process - return number of active async IO
2454 * requests for the given process.
2455 */
2456 static int
2457 aio_active_requests_for_process(proc_t procp )
2458 {
2459 return procp->p_aio_active_count;
2460 } /* aio_active_requests_for_process */
2461
2462 /*
2463 * Called with the proc locked.
2464 */
2465 static int
2466 aio_proc_active_requests_for_file(proc_t procp, int fd)
2467 {
2468 int count = 0;
2469 aio_workq_entry *entryp;
2470 TAILQ_FOREACH(entryp, &procp->p_aio_activeq, aio_proc_link) {
2471 if (entryp->aiocb.aio_fildes == fd) {
2472 count++;
2473 }
2474 }
2475
2476 return count;
2477 } /* aio_proc_active_requests_for_file */
2478
2479
2480
2481 /*
2482 * do_aio_fsync
2483 */
2484 static int
2485 do_aio_fsync( aio_workq_entry *entryp )
2486 {
2487 struct vfs_context context;
2488 struct vnode *vp;
2489 struct fileproc *fp;
2490 int sync_flag;
2491 int error;
2492
2493 /*
2494 * We are never called unless either AIO_FSYNC or AIO_DSYNC are set.
2495 *
2496 * If AIO_DSYNC is set, we can tell the lower layers that it is OK
2497 * to mark for update the metadata not strictly necessary for data
2498 * retrieval, rather than forcing it to disk.
2499 *
2500 * If AIO_FSYNC is set, we also have to wait until metadata not strictly
2501 * necessary for data retrieval is committed to stable storage (e.g.
2502 * atime, mtime, ctime, etc.).
2503 *
2504 * Metadata necessary for data retrieval must be committed to stable
2505 * storage in either case (file length, etc.).
2506 */
2507 if (entryp->flags & AIO_FSYNC) {
2508 sync_flag = MNT_WAIT;
2509 } else {
2510 sync_flag = MNT_DWAIT;
2511 }
2512
2513 error = fp_getfvp( entryp->procp, entryp->aiocb.aio_fildes, &fp, &vp);
2514 if (error == 0) {
2515 if ((error = vnode_getwithref(vp))) {
2516 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2517 entryp->returnval = -1;
2518 return error;
2519 }
2520 context.vc_thread = current_thread();
2521 context.vc_ucred = fp->f_fglob->fg_cred;
2522
2523 error = VNOP_FSYNC( vp, sync_flag, &context);
2524
2525 (void)vnode_put(vp);
2526
2527 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2528 }
2529 if (error != 0) {
2530 entryp->returnval = -1;
2531 }
2532
2533 return error;
2534 } /* do_aio_fsync */
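
/*
 * Illustrative only (not part of the kernel source): a minimal userland
 * sketch of the two aio_fsync() flavors handled above. O_SYNC maps to the
 * AIO_FSYNC/MNT_WAIT path (data plus all metadata); O_DSYNC maps to the
 * AIO_DSYNC/MNT_DWAIT path (data plus only the metadata needed to read it
 * back). The descriptor is supplied by the caller.
 */
#if 0   /* userland example, not kernel code */
#include <aio.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

int
fsync_example(int fd)
{
        struct aiocb cb = { 0 };
        cb.aio_fildes = fd;

        /* Full fsync: also waits for timestamps and other metadata. */
        if (aio_fsync(O_SYNC, &cb) != 0) {
                return -1;
        }
        while (aio_error(&cb) == EINPROGRESS) {
                usleep(1000);           /* delayed behind earlier queued IO */
        }
        return (int)aio_return(&cb);
}
#endif
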
2535
2536
2537 /*
2538 * is_already_queued - runs through our queues to see if the given
2539 * aiocbp / process is there. Returns TRUE if there is a match
2540 * on any of our aio queues.
2541 *
2542 * Called with proc aio lock held (can be held spin)
2543 */
2544 static boolean_t
2545 is_already_queued(proc_t procp,
2546 user_addr_t aiocbp )
2547 {
2548 aio_workq_entry *entryp;
2549 boolean_t result;
2550
2551 result = FALSE;
2552
2553 /* look for matches on our queue of async IO requests that have completed */
2554 TAILQ_FOREACH( entryp, &procp->p_aio_doneq, aio_proc_link ) {
2555 if (aiocbp == entryp->uaiocbp) {
2556 result = TRUE;
2557 goto ExitThisRoutine;
2558 }
2559 }
2560
2561 /* look for matches on our queue of active async IO requests */
2562 TAILQ_FOREACH( entryp, &procp->p_aio_activeq, aio_proc_link ) {
2563 if (aiocbp == entryp->uaiocbp) {
2564 result = TRUE;
2565 goto ExitThisRoutine;
2566 }
2567 }
2568
2569 ExitThisRoutine:
2570 return result;
2571 } /* is_already_queued */
2572
2573
2574 static void
2575 free_lio_context(aio_lio_context* context)
2576 {
2577 #if DEBUG
2578 OSDecrementAtomic(&lio_contexts_alloced);
2579 #endif /* DEBUG */
2580
2581 FREE( context, M_TEMP );
2582 } /* free_lio_context */
2583
2584
2585 /*
2586 * aio initialization
2587 */
2588 __private_extern__ void
2589 aio_init( void )
2590 {
2591 int i;
2592
2593 aio_lock_grp_attr = lck_grp_attr_alloc_init();
2594 aio_proc_lock_grp = lck_grp_alloc_init("aio_proc", aio_lock_grp_attr);
2595 aio_entry_lock_grp = lck_grp_alloc_init("aio_entry", aio_lock_grp_attr);
2596 aio_queue_lock_grp = lck_grp_alloc_init("aio_queue", aio_lock_grp_attr);
2597 aio_lock_attr = lck_attr_alloc_init();
2598
2599 lck_mtx_init(&aio_entry_mtx, aio_entry_lock_grp, aio_lock_attr);
2600 lck_mtx_init(&aio_proc_mtx, aio_proc_lock_grp, aio_lock_attr);
2601
2602 aio_anchor.aio_inflight_count = 0;
2603 aio_anchor.aio_done_count = 0;
2604 aio_anchor.aio_total_count = 0;
2605 aio_anchor.aio_num_workqs = AIO_NUM_WORK_QUEUES;
2606
2607 for (i = 0; i < AIO_NUM_WORK_QUEUES; i++) {
2608 aio_workq_init(&aio_anchor.aio_async_workqs[i]);
2609 }
2610
2611
2612 i = sizeof(aio_workq_entry);
2613 aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );
2614
2615 _aio_create_worker_threads( aio_worker_threads );
2616 } /* aio_init */
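
/*
 * Illustrative only (not part of the kernel source): the limits that
 * aio_init() sizes the zone against (aio_max_requests and friends) are
 * exposed to userland through sysctls. The names used below ("kern.aiomax",
 * "kern.aioprocmax", "kern.aiothreads") are assumptions based on the
 * traditional kern.* AIO tunables; a sketch for inspecting them:
 */
#if 0   /* userland example, not kernel code */
#include <stdio.h>
#include <sys/types.h>
#include <sys/sysctl.h>

void
print_aio_limits(void)
{
        const char *names[] = { "kern.aiomax", "kern.aioprocmax", "kern.aiothreads" };

        for (int i = 0; i < 3; i++) {
                int val = 0;
                size_t len = sizeof(val);

                if (sysctlbyname(names[i], &val, &len, NULL, 0) == 0) {
                        printf("%s = %d\n", names[i], val);
                }
        }
}
#endif
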
2617
2618
2619 /*
2620 * aio worker threads created here.
2621 */
2622 __private_extern__ void
2623 _aio_create_worker_threads( int num )
2624 {
2625 int i;
2626
2627 /* create some worker threads to handle the async IO requests */
2628 for (i = 0; i < num; i++) {
2629 thread_t myThread;
2630
2631 if (KERN_SUCCESS != kernel_thread_start((thread_continue_t)aio_work_thread, NULL, &myThread)) {
2632 printf( "%s - failed to create a work thread \n", __FUNCTION__ );
2633 } else {
2634 thread_deallocate(myThread);
2635 }
2636 }
2637
2638 return;
2639 } /* _aio_create_worker_threads */
2640
2641 /*
2642 * Return the current activation utask
2643 */
2644 task_t
2645 get_aiotask(void)
2646 {
2647 return ((struct uthread *)get_bsdthread_info(current_thread()))->uu_aio_task;
2648 }
2649
2650
2651 /*
2652 * In the case of an aiocb from a
2653 * 32-bit process we need to expand some longs and pointers to the correct
2654 * sizes in order to let downstream code always work on the same type of
2655 * aiocb (in our case that is a user_aiocb)
2656 */
2657 static void
2658 do_munge_aiocb_user32_to_user( struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
2659 {
2660 the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2661 the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2662 the_user_aiocbp->aio_buf = CAST_USER_ADDR_T(my_aiocbp->aio_buf);
2663 the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2664 the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2665 the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
2666
2667 /* special case here. since we do not know if sigev_value is an */
2668 /* int or a ptr we do NOT cast the ptr to a user_addr_t. This */
2669 /* means if we send this info back to user space we need to remember */
2670 /* sigev_value was not expanded for the 32-bit case. */
2671 /* NOTE - this does NOT affect us since we don't support sigev_value */
2672 /* yet in the aio context. */
2673 //LP64
2674 the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2675 the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
2676 the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2677 my_aiocbp->aio_sigevent.sigev_value.sival_int;
2678 the_user_aiocbp->aio_sigevent.sigev_notify_function =
2679 CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function);
2680 the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2681 CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_attributes);
2682 }
2683
2684 /* Similar for 64-bit user process, so that we don't need to satisfy
2685 * the alignment constraints of the original user64_aiocb
2686 */
2687 static void
2688 do_munge_aiocb_user64_to_user( struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
2689 {
2690 the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2691 the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2692 the_user_aiocbp->aio_buf = my_aiocbp->aio_buf;
2693 the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2694 the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2695 the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
2696
2697 the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2698 the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
2699 the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2700 my_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int;
2701 the_user_aiocbp->aio_sigevent.sigev_notify_function =
2702 my_aiocbp->aio_sigevent.sigev_notify_function;
2703 the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2704 my_aiocbp->aio_sigevent.sigev_notify_attributes;
2705 }