]> git.saurik.com Git - apple/xnu.git/blob - bsd/kern/kern_aio.c
xnu-1504.15.3.tar.gz
[apple/xnu.git] / bsd / kern / kern_aio.c
1 /*
2 * Copyright (c) 2003-2008 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29
30 /*
31 * todo:
32 * 1) ramesh is looking into how to replace taking a reference on
33 * the user's map (vm_map_reference()) since it is believed that
34 * would not hold the process for us.
35 * 2) david is looking into a way for us to set the priority of the
36 * worker threads to match that of the user's thread when the
37 * async IO was queued.
38 */
39
40
41 /*
42 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
43 */
44
45 #include <sys/systm.h>
46 #include <sys/fcntl.h>
47 #include <sys/file_internal.h>
48 #include <sys/filedesc.h>
49 #include <sys/kernel.h>
50 #include <sys/vnode_internal.h>
51 #include <sys/malloc.h>
52 #include <sys/mount_internal.h>
53 #include <sys/param.h>
54 #include <sys/proc_internal.h>
55 #include <sys/sysctl.h>
56 #include <sys/unistd.h>
57 #include <sys/user.h>
58
59 #include <sys/aio_kern.h>
60 #include <sys/sysproto.h>
61
62 #include <machine/limits.h>
63
64 #include <mach/mach_types.h>
65 #include <kern/kern_types.h>
66 #include <kern/zalloc.h>
67 #include <kern/task.h>
68 #include <kern/sched_prim.h>
69
70 #include <vm/vm_map.h>
71
72 #include <libkern/OSAtomic.h>
73
74 #include <sys/kdebug.h>
75 #define AIO_work_queued 1
76 #define AIO_worker_wake 2
77 #define AIO_completion_sig 3
78 #define AIO_completion_cleanup_wait 4
79 #define AIO_completion_cleanup_wake 5
80 #define AIO_completion_suspend_wake 6
81 #define AIO_fsync_delay 7
82 #define AIO_cancel 10
83 #define AIO_cancel_async_workq 11
84 #define AIO_cancel_sync_workq 12
85 #define AIO_cancel_activeq 13
86 #define AIO_cancel_doneq 14
87 #define AIO_fsync 20
88 #define AIO_read 30
89 #define AIO_write 40
90 #define AIO_listio 50
91 #define AIO_error 60
92 #define AIO_error_val 61
93 #define AIO_error_activeq 62
94 #define AIO_error_workq 63
95 #define AIO_return 70
96 #define AIO_return_val 71
97 #define AIO_return_activeq 72
98 #define AIO_return_workq 73
99 #define AIO_exec 80
100 #define AIO_exit 90
101 #define AIO_exit_sleep 91
102 #define AIO_close 100
103 #define AIO_close_sleep 101
104 #define AIO_suspend 110
105 #define AIO_suspend_sleep 111
106 #define AIO_worker_thread 120
107
108 #if 0
109 #undef KERNEL_DEBUG
110 #define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
111 #endif
112
113 /*
114 * aio requests queue up on the aio_async_workq or lio_sync_workq (for
115 * lio_listio LIO_WAIT). Requests then move to the per process aio_activeq
116 * (proc.aio_activeq) when one of our worker threads starts the IO.
117 * And finally, requests move to the per process aio_doneq (proc.aio_doneq)
118 * when the IO request completes. The request remains on aio_doneq until
119 * user process calls aio_return or the process exits, either way that is our
120 * trigger to release aio resources.
121 */
122 typedef struct aio_workq {
123 TAILQ_HEAD(, aio_workq_entry) aioq_entries;
124 int aioq_count;
125 lck_mtx_t aioq_mtx;
126 wait_queue_t aioq_waitq;
127 } *aio_workq_t;
128
129 #define AIO_NUM_WORK_QUEUES 1
130 struct aio_anchor_cb
131 {
132 volatile int32_t aio_inflight_count; /* entries that have been taken from a workq */
133 volatile int32_t aio_done_count; /* entries on all done queues (proc.aio_doneq) */
134 volatile int32_t aio_total_count; /* total extant entries */
135
136 /* Hash table of queues here */
137 int aio_num_workqs;
138 struct aio_workq aio_async_workqs[AIO_NUM_WORK_QUEUES];
139 };
140 typedef struct aio_anchor_cb aio_anchor_cb;
141
/*
 * Per-lio_listio() bookkeeping record.
 * NOTE(review): field meanings are inferred from their names only (waiter
 * flag, number of IOs issued, number of IOs completed) - confirm against the
 * lio_listio() code that uses them.
 */
typedef struct aio_lio_context aio_lio_context;
struct aio_lio_context {
	int	io_waiter;
	int	io_issued;
	int	io_completed;
};
149
150
151 /*
152 * Notes on aio sleep / wake channels.
153 * We currently pick a couple fields within the proc structure that will allow
154 * us sleep channels that currently do not collide with any other kernel routines.
155 * At this time, for binary compatibility reasons, we cannot create new proc fields.
156 */
157 #define AIO_SUSPEND_SLEEP_CHAN p_aio_active_count
158 #define AIO_CLEANUP_SLEEP_CHAN p_aio_total_count
159
/*
 * Panic if the aio entry `aiop' is not owned by the process `theproc'.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement: it can be used as the body of an unbraced
 * if/else without capturing a following `else', and it requires the usual
 * trailing semicolon at the call site.
 */
#define ASSERT_AIO_FROM_PROC(aiop, theproc) \
	do { \
		if ((aiop)->procp != (theproc)) { \
			panic("AIO on a proc list that does not belong to that proc.\n"); \
		} \
	} while (0)
164
165 /*
166 * LOCAL PROTOTYPES
167 */
168 static void aio_proc_lock(proc_t procp);
169 static void aio_proc_lock_spin(proc_t procp);
170 static void aio_proc_unlock(proc_t procp);
171 static lck_mtx_t* aio_proc_mutex(proc_t procp);
172 static void aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp);
173 static void aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp);
174 static int aio_get_process_count(proc_t procp );
175 static int aio_active_requests_for_process(proc_t procp );
176 static int aio_proc_active_requests_for_file(proc_t procp, int fd);
177 static boolean_t is_already_queued(proc_t procp, user_addr_t aiocbp );
178 static boolean_t should_cancel(aio_workq_entry *entryp, user_addr_t aiocbp, int fd);
179
180 static void aio_entry_lock(aio_workq_entry *entryp);
181 static void aio_entry_lock_spin(aio_workq_entry *entryp);
182 static aio_workq_t aio_entry_workq(aio_workq_entry *entryp);
183 static lck_mtx_t* aio_entry_mutex(__unused aio_workq_entry *entryp);
184 static void aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
185 static void aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
186 static void aio_entry_ref_locked(aio_workq_entry *entryp);
187 static void aio_entry_unref_locked(aio_workq_entry *entryp);
188 static void aio_entry_ref(aio_workq_entry *entryp);
189 static void aio_entry_unref(aio_workq_entry *entryp);
190 static void aio_entry_update_for_cancel(aio_workq_entry *entryp, boolean_t cancelled,
191 int wait_for_completion, boolean_t disable_notification);
192 static int aio_entry_try_workq_remove(aio_workq_entry *entryp);
193 static boolean_t aio_delay_fsync_request( aio_workq_entry *entryp );
194 static int aio_free_request(aio_workq_entry *entryp);
195
196 static void aio_workq_init(aio_workq_t wq);
197 static void aio_workq_lock_spin(aio_workq_t wq);
198 static void aio_workq_unlock(aio_workq_t wq);
199 static lck_mtx_t* aio_workq_mutex(aio_workq_t wq);
200
201 static void aio_work_thread( void );
202 static aio_workq_entry *aio_get_some_work( void );
203
204 static int aio_get_all_queues_count( void );
205 static int aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO );
206 static int aio_validate( aio_workq_entry *entryp );
207 static int aio_increment_total_count(void);
208 static int aio_decrement_total_count(void);
209
210 static int do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp, int wait_for_completion, boolean_t disable_notification );
211 static void do_aio_completion( aio_workq_entry *entryp );
212 static int do_aio_fsync( aio_workq_entry *entryp );
213 static int do_aio_read( aio_workq_entry *entryp );
214 static int do_aio_write( aio_workq_entry *entryp );
215 static void do_munge_aiocb_user32_to_user( struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
216 static void do_munge_aiocb_user64_to_user( struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
217 static int lio_create_entry(proc_t procp,
218 user_addr_t aiocbp,
219 void *group_tag,
220 aio_workq_entry **entrypp );
221 static aio_workq_entry *aio_create_queue_entry(proc_t procp,
222 user_addr_t aiocbp,
223 void *group_tag,
224 int kindOfIO);
225 static user_addr_t *aio_copy_in_list(proc_t procp, user_addr_t aiocblist, int nent);
226 static void free_lio_context(aio_lio_context* context);
227 static void aio_enqueue_work( proc_t procp, aio_workq_entry *entryp, int proc_locked);
228
229 #define ASSERT_AIO_PROC_LOCK_OWNED(p) lck_mtx_assert(aio_proc_mutex((p)), LCK_MTX_ASSERT_OWNED)
230 #define ASSERT_AIO_WORKQ_LOCK_OWNED(q) lck_mtx_assert(aio_workq_mutex((q)), LCK_MTX_ASSERT_OWNED)
231 #define ASSERT_AIO_ENTRY_LOCK_OWNED(e) lck_mtx_assert(aio_entry_mutex((e)), LCK_MTX_ASSERT_OWNED)
232
233 /*
234 * EXTERNAL PROTOTYPES
235 */
236
237 /* in ...bsd/kern/sys_generic.c */
238 extern int dofileread(vfs_context_t ctx, struct fileproc *fp,
239 user_addr_t bufp, user_size_t nbyte,
240 off_t offset, int flags, user_ssize_t *retval );
241 extern int dofilewrite(vfs_context_t ctx, struct fileproc *fp,
242 user_addr_t bufp, user_size_t nbyte, off_t offset,
243 int flags, user_ssize_t *retval );
244 #if DEBUG
245 static uint32_t lio_contexts_alloced = 0;
246 #endif /* DEBUG */
247
248 /*
249 * aio external global variables.
250 */
251 extern int aio_max_requests; /* AIO_MAX - configurable */
252 extern int aio_max_requests_per_process; /* AIO_PROCESS_MAX - configurable */
253 extern int aio_worker_threads; /* AIO_THREAD_COUNT - configurable */
254
255
256 /*
257 * aio static variables.
258 */
259 static aio_anchor_cb aio_anchor;
260 static lck_grp_t *aio_proc_lock_grp;
261 static lck_grp_t *aio_entry_lock_grp;
262 static lck_grp_t *aio_queue_lock_grp;
263 static lck_attr_t *aio_lock_attr;
264 static lck_grp_attr_t *aio_lock_grp_attr;
265 static struct zone *aio_workq_zonep;
266 static lck_mtx_t aio_entry_mtx;
267 static lck_mtx_t aio_proc_mtx;
268
269 static void
270 aio_entry_lock(__unused aio_workq_entry *entryp)
271 {
272 lck_mtx_lock(&aio_entry_mtx);
273 }
274
275 static void
276 aio_entry_lock_spin(__unused aio_workq_entry *entryp)
277 {
278 lck_mtx_lock_spin(&aio_entry_mtx);
279 }
280
281 static void
282 aio_entry_unlock(__unused aio_workq_entry *entryp)
283 {
284 lck_mtx_unlock(&aio_entry_mtx);
285 }
286
287 /* Hash */
288 static aio_workq_t
289 aio_entry_workq(__unused aio_workq_entry *entryp)
290 {
291 return &aio_anchor.aio_async_workqs[0];
292 }
293
294 static lck_mtx_t*
295 aio_entry_mutex(__unused aio_workq_entry *entryp)
296 {
297 return &aio_entry_mtx;
298 }
299
300 static void
301 aio_workq_init(aio_workq_t wq)
302 {
303 TAILQ_INIT(&wq->aioq_entries);
304 wq->aioq_count = 0;
305 lck_mtx_init(&wq->aioq_mtx, aio_queue_lock_grp, aio_lock_attr);
306 wq->aioq_waitq = wait_queue_alloc(SYNC_POLICY_FIFO);
307 }
308
309
310 /*
311 * Can be passed a queue which is locked spin.
312 */
313 static void
314 aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
315 {
316 ASSERT_AIO_WORKQ_LOCK_OWNED(queue);
317
318 if (entryp->aio_workq_link.tqe_prev == NULL) {
319 panic("Trying to remove an entry from a work queue, but it is not on a queue\n");
320 }
321
322 TAILQ_REMOVE(&queue->aioq_entries, entryp, aio_workq_link);
323 queue->aioq_count--;
324 entryp->aio_workq_link.tqe_prev = NULL; /* Not on a workq */
325
326 if (queue->aioq_count < 0) {
327 panic("Negative count on a queue.\n");
328 }
329 }
330
331 static void
332 aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
333 {
334 ASSERT_AIO_WORKQ_LOCK_OWNED(queue);
335
336 TAILQ_INSERT_TAIL(&queue->aioq_entries, entryp, aio_workq_link);
337 if (queue->aioq_count < 0) {
338 panic("Negative count on a queue.\n");
339 }
340 queue->aioq_count++;
341 }
342
343 static void
344 aio_proc_lock(proc_t procp)
345 {
346 lck_mtx_lock(aio_proc_mutex(procp));
347 }
348
349 static void
350 aio_proc_lock_spin(proc_t procp)
351 {
352 lck_mtx_lock_spin(aio_proc_mutex(procp));
353 }
354
355 static void
356 aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp)
357 {
358 ASSERT_AIO_PROC_LOCK_OWNED(procp);
359
360 TAILQ_REMOVE(&procp->p_aio_activeq, entryp, aio_proc_link );
361 TAILQ_INSERT_TAIL( &procp->p_aio_doneq, entryp, aio_proc_link);
362 procp->p_aio_active_count--;
363 OSIncrementAtomic(&aio_anchor.aio_done_count);
364 }
365
366 static void
367 aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp)
368 {
369 TAILQ_REMOVE(&procp->p_aio_doneq, entryp, aio_proc_link);
370 OSDecrementAtomic(&aio_anchor.aio_done_count);
371 aio_decrement_total_count();
372 procp->p_aio_total_count--;
373 }
374
375 static void
376 aio_proc_unlock(proc_t procp)
377 {
378 lck_mtx_unlock(aio_proc_mutex(procp));
379 }
380
381 static lck_mtx_t*
382 aio_proc_mutex(proc_t procp)
383 {
384 return &procp->p_mlock;
385 }
386
387 static void
388 aio_entry_ref_locked(aio_workq_entry *entryp)
389 {
390 ASSERT_AIO_ENTRY_LOCK_OWNED(entryp);
391
392 if (entryp->aio_refcount < 0) {
393 panic("AIO workq entry with a negative refcount.\n");
394 }
395 entryp->aio_refcount++;
396 }
397
398
399 /* Return 1 if you've freed it */
400 static void
401 aio_entry_unref_locked(aio_workq_entry *entryp)
402 {
403 ASSERT_AIO_ENTRY_LOCK_OWNED(entryp);
404
405 entryp->aio_refcount--;
406 if (entryp->aio_refcount < 0) {
407 panic("AIO workq entry with a negative refcount.\n");
408 }
409 }
410
411 static void
412 aio_entry_ref(aio_workq_entry *entryp)
413 {
414 aio_entry_lock_spin(entryp);
415 aio_entry_ref_locked(entryp);
416 aio_entry_unlock(entryp);
417 }
418 static void
419 aio_entry_unref(aio_workq_entry *entryp)
420 {
421 aio_entry_lock_spin(entryp);
422 aio_entry_unref_locked(entryp);
423
424 if ((entryp->aio_refcount == 0) && ((entryp->flags & AIO_DO_FREE) != 0)) {
425 aio_entry_unlock(entryp);
426 aio_free_request(entryp);
427 } else {
428 aio_entry_unlock(entryp);
429 }
430
431 return;
432 }
433
434 static void
435 aio_entry_update_for_cancel(aio_workq_entry *entryp, boolean_t cancelled, int wait_for_completion, boolean_t disable_notification)
436 {
437 aio_entry_lock_spin(entryp);
438
439 if (cancelled) {
440 aio_entry_ref_locked(entryp);
441 entryp->errorval = ECANCELED;
442 entryp->returnval = -1;
443 }
444
445 if ( wait_for_completion ) {
446 entryp->flags |= wait_for_completion; /* flag for special completion processing */
447 }
448
449 if ( disable_notification ) {
450 entryp->flags |= AIO_DISABLE; /* Don't want a signal */
451 }
452
453 aio_entry_unlock(entryp);
454 }
455
456 static int
457 aio_entry_try_workq_remove(aio_workq_entry *entryp)
458 {
459 /* Can only be cancelled if it's still on a work queue */
460 if (entryp->aio_workq_link.tqe_prev != NULL) {
461 aio_workq_t queue;
462
463 /* Will have to check again under the lock */
464 queue = aio_entry_workq(entryp);
465 aio_workq_lock_spin(queue);
466 if (entryp->aio_workq_link.tqe_prev != NULL) {
467 aio_workq_remove_entry_locked(queue, entryp);
468 aio_workq_unlock(queue);
469 return 1;
470 } else {
471 aio_workq_unlock(queue);
472 }
473 }
474
475 return 0;
476 }
477
478 static void
479 aio_workq_lock_spin(aio_workq_t wq)
480 {
481 lck_mtx_lock_spin(aio_workq_mutex(wq));
482 }
483
484 static void
485 aio_workq_unlock(aio_workq_t wq)
486 {
487 lck_mtx_unlock(aio_workq_mutex(wq));
488 }
489
490 static lck_mtx_t*
491 aio_workq_mutex(aio_workq_t wq)
492 {
493 return &wq->aioq_mtx;
494 }
495
496 /*
497 * aio_cancel - attempt to cancel one or more async IO requests currently
498 * outstanding against file descriptor uap->fd. If uap->aiocbp is not
499 * NULL then only one specific IO is cancelled (if possible). If uap->aiocbp
500 * is NULL then all outstanding async IO request for the given file
501 * descriptor are cancelled (if possible).
502 */
503 int
504 aio_cancel(proc_t p, struct aio_cancel_args *uap, int *retval )
505 {
506 struct user_aiocb my_aiocb;
507 int result;
508
509 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START,
510 (int)p, (int)uap->aiocbp, 0, 0, 0 );
511
512 /* quick check to see if there are any async IO requests queued up */
513 if (aio_get_all_queues_count() < 1) {
514 result = 0;
515 *retval = AIO_ALLDONE;
516 goto ExitRoutine;
517 }
518
519 *retval = -1;
520 if ( uap->aiocbp != USER_ADDR_NULL ) {
521 if ( proc_is64bit(p) ) {
522 struct user64_aiocb aiocb64;
523
524 result = copyin( uap->aiocbp, &aiocb64, sizeof(aiocb64) );
525 if (result == 0 )
526 do_munge_aiocb_user64_to_user(&aiocb64, &my_aiocb);
527
528 } else {
529 struct user32_aiocb aiocb32;
530
531 result = copyin( uap->aiocbp, &aiocb32, sizeof(aiocb32) );
532 if ( result == 0 )
533 do_munge_aiocb_user32_to_user( &aiocb32, &my_aiocb );
534 }
535
536 if ( result != 0 ) {
537 result = EAGAIN;
538 goto ExitRoutine;
539 }
540
541 /* NOTE - POSIX standard says a mismatch between the file */
542 /* descriptor passed in and the file descriptor embedded in */
543 /* the aiocb causes unspecified results. We return EBADF in */
544 /* that situation. */
545 if ( uap->fd != my_aiocb.aio_fildes ) {
546 result = EBADF;
547 goto ExitRoutine;
548 }
549 }
550
551 aio_proc_lock(p);
552 result = do_aio_cancel_locked( p, uap->fd, uap->aiocbp, 0, FALSE );
553 ASSERT_AIO_PROC_LOCK_OWNED(p);
554 aio_proc_unlock(p);
555
556 if ( result != -1 ) {
557 *retval = result;
558 result = 0;
559 goto ExitRoutine;
560 }
561
562 result = EBADF;
563
564 ExitRoutine:
565 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END,
566 (int)p, (int)uap->aiocbp, result, 0, 0 );
567
568 return( result );
569
570 } /* aio_cancel */
571
572
573 /*
574 * _aio_close - internal function used to clean up async IO requests for
575 * a file descriptor that is closing.
576 * THIS MAY BLOCK.
577 */
578 __private_extern__ void
579 _aio_close(proc_t p, int fd )
580 {
581 int error;
582
583 /* quick check to see if there are any async IO requests queued up */
584 if (aio_get_all_queues_count() < 1) {
585 return;
586 }
587
588 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_START,
589 (int)p, fd, 0, 0, 0 );
590
591 /* cancel all async IO requests on our todo queues for this file descriptor */
592 aio_proc_lock(p);
593 error = do_aio_cancel_locked( p, fd, 0, AIO_CLOSE_WAIT, FALSE );
594 ASSERT_AIO_PROC_LOCK_OWNED(p);
595 if ( error == AIO_NOTCANCELED ) {
596 /*
597 * AIO_NOTCANCELED is returned when we find an aio request for this process
598 * and file descriptor on the active async IO queue. Active requests cannot
599 * be cancelled so we must wait for them to complete. We will get a special
600 * wake up call on our channel used to sleep for ALL active requests to
601 * complete. This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
602 * when we must wait for all active aio requests.
603 */
604
605 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) | DBG_FUNC_NONE,
606 (int)p, fd, 0, 0, 0 );
607
608 while (aio_proc_active_requests_for_file(p, fd) > 0) {
609 msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO | PDROP, "aio_close", 0 );
610 }
611
612 } else {
613 aio_proc_unlock(p);
614 }
615
616
617 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_END,
618 (int)p, fd, 0, 0, 0 );
619
620 return;
621
622 } /* _aio_close */
623
624
625 /*
626 * aio_error - return the error status associated with the async IO
627 * request referred to by uap->aiocbp. The error status is the errno
628 * value that would be set by the corresponding IO request (read, wrtie,
629 * fdatasync, or sync).
630 */
631 int
632 aio_error(proc_t p, struct aio_error_args *uap, int *retval )
633 {
634 aio_workq_entry *entryp;
635 int error;
636
637 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_START,
638 (int)p, (int)uap->aiocbp, 0, 0, 0 );
639
640 /* see if there are any aios to check */
641 if (aio_get_all_queues_count() < 1) {
642 return EINVAL;
643 }
644
645 aio_proc_lock(p);
646
647 /* look for a match on our queue of async IO requests that have completed */
648 TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
649 if ( entryp->uaiocbp == uap->aiocbp ) {
650 ASSERT_AIO_FROM_PROC(entryp, p);
651
652 aio_entry_lock_spin(entryp);
653 *retval = entryp->errorval;
654 error = 0;
655 aio_entry_unlock(entryp);
656 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) | DBG_FUNC_NONE,
657 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
658 goto ExitRoutine;
659 }
660 }
661
662 /* look for a match on our queue of active async IO requests */
663 TAILQ_FOREACH( entryp, &p->p_aio_activeq, aio_proc_link) {
664 if ( entryp->uaiocbp == uap->aiocbp ) {
665 ASSERT_AIO_FROM_PROC(entryp, p);
666 *retval = EINPROGRESS;
667 error = 0;
668 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) | DBG_FUNC_NONE,
669 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
670 goto ExitRoutine;
671 }
672 }
673
674 error = EINVAL;
675
676 ExitRoutine:
677 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_END,
678 (int)p, (int)uap->aiocbp, error, 0, 0 );
679 aio_proc_unlock(p);
680
681 return( error );
682
683 } /* aio_error */
684
685
686 /*
687 * aio_fsync - asynchronously force all IO operations associated
688 * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
689 * queued at the time of the call to the synchronized completion state.
690 * NOTE - we do not support op O_DSYNC at this point since we do not support the
691 * fdatasync() call.
692 */
693 int
694 aio_fsync(proc_t p, struct aio_fsync_args *uap, int *retval )
695 {
696 int error;
697 int fsync_kind;
698
699 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_START,
700 (int)p, (int)uap->aiocbp, uap->op, 0, 0 );
701
702 *retval = 0;
703 /* 0 := O_SYNC for binary backward compatibility with Panther */
704 if (uap->op == O_SYNC || uap->op == 0)
705 fsync_kind = AIO_FSYNC;
706 else if ( uap->op == O_DSYNC )
707 fsync_kind = AIO_DSYNC;
708 else {
709 *retval = -1;
710 error = EINVAL;
711 goto ExitRoutine;
712 }
713
714 error = aio_queue_async_request( p, uap->aiocbp, fsync_kind );
715 if ( error != 0 )
716 *retval = -1;
717
718 ExitRoutine:
719 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_END,
720 (int)p, (int)uap->aiocbp, error, 0, 0 );
721
722 return( error );
723
724 } /* aio_fsync */
725
726
727 /* aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
728 * file descriptor (uap->aiocbp->aio_fildes) into the buffer
729 * (uap->aiocbp->aio_buf).
730 */
731 int
732 aio_read(proc_t p, struct aio_read_args *uap, int *retval )
733 {
734 int error;
735
736 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_START,
737 (int)p, (int)uap->aiocbp, 0, 0, 0 );
738
739 *retval = 0;
740
741 error = aio_queue_async_request( p, uap->aiocbp, AIO_READ );
742 if ( error != 0 )
743 *retval = -1;
744
745 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_END,
746 (int)p, (int)uap->aiocbp, error, 0, 0 );
747
748 return( error );
749
750 } /* aio_read */
751
752
753 /*
754 * aio_return - return the return status associated with the async IO
755 * request referred to by uap->aiocbp. The return status is the value
756 * that would be returned by corresponding IO request (read, write,
757 * fdatasync, or sync). This is where we release kernel resources
758 * held for async IO call associated with the given aiocb pointer.
759 */
760 int
761 aio_return(proc_t p, struct aio_return_args *uap, user_ssize_t *retval )
762 {
763 aio_workq_entry *entryp;
764 int error;
765 boolean_t proc_lock_held = FALSE;
766
767 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_START,
768 (int)p, (int)uap->aiocbp, 0, 0, 0 );
769
770 /* See if there are any entries to check */
771 if (aio_get_all_queues_count() < 1) {
772 error = EINVAL;
773 goto ExitRoutine;
774 }
775
776 aio_proc_lock(p);
777 proc_lock_held = TRUE;
778 *retval = 0;
779
780 /* look for a match on our queue of async IO requests that have completed */
781 TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
782 ASSERT_AIO_FROM_PROC(entryp, p);
783 if ( entryp->uaiocbp == uap->aiocbp ) {
784 /* Done and valid for aio_return(), pull it off the list */
785 aio_proc_remove_done_locked(p, entryp);
786
787 /* Drop the proc lock, but keep the entry locked */
788 aio_entry_lock(entryp);
789 aio_proc_unlock(p);
790 proc_lock_held = FALSE;
791
792 *retval = entryp->returnval;
793 error = 0;
794
795 /* No references and off all lists, safe to free */
796 if (entryp->aio_refcount == 0) {
797 aio_entry_unlock(entryp);
798 aio_free_request(entryp);
799 }
800 else {
801 /* Whoever has the refcount will have to free it */
802 entryp->flags |= AIO_DO_FREE;
803 aio_entry_unlock(entryp);
804 }
805
806
807 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) | DBG_FUNC_NONE,
808 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
809 goto ExitRoutine;
810 }
811 }
812
813 /* look for a match on our queue of active async IO requests */
814 TAILQ_FOREACH( entryp, &p->p_aio_activeq, aio_proc_link) {
815 ASSERT_AIO_FROM_PROC(entryp, p);
816 if ( entryp->uaiocbp == uap->aiocbp ) {
817 error = EINPROGRESS;
818 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) | DBG_FUNC_NONE,
819 (int)p, (int)uap->aiocbp, *retval, 0, 0 );
820 goto ExitRoutine;
821 }
822 }
823
824 error = EINVAL;
825
826 ExitRoutine:
827 if (proc_lock_held)
828 aio_proc_unlock(p);
829 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_END,
830 (int)p, (int)uap->aiocbp, error, 0, 0 );
831
832 return( error );
833
834 } /* aio_return */
835
836
837 /*
838 * _aio_exec - internal function used to clean up async IO requests for
839 * a process that is going away due to exec(). We cancel any async IOs
840 * we can and wait for those already active. We also disable signaling
841 * for cancelled or active aio requests that complete.
842 * This routine MAY block!
843 */
844 __private_extern__ void
845 _aio_exec(proc_t p )
846 {
847
848 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_START,
849 (int)p, 0, 0, 0, 0 );
850
851 _aio_exit( p );
852
853 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_END,
854 (int)p, 0, 0, 0, 0 );
855
856 return;
857
858 } /* _aio_exec */
859
860
861 /*
862 * _aio_exit - internal function used to clean up async IO requests for
863 * a process that is terminating (via exit() or exec() ). We cancel any async IOs
864 * we can and wait for those already active. We also disable signaling
865 * for cancelled or active aio requests that complete. This routine MAY block!
866 */
867 __private_extern__ void
868 _aio_exit(proc_t p )
869 {
870 int error;
871 aio_workq_entry *entryp;
872
873
874 /* quick check to see if there are any async IO requests queued up */
875 if (aio_get_all_queues_count() < 1) {
876 return;
877 }
878
879 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_START,
880 (int)p, 0, 0, 0, 0 );
881
882 aio_proc_lock(p);
883
884 /*
885 * cancel async IO requests on the todo work queue and wait for those
886 * already active to complete.
887 */
888 error = do_aio_cancel_locked( p, 0, 0, AIO_EXIT_WAIT, TRUE );
889 ASSERT_AIO_PROC_LOCK_OWNED(p);
890 if ( error == AIO_NOTCANCELED ) {
891 /*
892 * AIO_NOTCANCELED is returned when we find an aio request for this process
893 * on the active async IO queue. Active requests cannot be cancelled so we
894 * must wait for them to complete. We will get a special wake up call on
895 * our channel used to sleep for ALL active requests to complete. This sleep
896 * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
897 * active aio requests.
898 */
899
900 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) | DBG_FUNC_NONE,
901 (int)p, 0, 0, 0, 0 );
902
903 while (p->p_aio_active_count != 0) {
904 msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_exit", 0 );
905 }
906 }
907
908 if (p->p_aio_active_count != 0) {
909 panic("Exiting process has %d active AIOs after cancellation has completed.\n", p->p_aio_active_count);
910 }
911
912 /* release all aio resources used by this process */
913 entryp = TAILQ_FIRST( &p->p_aio_doneq );
914 while ( entryp != NULL ) {
915 ASSERT_AIO_FROM_PROC(entryp, p);
916 aio_workq_entry *next_entryp;
917
918 next_entryp = TAILQ_NEXT( entryp, aio_proc_link);
919 aio_proc_remove_done_locked(p, entryp);
920
921 /* we cannot free requests that are still completing */
922 aio_entry_lock_spin(entryp);
923 if (entryp->aio_refcount == 0) {
924 aio_proc_unlock(p);
925 aio_entry_unlock(entryp);
926 aio_free_request(entryp);
927
928 /* need to start over since aio_doneq may have been */
929 /* changed while we were away. */
930 aio_proc_lock(p);
931 entryp = TAILQ_FIRST( &p->p_aio_doneq );
932 continue;
933 }
934 else {
935 /* whoever has the reference will have to do the free */
936 entryp->flags |= AIO_DO_FREE;
937 }
938
939 aio_entry_unlock(entryp);
940 entryp = next_entryp;
941 }
942
943 aio_proc_unlock(p);
944
945 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END,
946 (int)p, 0, 0, 0, 0 );
947 return;
948
949 } /* _aio_exit */
950
951
952 static boolean_t
953 should_cancel(aio_workq_entry *entryp, user_addr_t aiocbp, int fd)
954 {
955 if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
956 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
957 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
958 return TRUE;
959 }
960
961 return FALSE;
962 }
963
964 /*
965 * do_aio_cancel_locked - cancel async IO requests (if possible). We get called by
966 * aio_cancel, close, and at exit.
967 * There are three modes of operation: 1) cancel all async IOs for a process -
968 * fd is 0 and aiocbp is NULL 2) cancel all async IOs for file descriptor - fd
969 * is > 0 and aiocbp is NULL 3) cancel one async IO associated with the given
970 * aiocbp.
971 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
972 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
973 * target async IO requests, and AIO_ALLDONE if all target async IO requests
974 * were already complete.
975 * WARNING - do not dereference aiocbp in this routine, it may point to user
976 * land data that has not been copied in (when called from aio_cancel() )
977 *
978 * Called with proc locked, and returns the same way.
979 */
980 static int
981 do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp,
982 int wait_for_completion, boolean_t disable_notification )
983 {
984 ASSERT_AIO_PROC_LOCK_OWNED(p);
985
986 aio_workq_entry *entryp;
987 int result;
988
989 result = -1;
990
991 /* look for a match on our queue of async todo work. */
992 entryp = TAILQ_FIRST(&p->p_aio_activeq);
993 while ( entryp != NULL ) {
994 ASSERT_AIO_FROM_PROC(entryp, p);
995 aio_workq_entry *next_entryp;
996
997 next_entryp = TAILQ_NEXT( entryp, aio_proc_link);
998 if (!should_cancel(entryp, aiocbp, fd)) {
999 entryp = next_entryp;
1000 continue;
1001 }
1002
1003 /* Can only be cancelled if it's still on a work queue */
1004 if (aio_entry_try_workq_remove(entryp) != 0) {
1005 /* Have removed from workq. Update entry state and take a ref */
1006 aio_entry_update_for_cancel(entryp, TRUE, 0, disable_notification);
1007
1008 /* Put on the proc done queue and update counts, then unlock the proc */
1009 aio_proc_move_done_locked(p, entryp);
1010 aio_proc_unlock(p);
1011
1012 /* Now it's officially cancelled. Do the completion */
1013 result = AIO_CANCELED;
1014 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) | DBG_FUNC_NONE,
1015 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
1016 do_aio_completion(entryp);
1017
1018 /* This will free if the aio_return() has already happened ... */
1019 aio_entry_unref(entryp);
1020 aio_proc_lock(p);
1021
1022 if ( aiocbp != USER_ADDR_NULL ) {
1023 return( result );
1024 }
1025
1026 /*
1027 * Restart from the head of the proc active queue since it
1028 * may have been changed while we were away doing completion
1029 * processing.
1030 *
1031 * Note that if we found an uncancellable AIO before, we will
1032 * either find it again or discover that it's been completed,
1033 * so resetting the result will not cause us to return success
1034 * despite outstanding AIOs.
1035 */
1036 entryp = TAILQ_FIRST(&p->p_aio_activeq);
1037 result = -1; /* As if beginning anew */
1038 } else {
1039 /*
1040 * It's been taken off the active queue already, i.e. is in flight.
1041 * All we can do is ask for notification.
1042 */
1043 result = AIO_NOTCANCELED;
1044
1045 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE,
1046 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
1047
1048 /* Mark for waiting and such; will not take a ref if "cancelled" arg is FALSE */
1049 aio_entry_update_for_cancel(entryp, FALSE, wait_for_completion, disable_notification);
1050
1051 if ( aiocbp != USER_ADDR_NULL ) {
1052 return( result );
1053 }
1054 entryp = next_entryp;
1055 }
1056 } /* while... */
1057
1058 /*
1059 * if we didn't find any matches on the todo or active queues then look for a
1060 * match on our queue of async IO requests that have completed and if found
1061 * return AIO_ALLDONE result.
1062 *
1063 * Proc AIO lock is still held.
1064 */
1065 if ( result == -1 ) {
1066 TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
1067 ASSERT_AIO_FROM_PROC(entryp, p);
1068 if (should_cancel(entryp, aiocbp, fd)) {
1069 result = AIO_ALLDONE;
1070 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE,
1071 (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
1072
1073 if ( aiocbp != USER_ADDR_NULL ) {
1074 return( result );
1075 }
1076 }
1077 }
1078 }
1079
1080 return( result );
1081
1082 }
1083 /* do_aio_cancel_locked */
1084
1085
1086 /*
1087 * aio_suspend - suspend the calling thread until at least one of the async
1088 * IO operations referenced by uap->aiocblist has completed, until a signal
1089 * interrupts the function, or uap->timeoutp time interval (optional) has
1090 * passed.
1091 * Returns 0 if one or more async IOs have completed else -1 and errno is
1092 * set appropriately - EAGAIN if timeout elapses or EINTR if an interrupt
1093 * woke us up.
1094 */
1095 int
1096 aio_suspend(proc_t p, struct aio_suspend_args *uap, int *retval )
1097 {
1098 __pthread_testcancel(1);
1099 return(aio_suspend_nocancel(p, (struct aio_suspend_nocancel_args *)uap, retval));
1100 }
1101
1102
1103 int
1104 aio_suspend_nocancel(proc_t p, struct aio_suspend_nocancel_args *uap, int *retval )
1105 {
1106 int error;
1107 int i, count;
1108 uint64_t abstime;
1109 struct user_timespec ts;
1110 aio_workq_entry *entryp;
1111 user_addr_t *aiocbpp;
1112
1113 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START,
1114 (int)p, uap->nent, 0, 0, 0 );
1115
1116 *retval = -1;
1117 abstime = 0;
1118 aiocbpp = NULL;
1119
1120 count = aio_get_all_queues_count( );
1121 if ( count < 1 ) {
1122 error = EINVAL;
1123 goto ExitThisRoutine;
1124 }
1125
1126 if ( uap->nent < 1 || uap->nent > aio_max_requests_per_process ) {
1127 error = EINVAL;
1128 goto ExitThisRoutine;
1129 }
1130
1131 if ( uap->timeoutp != USER_ADDR_NULL ) {
1132 if ( proc_is64bit(p) ) {
1133 struct user64_timespec temp;
1134 error = copyin( uap->timeoutp, &temp, sizeof(temp) );
1135 if ( error == 0 ) {
1136 ts.tv_sec = temp.tv_sec;
1137 ts.tv_nsec = temp.tv_nsec;
1138 }
1139 }
1140 else {
1141 struct user32_timespec temp;
1142 error = copyin( uap->timeoutp, &temp, sizeof(temp) );
1143 if ( error == 0 ) {
1144 ts.tv_sec = temp.tv_sec;
1145 ts.tv_nsec = temp.tv_nsec;
1146 }
1147 }
1148 if ( error != 0 ) {
1149 error = EAGAIN;
1150 goto ExitThisRoutine;
1151 }
1152
1153 if ( ts.tv_sec < 0 || ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000 ) {
1154 error = EINVAL;
1155 goto ExitThisRoutine;
1156 }
1157
1158 nanoseconds_to_absolutetime( (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
1159 &abstime );
1160 clock_absolutetime_interval_to_deadline( abstime, &abstime );
1161 }
1162
1163 aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent);
1164 if ( aiocbpp == NULL ) {
1165 error = EAGAIN;
1166 goto ExitThisRoutine;
1167 }
1168
1169 /* check list of aio requests to see if any have completed */
1170 check_for_our_aiocbp:
1171 aio_proc_lock_spin(p);
1172 for ( i = 0; i < uap->nent; i++ ) {
1173 user_addr_t aiocbp;
1174
1175 /* NULL elements are legal so check for 'em */
1176 aiocbp = *(aiocbpp + i);
1177 if ( aiocbp == USER_ADDR_NULL )
1178 continue;
1179
1180 /* return immediately if any aio request in the list is done */
1181 TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
1182 ASSERT_AIO_FROM_PROC(entryp, p);
1183 if ( entryp->uaiocbp == aiocbp ) {
1184 aio_proc_unlock(p);
1185 *retval = 0;
1186 error = 0;
1187 goto ExitThisRoutine;
1188 }
1189 }
1190 } /* for ( ; i < uap->nent; ) */
1191
1192 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE,
1193 (int)p, uap->nent, 0, 0, 0 );
1194
1195 /*
1196 * wait for an async IO to complete or a signal fires or timeout expires.
1197 * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
1198 * interrupts us. If an async IO completes before a signal fires or our
1199 * timeout expires, we get a wakeup call from aio_work_thread().
1200 */
1201
1202 error = msleep1(&p->AIO_SUSPEND_SLEEP_CHAN, aio_proc_mutex(p), PCATCH | PWAIT | PDROP, "aio_suspend", abstime); /* XXX better priority? */
1203 if ( error == 0 ) {
1204 /*
1205 * got our wakeup call from aio_work_thread().
1206 * Since we can get a wakeup on this channel from another thread in the
1207 * same process we head back up to make sure this is for the correct aiocbp.
1208 * If it is the correct aiocbp we will return from where we do the check
1209 * (see entryp->uaiocbp == aiocbp after check_for_our_aiocbp label)
1210 * else we will fall out and just sleep again.
1211 */
1212 goto check_for_our_aiocbp;
1213 }
1214 else if ( error == EWOULDBLOCK ) {
1215 /* our timeout expired */
1216 error = EAGAIN;
1217 }
1218 else {
1219 /* we were interrupted */
1220 error = EINTR;
1221 }
1222
1223 ExitThisRoutine:
1224 if ( aiocbpp != NULL )
1225 FREE( aiocbpp, M_TEMP );
1226
1227 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_END,
1228 (int)p, uap->nent, error, 0, 0 );
1229
1230 return( error );
1231
1232 } /* aio_suspend */
1233
1234
1235 /* aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
1236 * file descriptor (uap->aiocbp->aio_fildes) from the buffer
1237 * (uap->aiocbp->aio_buf).
1238 */
1239
1240 int
1241 aio_write(proc_t p, struct aio_write_args *uap, int *retval )
1242 {
1243 int error;
1244
1245 *retval = 0;
1246
1247 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_START,
1248 (int)p, (int)uap->aiocbp, 0, 0, 0 );
1249
1250 error = aio_queue_async_request( p, uap->aiocbp, AIO_WRITE );
1251 if ( error != 0 )
1252 *retval = -1;
1253
1254 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_END,
1255 (int)p, (int)uap->aiocbp, error, 0, 0 );
1256
1257 return( error );
1258
1259 } /* aio_write */
1260
1261
1262 static user_addr_t *
1263 aio_copy_in_list(proc_t procp, user_addr_t aiocblist, int nent)
1264 {
1265 user_addr_t *aiocbpp;
1266 int i, result;
1267
1268 /* we reserve enough space for largest possible pointer size */
1269 MALLOC( aiocbpp, user_addr_t *, (nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK );
1270 if ( aiocbpp == NULL )
1271 goto err;
1272
1273 /* copyin our aiocb pointers from list */
1274 result = copyin( aiocblist, aiocbpp,
1275 proc_is64bit(procp) ? (nent * sizeof(user64_addr_t))
1276 : (nent * sizeof(user32_addr_t)) );
1277 if ( result) {
1278 FREE( aiocbpp, M_TEMP );
1279 aiocbpp = NULL;
1280 goto err;
1281 }
1282
1283 /*
1284 * We depend on a list of user_addr_t's so we need to
1285 * munge and expand when these pointers came from a
1286 * 32-bit process
1287 */
1288 if ( !proc_is64bit(procp) ) {
1289 /* copy from last to first to deal with overlap */
1290 user32_addr_t *my_ptrp = ((user32_addr_t *)aiocbpp) + (nent - 1);
1291 user_addr_t *my_addrp = aiocbpp + (nent - 1);
1292
1293 for (i = 0; i < nent; i++, my_ptrp--, my_addrp--) {
1294 *my_addrp = (user_addr_t) (*my_ptrp);
1295 }
1296 }
1297
1298 err:
1299 return (aiocbpp);
1300 }
1301
1302
1303 static int
1304 aio_copy_in_sigev(proc_t procp, user_addr_t sigp, struct user_sigevent *sigev)
1305 {
1306 int result = 0;
1307
1308 if (sigp == USER_ADDR_NULL)
1309 goto out;
1310
1311 /*
1312 * We need to munge aio_sigevent since it contains pointers.
1313 * Since we do not know if sigev_value is an int or a ptr we do
1314 * NOT cast the ptr to a user_addr_t. This means if we send
1315 * this info back to user space we need to remember sigev_value
1316 * was not expanded for the 32-bit case.
1317 *
1318 * Notes: This does NOT affect us since we don't support
1319 * sigev_value yet in the aio context.
1320 */
1321 if ( proc_is64bit(procp) ) {
1322 struct user64_sigevent sigevent64;
1323
1324 result = copyin( sigp, &sigevent64, sizeof(sigevent64) );
1325 if ( result == 0 ) {
1326 sigev->sigev_notify = sigevent64.sigev_notify;
1327 sigev->sigev_signo = sigevent64.sigev_signo;
1328 sigev->sigev_value.size_equivalent.sival_int = sigevent64.sigev_value.size_equivalent.sival_int;
1329 sigev->sigev_notify_function = sigevent64.sigev_notify_function;
1330 sigev->sigev_notify_attributes = sigevent64.sigev_notify_attributes;
1331 }
1332
1333 } else {
1334 struct user32_sigevent sigevent32;
1335
1336 result = copyin( sigp, &sigevent32, sizeof(sigevent32) );
1337 if ( result == 0 ) {
1338 sigev->sigev_notify = sigevent32.sigev_notify;
1339 sigev->sigev_signo = sigevent32.sigev_signo;
1340 sigev->sigev_value.size_equivalent.sival_int = sigevent32.sigev_value.sival_int;
1341 sigev->sigev_notify_function = CAST_USER_ADDR_T(sigevent32.sigev_notify_function);
1342 sigev->sigev_notify_attributes = CAST_USER_ADDR_T(sigevent32.sigev_notify_attributes);
1343 }
1344 }
1345
1346 if ( result != 0 ) {
1347 result = EAGAIN;
1348 }
1349
1350 out:
1351 return (result);
1352 }
1353
1354 /*
1355 * aio_enqueue_work
1356 *
1357 * Queue up the entry on the aio asynchronous work queue in priority order
1358 * based on the relative priority of the request. We calculate the relative
1359 * priority using the nice value of the caller and the value
1360 *
1361 * Parameters: procp Process queueing the I/O
1362 * entryp The work queue entry being queued
1363 *
1364 * Returns: (void) No failure modes
1365 *
1366 * Notes: This function is used for both lio_listio and aio
1367 *
1368 * XXX: At some point, we may have to consider thread priority
1369 * rather than process priority, but we don't maintain the
1370 * adjusted priority for threads the POSIX way.
1371 *
1372 *
1373 * Called with proc locked.
1374 */
1375 static void
1376 aio_enqueue_work( proc_t procp, aio_workq_entry *entryp, int proc_locked)
1377 {
1378 #if 0
1379 aio_workq_entry *my_entryp; /* used for insertion sort */
1380 #endif /* 0 */
1381 aio_workq_t queue = aio_entry_workq(entryp);
1382
1383 if (proc_locked == 0) {
1384 aio_proc_lock(procp);
1385 }
1386
1387 ASSERT_AIO_PROC_LOCK_OWNED(procp);
1388
1389 /* Onto proc queue */
1390 TAILQ_INSERT_TAIL(&procp->p_aio_activeq, entryp, aio_proc_link);
1391 procp->p_aio_active_count++;
1392 procp->p_aio_total_count++;
1393
1394 /* And work queue */
1395 aio_workq_lock_spin(queue);
1396 aio_workq_add_entry_locked(queue, entryp);
1397 wait_queue_wakeup_one(queue->aioq_waitq, queue, THREAD_AWAKENED);
1398 aio_workq_unlock(queue);
1399
1400 if (proc_locked == 0) {
1401 aio_proc_unlock(procp);
1402 }
1403
1404 #if 0
1405 /*
1406 * Procedure:
1407 *
1408 * (1) The nice value is in the range PRIO_MIN..PRIO_MAX [-20..20]
1409 * (2) The normalized nice value is in the range 0..((2 * NZERO) - 1)
1410 * which is [0..39], with 0 not being used. In nice values, the
1411 * lower the nice value, the higher the priority.
1412 * (3) The normalized scheduling prioritiy is the highest nice value
1413 * minus the current nice value. In I/O scheduling priority, the
1414 * higher the value the lower the priority, so it is the inverse
1415 * of the nice value (the higher the number, the higher the I/O
1416 * priority).
1417 * (4) From the normalized scheduling priority, we subtract the
1418 * request priority to get the request priority value number;
1419 * this means that requests are only capable of depressing their
1420 * priority relative to other requests,
1421 */
1422 entryp->priority = (((2 * NZERO) - 1) - procp->p_nice);
1423
1424 /* only premit depressing the priority */
1425 if (entryp->aiocb.aio_reqprio < 0)
1426 entryp->aiocb.aio_reqprio = 0;
1427 if (entryp->aiocb.aio_reqprio > 0) {
1428 entryp->priority -= entryp->aiocb.aio_reqprio;
1429 if (entryp->priority < 0)
1430 entryp->priority = 0;
1431 }
1432
1433 /* Insertion sort the entry; lowest ->priority to highest */
1434 TAILQ_FOREACH(my_entryp, &aio_anchor.aio_async_workq, aio_workq_link) {
1435 if ( entryp->priority <= my_entryp->priority) {
1436 TAILQ_INSERT_BEFORE(my_entryp, entryp, aio_workq_link);
1437 break;
1438 }
1439 }
1440 if (my_entryp == NULL)
1441 TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1442 #endif /* 0 */
1443 }
1444
1445
1446 /*
1447 * lio_listio - initiate a list of IO requests. We process the list of
1448 * aiocbs either synchronously (mode == LIO_WAIT) or asynchronously
1449 * (mode == LIO_NOWAIT).
1450 *
1451 * The caller gets error and return status for each aiocb in the list
1452 * via aio_error and aio_return. We must keep completed requests until
1453 * released by the aio_return call.
1454 */
1455 int
1456 lio_listio(proc_t p, struct lio_listio_args *uap, int *retval )
1457 {
1458 int i;
1459 int call_result;
1460 int result;
1461 int old_count;
1462 aio_workq_entry **entryp_listp;
1463 user_addr_t *aiocbpp;
1464 struct user_sigevent aiosigev;
1465 aio_lio_context *lio_context;
1466 boolean_t free_context = FALSE;
1467
1468 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START,
1469 (int)p, uap->nent, uap->mode, 0, 0 );
1470
1471 entryp_listp = NULL;
1472 lio_context = NULL;
1473 aiocbpp = NULL;
1474 call_result = -1;
1475 *retval = -1;
1476 if ( !(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT) ) {
1477 call_result = EINVAL;
1478 goto ExitRoutine;
1479 }
1480
1481 if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
1482 call_result = EINVAL;
1483 goto ExitRoutine;
1484 }
1485
1486 /*
1487 * allocate a list of aio_workq_entry pointers that we will use
1488 * to queue up all our requests at once while holding our lock.
1489 */
1490 MALLOC( entryp_listp, void *, (uap->nent * sizeof(aio_workq_entry *)), M_TEMP, M_WAITOK );
1491 if ( entryp_listp == NULL ) {
1492 call_result = EAGAIN;
1493 goto ExitRoutine;
1494 }
1495
1496 MALLOC( lio_context, aio_lio_context*, sizeof(aio_lio_context), M_TEMP, M_WAITOK );
1497 if ( lio_context == NULL ) {
1498 call_result = EAGAIN;
1499 goto ExitRoutine;
1500 }
1501
1502 #if DEBUG
1503 OSIncrementAtomic(&lio_contexts_alloced);
1504 #endif /* DEBUG */
1505
1506 bzero(lio_context, sizeof(aio_lio_context));
1507
1508 aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent);
1509 if ( aiocbpp == NULL ) {
1510 call_result = EAGAIN;
1511 goto ExitRoutine;
1512 }
1513
1514 /*
1515 * Use sigevent passed in to lio_listio for each of our calls, but
1516 * only do completion notification after the last request completes.
1517 */
1518 bzero(&aiosigev, sizeof(aiosigev));
1519 /* Only copy in an sigev if the user supplied one */
1520 if (uap->sigp != USER_ADDR_NULL) {
1521 call_result = aio_copy_in_sigev(p, uap->sigp, &aiosigev);
1522 if ( call_result)
1523 goto ExitRoutine;
1524 }
1525
1526 /* process list of aio requests */
1527 lio_context->io_issued = uap->nent;
1528 lio_context->io_waiter = uap->mode == LIO_WAIT ? 1 : 0; /* Should it be freed by last AIO */
1529 for ( i = 0; i < uap->nent; i++ ) {
1530 user_addr_t my_aiocbp;
1531 aio_workq_entry *entryp;
1532
1533 *(entryp_listp + i) = NULL;
1534 my_aiocbp = *(aiocbpp + i);
1535
1536 /* NULL elements are legal so check for 'em */
1537 if ( my_aiocbp == USER_ADDR_NULL ) {
1538 aio_proc_lock_spin(p);
1539 lio_context->io_issued--;
1540 aio_proc_unlock(p);
1541 continue;
1542 }
1543
1544 /*
1545 * We use lio_context to mark IO requests for delayed completion
1546 * processing which means we wait until all IO requests in the
1547 * group have completed before we either return to the caller
1548 * when mode is LIO_WAIT or signal user when mode is LIO_NOWAIT.
1549 *
1550 * We use the address of the lio_context for this, since it is
1551 * unique in the address space.
1552 */
1553 result = lio_create_entry( p, my_aiocbp, lio_context, (entryp_listp + i) );
1554 if ( result != 0 && call_result == -1 )
1555 call_result = result;
1556
1557 /* NULL elements are legal so check for 'em */
1558 entryp = *(entryp_listp + i);
1559 if ( entryp == NULL ) {
1560 aio_proc_lock_spin(p);
1561 lio_context->io_issued--;
1562 aio_proc_unlock(p);
1563 continue;
1564 }
1565
1566 if ( uap->mode == LIO_NOWAIT ) {
1567 /* Set signal hander, if any */
1568 entryp->aiocb.aio_sigevent = aiosigev;
1569 } else {
1570 /* flag that this thread blocks pending completion */
1571 entryp->flags |= AIO_LIO_NOTIFY;
1572 }
1573
1574 /* check our aio limits to throttle bad or rude user land behavior */
1575 old_count = aio_increment_total_count();
1576
1577 aio_proc_lock_spin(p);
1578 if ( old_count >= aio_max_requests ||
1579 aio_get_process_count( entryp->procp ) >= aio_max_requests_per_process ||
1580 is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
1581
1582 lio_context->io_issued--;
1583 aio_proc_unlock(p);
1584
1585 aio_decrement_total_count();
1586
1587 if ( call_result == -1 )
1588 call_result = EAGAIN;
1589 aio_free_request(entryp);
1590 entryp_listp[i] = NULL;
1591 continue;
1592 }
1593
1594 lck_mtx_convert_spin(aio_proc_mutex(p));
1595 aio_enqueue_work(p, entryp, 1);
1596 aio_proc_unlock(p);
1597
1598 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
1599 (int)p, (int)entryp->uaiocbp, 0, 0, 0 );
1600 }
1601
1602 switch(uap->mode) {
1603 case LIO_WAIT:
1604 aio_proc_lock_spin(p);
1605 while (lio_context->io_completed < lio_context->io_issued) {
1606 result = msleep(lio_context, aio_proc_mutex(p), PCATCH | PRIBIO | PSPIN, "lio_listio", 0);
1607
1608 /* If we were interrupted, fail out (even if all finished) */
1609 if (result != 0) {
1610 call_result = EINTR;
1611 lio_context->io_waiter = 0;
1612 break;
1613 }
1614 }
1615
1616 /* If all IOs have finished must free it */
1617 if (lio_context->io_completed == lio_context->io_issued) {
1618 free_context = TRUE;
1619 }
1620
1621 aio_proc_unlock(p);
1622 break;
1623
1624 case LIO_NOWAIT:
1625 break;
1626 }
1627
1628 /* call_result == -1 means we had no trouble queueing up requests */
1629 if ( call_result == -1 ) {
1630 call_result = 0;
1631 *retval = 0;
1632 }
1633
1634 ExitRoutine:
1635 if ( entryp_listp != NULL )
1636 FREE( entryp_listp, M_TEMP );
1637 if ( aiocbpp != NULL )
1638 FREE( aiocbpp, M_TEMP );
1639 if ((lio_context != NULL) && ((lio_context->io_issued == 0) || (free_context == TRUE))) {
1640 free_lio_context(lio_context);
1641 }
1642
1643 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END,
1644 (int)p, call_result, 0, 0, 0 );
1645
1646 return( call_result );
1647
1648 } /* lio_listio */
1649
1650
1651 /*
1652 * aio worker thread. this is where all the real work gets done.
1653 * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
1654 * after new work is queued up.
1655 */
1656 static void
1657 aio_work_thread( void )
1658 {
1659 aio_workq_entry *entryp;
1660 int error;
1661 vm_map_t currentmap;
1662 vm_map_t oldmap = VM_MAP_NULL;
1663 task_t oldaiotask = TASK_NULL;
1664 struct uthread *uthreadp = NULL;
1665
1666 for( ;; ) {
1667 /*
1668 * returns with the entry ref'ed.
1669 * sleeps until work is available.
1670 */
1671 entryp = aio_get_some_work();
1672
1673 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START,
1674 (int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0 );
1675
1676 /*
1677 * Assume the target's address space identity for the duration
1678 * of the IO. Note: don't need to have the entryp locked,
1679 * because the proc and map don't change until it's freed.
1680 */
1681 currentmap = get_task_map( (current_proc())->task );
1682 if ( currentmap != entryp->aio_map ) {
1683 uthreadp = (struct uthread *) get_bsdthread_info(current_thread());
1684 oldaiotask = uthreadp->uu_aio_task;
1685 uthreadp->uu_aio_task = entryp->procp->task;
1686 oldmap = vm_map_switch( entryp->aio_map );
1687 }
1688
1689 if ( (entryp->flags & AIO_READ) != 0 ) {
1690 error = do_aio_read( entryp );
1691 }
1692 else if ( (entryp->flags & AIO_WRITE) != 0 ) {
1693 error = do_aio_write( entryp );
1694 }
1695 else if ( (entryp->flags & (AIO_FSYNC | AIO_DSYNC)) != 0 ) {
1696 error = do_aio_fsync( entryp );
1697 }
1698 else {
1699 printf( "%s - unknown aio request - flags 0x%02X \n",
1700 __FUNCTION__, entryp->flags );
1701 error = EINVAL;
1702 }
1703
1704 /* Restore old map */
1705 if ( currentmap != entryp->aio_map ) {
1706 (void) vm_map_switch( oldmap );
1707 uthreadp->uu_aio_task = oldaiotask;
1708 }
1709
1710 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END,
1711 (int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval,
1712 entryp->returnval, 0 );
1713
1714
1715 /* XXX COUNTS */
1716 aio_entry_lock_spin(entryp);
1717 entryp->errorval = error;
1718 aio_entry_unlock(entryp);
1719
1720 /* we're done with the IO request so pop it off the active queue and */
1721 /* push it on the done queue */
1722 aio_proc_lock(entryp->procp);
1723 aio_proc_move_done_locked(entryp->procp, entryp);
1724 aio_proc_unlock(entryp->procp);
1725
1726 OSDecrementAtomic(&aio_anchor.aio_inflight_count);
1727
1728 /* remove our reference to the user land map. */
1729 if ( VM_MAP_NULL != entryp->aio_map ) {
1730 vm_map_t my_map;
1731
1732 my_map = entryp->aio_map;
1733 entryp->aio_map = VM_MAP_NULL;
1734 vm_map_deallocate( my_map );
1735 }
1736
1737 /* Provide notifications */
1738 do_aio_completion( entryp );
1739
1740 /* Will free if needed */
1741 aio_entry_unref(entryp);
1742
1743 } /* for ( ;; ) */
1744
1745 /* NOT REACHED */
1746
1747 } /* aio_work_thread */
1748
1749
1750 /*
1751 * aio_get_some_work - get the next async IO request that is ready to be executed.
1752 * aio_fsync complicates matters a bit since we cannot do the fsync until all async
1753 * IO requests at the time the aio_fsync call came in have completed.
1754 * NOTE - AIO_LOCK must be held by caller
1755 */
1756 static aio_workq_entry *
1757 aio_get_some_work( void )
1758 {
1759 aio_workq_entry *entryp = NULL;
1760 aio_workq_t queue = NULL;
1761
1762 /* Just one queue for the moment. In the future there will be many. */
1763 queue = &aio_anchor.aio_async_workqs[0];
1764 aio_workq_lock_spin(queue);
1765 if (queue->aioq_count == 0) {
1766 goto nowork;
1767 }
1768
1769 /*
1770 * Hold the queue lock.
1771 *
1772 * pop some work off the work queue and add to our active queue
1773 * Always start with the queue lock held.
1774 */
1775 for(;;) {
1776 /*
1777 * Pull of of work queue. Once it's off, it can't be cancelled,
1778 * so we can take our ref once we drop the queue lock.
1779 */
1780 entryp = TAILQ_FIRST(&queue->aioq_entries);
1781
1782 /*
1783 * If there's no work or only fsyncs that need delay, go to sleep
1784 * and then start anew from aio_work_thread
1785 */
1786 if (entryp == NULL) {
1787 goto nowork;
1788 }
1789
1790 aio_workq_remove_entry_locked(queue, entryp);
1791
1792 aio_workq_unlock(queue);
1793
1794 /*
1795 * Check if it's an fsync that must be delayed. No need to lock the entry;
1796 * that flag would have been set at initialization.
1797 */
1798 if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1799 /*
1800 * Check for unfinished operations on the same file
1801 * in this proc's queue.
1802 */
1803 aio_proc_lock_spin(entryp->procp);
1804 if ( aio_delay_fsync_request( entryp ) ) {
1805 /* It needs to be delayed. Put it back on the end of the work queue */
1806 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) | DBG_FUNC_NONE,
1807 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1808
1809 aio_proc_unlock(entryp->procp);
1810
1811 aio_workq_lock_spin(queue);
1812 aio_workq_add_entry_locked(queue, entryp);
1813 continue;
1814 }
1815 aio_proc_unlock(entryp->procp);
1816 }
1817
1818 break;
1819 }
1820
1821 aio_entry_ref(entryp);
1822
1823 OSIncrementAtomic(&aio_anchor.aio_inflight_count);
1824 return( entryp );
1825
1826 nowork:
1827 /* We will wake up when someone enqueues something */
1828 wait_queue_assert_wait(queue->aioq_waitq, queue, THREAD_UNINT, 0);
1829 aio_workq_unlock(queue);
1830 thread_block( (thread_continue_t)aio_work_thread );
1831
1832 // notreached
1833 return NULL;
1834 }
1835
1836 /*
1837 * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed.
1838 * A big, simple hammer: only send it off if it's the most recently filed IO which has
1839 * not been completed.
1840 */
1841 static boolean_t
1842 aio_delay_fsync_request( aio_workq_entry *entryp )
1843 {
1844 if (entryp == TAILQ_FIRST(&entryp->procp->p_aio_activeq)) {
1845 return FALSE;
1846 }
1847
1848 return TRUE;
1849 } /* aio_delay_fsync_request */
1850
1851 static aio_workq_entry *
1852 aio_create_queue_entry(proc_t procp, user_addr_t aiocbp, void *group_tag, int kindOfIO)
1853 {
1854 aio_workq_entry *entryp;
1855 int result = 0;
1856
1857 entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1858 if ( entryp == NULL ) {
1859 result = EAGAIN;
1860 goto error_exit;
1861 }
1862
1863 bzero( entryp, sizeof(*entryp) );
1864
1865 /* fill in the rest of the aio_workq_entry */
1866 entryp->procp = procp;
1867 entryp->uaiocbp = aiocbp;
1868 entryp->flags |= kindOfIO;
1869 entryp->group_tag = group_tag;
1870 entryp->aio_map = VM_MAP_NULL;
1871 entryp->aio_refcount = 0;
1872
1873 if ( proc_is64bit(procp) ) {
1874 struct user64_aiocb aiocb64;
1875
1876 result = copyin( aiocbp, &aiocb64, sizeof(aiocb64) );
1877 if (result == 0 )
1878 do_munge_aiocb_user64_to_user(&aiocb64, &entryp->aiocb);
1879
1880 } else {
1881 struct user32_aiocb aiocb32;
1882
1883 result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
1884 if ( result == 0 )
1885 do_munge_aiocb_user32_to_user( &aiocb32, &entryp->aiocb );
1886 }
1887
1888 if ( result != 0 ) {
1889 result = EAGAIN;
1890 goto error_exit;
1891 }
1892
1893 /* get a reference to the user land map in order to keep it around */
1894 entryp->aio_map = get_task_map( procp->task );
1895 vm_map_reference( entryp->aio_map );
1896
1897 /* do some more validation on the aiocb and embedded file descriptor */
1898 result = aio_validate( entryp );
1899
1900 error_exit:
1901 if ( result && entryp != NULL ) {
1902 zfree( aio_workq_zonep, entryp );
1903 entryp = NULL;
1904 }
1905
1906 return ( entryp );
1907 }
1908
1909
1910 /*
1911 * aio_queue_async_request - queue up an async IO request on our work queue then
1912 * wake up one of our worker threads to do the actual work. We get a reference
1913 * to our caller's user land map in order to keep it around while we are
1914 * processing the request.
1915 */
1916 static int
1917 aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO )
1918 {
1919 aio_workq_entry *entryp;
1920 int result;
1921 int old_count;
1922
1923 old_count = aio_increment_total_count();
1924 if (old_count >= aio_max_requests) {
1925 result = EAGAIN;
1926 goto error_noalloc;
1927 }
1928
1929 entryp = aio_create_queue_entry( procp, aiocbp, 0, kindOfIO);
1930 if ( entryp == NULL ) {
1931 result = EAGAIN;
1932 goto error_noalloc;
1933 }
1934
1935
1936 aio_proc_lock_spin(procp);
1937
1938 if ( is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
1939 result = EAGAIN;
1940 goto error_exit;
1941 }
1942
1943 /* check our aio limits to throttle bad or rude user land behavior */
1944 if (aio_get_process_count( procp ) >= aio_max_requests_per_process) {
1945 printf("aio_queue_async_request(): too many in flight for proc: %d.\n", procp->p_aio_total_count);
1946 result = EAGAIN;
1947 goto error_exit;
1948 }
1949
1950 /* Add the IO to proc and work queues, wake up threads as appropriate */
1951 lck_mtx_convert_spin(aio_proc_mutex(procp));
1952 aio_enqueue_work(procp, entryp, 1);
1953
1954 aio_proc_unlock(procp);
1955
1956 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
1957 (int)procp, (int)aiocbp, 0, 0, 0 );
1958
1959 return( 0 );
1960
1961 error_exit:
1962 /*
1963 * This entry has not been queued up so no worries about
1964 * unlocked state and aio_map
1965 */
1966 aio_proc_unlock(procp);
1967 aio_free_request(entryp);
1968
1969 error_noalloc:
1970 aio_decrement_total_count();
1971
1972 return( result );
1973
1974 } /* aio_queue_async_request */
1975
1976
1977 /*
1978 * lio_create_entry
1979 *
1980 * Allocate an aio_workq_entry and fill it in. If all goes well return 0
1981 * and pass the aio_workq_entry pointer back to our caller.
1982 *
1983 * Parameters: procp The process makign the request
1984 * aiocbp The aio context buffer pointer
1985 * group_tag The group tag used to indicate a
1986 * group of operations has completed
1987 * entrypp Pointer to the pointer to receive the
1988 * address of the created aio_workq_entry
1989 *
1990 * Returns: 0 Successfully created
1991 * EAGAIN Try again (usually resource shortage)
1992 *
1993 *
1994 * Notes: We get a reference to our caller's user land map in order
1995 * to keep it around while we are processing the request.
1996 *
1997 * lio_listio calls behave differently at completion they do
1998 * completion notification when all async IO requests have
1999 * completed. We use group_tag to tag IO requests that behave
2000 * in the delay notification manner.
2001 *
2002 * All synchronous operations are considered to not have a
2003 * signal routine associated with them (sigp == USER_ADDR_NULL).
2004 */
2005 static int
2006 lio_create_entry(proc_t procp, user_addr_t aiocbp, void *group_tag,
2007 aio_workq_entry **entrypp )
2008 {
2009 aio_workq_entry *entryp;
2010 int result;
2011
2012 entryp = aio_create_queue_entry( procp, aiocbp, group_tag, AIO_LIO);
2013 if ( entryp == NULL ) {
2014 result = EAGAIN;
2015 goto error_exit;
2016 }
2017
2018 /*
2019 * Look for lio_listio LIO_NOP requests and ignore them; this is
2020 * not really an error, but we need to free our aio_workq_entry.
2021 */
2022 if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
2023 result = 0;
2024 goto error_exit;
2025 }
2026
2027 *entrypp = entryp;
2028 return( 0 );
2029
2030 error_exit:
2031
2032 if ( entryp != NULL ) {
2033 /*
2034 * This entry has not been queued up so no worries about
2035 * unlocked state and aio_map
2036 */
2037 aio_free_request(entryp);
2038 }
2039
2040 return( result );
2041
2042 } /* lio_create_entry */
2043
2044
2045 /*
2046 * aio_free_request - remove our reference on the user land map and
2047 * free the work queue entry resources. The entry is off all lists
2048 * and has zero refcount, so no one can have a pointer to it.
2049 */
2050
2051 static int
2052 aio_free_request(aio_workq_entry *entryp)
2053 {
2054 /* remove our reference to the user land map. */
2055 if ( VM_MAP_NULL != entryp->aio_map) {
2056 vm_map_deallocate(entryp->aio_map);
2057 }
2058
2059 entryp->aio_refcount = -1; /* A bit of poisoning in case of bad refcounting. */
2060
2061 zfree( aio_workq_zonep, entryp );
2062
2063 return( 0 );
2064
2065 } /* aio_free_request */
2066
2067
2068 /*
2069 * aio_validate
2070 *
2071 * validate the aiocb passed in by one of the aio syscalls.
2072 */
2073 static int
2074 aio_validate( aio_workq_entry *entryp )
2075 {
2076 struct fileproc *fp;
2077 int flag;
2078 int result;
2079
2080 result = 0;
2081
2082 if ( (entryp->flags & AIO_LIO) != 0 ) {
2083 if ( entryp->aiocb.aio_lio_opcode == LIO_READ )
2084 entryp->flags |= AIO_READ;
2085 else if ( entryp->aiocb.aio_lio_opcode == LIO_WRITE )
2086 entryp->flags |= AIO_WRITE;
2087 else if ( entryp->aiocb.aio_lio_opcode == LIO_NOP )
2088 return( 0 );
2089 else
2090 return( EINVAL );
2091 }
2092
2093 flag = FREAD;
2094 if ( (entryp->flags & (AIO_WRITE | AIO_FSYNC | AIO_DSYNC)) != 0 ) {
2095 flag = FWRITE;
2096 }
2097
2098 if ( (entryp->flags & (AIO_READ | AIO_WRITE)) != 0 ) {
2099 if ( entryp->aiocb.aio_nbytes > INT_MAX ||
2100 entryp->aiocb.aio_buf == USER_ADDR_NULL ||
2101 entryp->aiocb.aio_offset < 0 )
2102 return( EINVAL );
2103 }
2104
2105 /*
2106 * validate aiocb.aio_sigevent. at this point we only support
2107 * sigev_notify equal to SIGEV_SIGNAL or SIGEV_NONE. this means
2108 * sigev_value, sigev_notify_function, and sigev_notify_attributes
2109 * are ignored, since SIGEV_THREAD is unsupported. This is consistent
2110 * with no [RTS] (RalTime Signal) option group support.
2111 */
2112 switch ( entryp->aiocb.aio_sigevent.sigev_notify ) {
2113 case SIGEV_SIGNAL:
2114 {
2115 int signum;
2116
2117 /* make sure we have a valid signal number */
2118 signum = entryp->aiocb.aio_sigevent.sigev_signo;
2119 if ( signum <= 0 || signum >= NSIG ||
2120 signum == SIGKILL || signum == SIGSTOP )
2121 return (EINVAL);
2122 }
2123 break;
2124
2125 case SIGEV_NONE:
2126 break;
2127
2128 case SIGEV_THREAD:
2129 /* Unsupported [RTS] */
2130
2131 default:
2132 return (EINVAL);
2133 }
2134
2135 /* validate the file descriptor and that the file was opened
2136 * for the appropriate read / write access.
2137 */
2138 proc_fdlock(entryp->procp);
2139
2140 result = fp_lookup( entryp->procp, entryp->aiocb.aio_fildes, &fp , 1);
2141 if ( result == 0 ) {
2142 if ( (fp->f_fglob->fg_flag & flag) == 0 ) {
2143 /* we don't have read or write access */
2144 result = EBADF;
2145 }
2146 else if ( fp->f_fglob->fg_type != DTYPE_VNODE ) {
2147 /* this is not a file */
2148 result = ESPIPE;
2149 } else
2150 fp->f_flags |= FP_AIOISSUED;
2151
2152 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp , 1);
2153 }
2154 else {
2155 result = EBADF;
2156 }
2157
2158 proc_fdunlock(entryp->procp);
2159
2160 return( result );
2161
2162 } /* aio_validate */
2163
2164 static int
2165 aio_increment_total_count()
2166 {
2167 return OSIncrementAtomic(&aio_anchor.aio_total_count);
2168 }
2169
2170 static int
2171 aio_decrement_total_count()
2172 {
2173 int old = OSDecrementAtomic(&aio_anchor.aio_total_count);
2174 if (old <= 0) {
2175 panic("Negative total AIO count!\n");
2176 }
2177
2178 return old;
2179 }
2180
2181 static int
2182 aio_get_process_count(proc_t procp )
2183 {
2184 return procp->p_aio_total_count;
2185
2186 } /* aio_get_process_count */
2187
2188 static int
2189 aio_get_all_queues_count( void )
2190 {
2191 return aio_anchor.aio_total_count;
2192
2193 } /* aio_get_all_queues_count */
2194
2195
2196 /*
2197 * do_aio_completion. Handle async IO completion.
2198 */
2199 static void
2200 do_aio_completion( aio_workq_entry *entryp )
2201 {
2202
2203 boolean_t lastLioCompleted = FALSE;
2204 aio_lio_context *lio_context = NULL;
2205 int waiter = 0;
2206
2207 lio_context = (aio_lio_context *)entryp->group_tag;
2208
2209 if (lio_context != NULL) {
2210
2211 aio_proc_lock_spin(entryp->procp);
2212
2213 /* Account for this I/O completing. */
2214 lio_context->io_completed++;
2215
2216 /* Are we done with this lio context? */
2217 if (lio_context->io_issued == lio_context->io_completed) {
2218 lastLioCompleted = TRUE;
2219 }
2220
2221 waiter = lio_context->io_waiter;
2222
2223 /* explicit wakeup of lio_listio() waiting in LIO_WAIT */
2224 if ((entryp->flags & AIO_LIO_NOTIFY) && (lastLioCompleted) && (waiter != 0)) {
2225 /* wake up the waiter */
2226 wakeup(lio_context);
2227 }
2228
2229 aio_proc_unlock(entryp->procp);
2230 }
2231
2232 if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
2233 (entryp->flags & AIO_DISABLE) == 0 ) {
2234
2235 boolean_t performSignal = FALSE;
2236 if (lio_context == NULL) {
2237 performSignal = TRUE;
2238 }
2239 else {
2240 /*
2241 * If this was the last request in the group and a signal
2242 * is desired, send one.
2243 */
2244 performSignal = lastLioCompleted;
2245 }
2246
2247 if (performSignal) {
2248
2249 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) | DBG_FUNC_NONE,
2250 (int)entryp->procp, (int)entryp->uaiocbp,
2251 entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 );
2252
2253 psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );
2254 }
2255 }
2256
2257 if ((entryp->flags & AIO_EXIT_WAIT) && (entryp->flags & AIO_CLOSE_WAIT)) {
2258 panic("Close and exit flags set at the same time\n");
2259 }
2260
2261 /*
2262 * need to handle case where a process is trying to exit, exec, or
2263 * close and is currently waiting for active aio requests to complete.
2264 * If AIO_CLEANUP_WAIT is set then we need to look to see if there are any
2265 * other requests in the active queue for this process. If there are
2266 * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel.
2267 * If there are some still active then do nothing - we only want to
2268 * wakeup when all active aio requests for the process are complete.
2269 *
2270 * Don't need to lock the entry or proc to check the cleanup flag. It can only be
2271 * set for cancellation, while the entryp is still on a proc list; now it's
2272 * off, so that flag is already set if it's going to be.
2273 */
2274 if ( (entryp->flags & AIO_EXIT_WAIT) != 0 ) {
2275 int active_requests;
2276
2277 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
2278 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
2279
2280 aio_proc_lock_spin(entryp->procp);
2281 active_requests = aio_active_requests_for_process( entryp->procp );
2282 if ( active_requests < 1 ) {
2283 /*
2284 * no active aio requests for this process, continue exiting. In this
2285 * case, there should be no one else waiting ont he proc in AIO...
2286 */
2287 wakeup_one((caddr_t)&entryp->procp->AIO_CLEANUP_SLEEP_CHAN);
2288 aio_proc_unlock(entryp->procp);
2289
2290 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
2291 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
2292 } else {
2293 aio_proc_unlock(entryp->procp);
2294 }
2295 }
2296
2297 if ( (entryp->flags & AIO_CLOSE_WAIT) != 0 ) {
2298 int active_requests;
2299
2300 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
2301 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
2302
2303 aio_proc_lock_spin(entryp->procp);
2304 active_requests = aio_proc_active_requests_for_file( entryp->procp, entryp->aiocb.aio_fildes);
2305 if ( active_requests < 1 ) {
2306 /* Can't wakeup_one(); multiple closes might be in progress. */
2307 wakeup(&entryp->procp->AIO_CLEANUP_SLEEP_CHAN);
2308 aio_proc_unlock(entryp->procp);
2309
2310 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
2311 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
2312 } else {
2313 aio_proc_unlock(entryp->procp);
2314 }
2315 }
2316 /*
2317 * A thread in aio_suspend() wants to known about completed IOs. If it checked
2318 * the done list before we moved our AIO there, then it already asserted its wait,
2319 * and we can wake it up without holding the lock. If it checked the list after
2320 * we did our move, then it already has seen the AIO that we moved. Herego, we
2321 * can do our wakeup without holding the lock.
2322 */
2323 wakeup( (caddr_t) &entryp->procp->AIO_SUSPEND_SLEEP_CHAN );
2324 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE,
2325 (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
2326
2327 /*
2328 * free the LIO context if the last lio completed and no thread is
2329 * waiting
2330 */
2331 if (lastLioCompleted && (waiter == 0))
2332 free_lio_context (lio_context);
2333
2334
2335 } /* do_aio_completion */
2336
2337
2338 /*
2339 * do_aio_read
2340 */
2341 static int
2342 do_aio_read( aio_workq_entry *entryp )
2343 {
2344 struct fileproc *fp;
2345 int error;
2346 struct vfs_context context;
2347
2348 if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp , 0)) )
2349 return(error);
2350 if ( (fp->f_fglob->fg_flag & FREAD) == 0 ) {
2351 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2352 return(EBADF);
2353 }
2354
2355 /*
2356 * <rdar://4714366>
2357 * Needs vfs_context_t from vfs_context_create() in entryp!
2358 */
2359 context.vc_thread = proc_thread(entryp->procp); /* XXX */
2360 context.vc_ucred = fp->f_fglob->fg_cred;
2361
2362 error = dofileread(&context, fp,
2363 entryp->aiocb.aio_buf,
2364 entryp->aiocb.aio_nbytes,
2365 entryp->aiocb.aio_offset, FOF_OFFSET,
2366 &entryp->returnval);
2367 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2368
2369 return( error );
2370
2371 } /* do_aio_read */
2372
2373
2374 /*
2375 * do_aio_write
2376 */
2377 static int
2378 do_aio_write( aio_workq_entry *entryp )
2379 {
2380 struct fileproc *fp;
2381 int error, flags;
2382 struct vfs_context context;
2383
2384 if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp , 0)) )
2385 return(error);
2386 if ( (fp->f_fglob->fg_flag & FWRITE) == 0 ) {
2387 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2388 return(EBADF);
2389 }
2390
2391 flags = FOF_PCRED;
2392 if ( (fp->f_fglob->fg_flag & O_APPEND) == 0 ) {
2393 flags |= FOF_OFFSET;
2394 }
2395
2396 /*
2397 * <rdar://4714366>
2398 * Needs vfs_context_t from vfs_context_create() in entryp!
2399 */
2400 context.vc_thread = proc_thread(entryp->procp); /* XXX */
2401 context.vc_ucred = fp->f_fglob->fg_cred;
2402
2403 /* NB: tell dofilewrite the offset, and to use the proc cred */
2404 error = dofilewrite(&context,
2405 fp,
2406 entryp->aiocb.aio_buf,
2407 entryp->aiocb.aio_nbytes,
2408 entryp->aiocb.aio_offset,
2409 flags,
2410 &entryp->returnval);
2411
2412 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2413
2414 return( error );
2415
2416 } /* do_aio_write */
2417
2418
2419 /*
2420 * aio_active_requests_for_process - return number of active async IO
2421 * requests for the given process.
2422 */
2423 static int
2424 aio_active_requests_for_process(proc_t procp )
2425 {
2426 return( procp->p_aio_active_count );
2427
2428 } /* aio_active_requests_for_process */
2429
2430 /*
2431 * Called with the proc locked.
2432 */
2433 static int
2434 aio_proc_active_requests_for_file(proc_t procp, int fd)
2435 {
2436 int count = 0;
2437 aio_workq_entry *entryp;
2438 TAILQ_FOREACH(entryp, &procp->p_aio_activeq, aio_proc_link) {
2439 if (entryp->aiocb.aio_fildes == fd) {
2440 count++;
2441 }
2442 }
2443
2444 return count;
2445 } /* aio_active_requests_for_process */
2446
2447
2448
2449 /*
2450 * do_aio_fsync
2451 */
2452 static int
2453 do_aio_fsync( aio_workq_entry *entryp )
2454 {
2455 struct vfs_context context;
2456 struct vnode *vp;
2457 struct fileproc *fp;
2458 int sync_flag;
2459 int error;
2460
2461 /*
2462 * We are never called unless either AIO_FSYNC or AIO_DSYNC are set.
2463 *
2464 * If AIO_DSYNC is set, we can tell the lower layers that it is OK
2465 * to mark for update the metadata not strictly necessary for data
2466 * retrieval, rather than forcing it to disk.
2467 *
2468 * If AIO_FSYNC is set, we have to also wait for metadata not really
2469 * necessary to data retrival are committed to stable storage (e.g.
2470 * atime, mtime, ctime, etc.).
2471 *
2472 * Metadata necessary for data retrieval ust be committed to stable
2473 * storage in either case (file length, etc.).
2474 */
2475 if (entryp->flags & AIO_FSYNC)
2476 sync_flag = MNT_WAIT;
2477 else
2478 sync_flag = MNT_DWAIT;
2479
2480 error = fp_getfvp( entryp->procp, entryp->aiocb.aio_fildes, &fp, &vp);
2481 if ( error == 0 ) {
2482 if ( (error = vnode_getwithref(vp)) ) {
2483 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2484 entryp->returnval = -1;
2485 return(error);
2486 }
2487 context.vc_thread = current_thread();
2488 context.vc_ucred = fp->f_fglob->fg_cred;
2489
2490 error = VNOP_FSYNC( vp, sync_flag, &context);
2491
2492 (void)vnode_put(vp);
2493
2494 fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2495 }
2496 if ( error != 0 )
2497 entryp->returnval = -1;
2498
2499 return( error );
2500
2501 } /* do_aio_fsync */
2502
2503
2504 /*
2505 * is_already_queued - runs through our queues to see if the given
2506 * aiocbp / process is there. Returns TRUE if there is a match
2507 * on any of our aio queues.
2508 *
2509 * Called with proc aio lock held (can be held spin)
2510 */
2511 static boolean_t
2512 is_already_queued(proc_t procp,
2513 user_addr_t aiocbp )
2514 {
2515 aio_workq_entry *entryp;
2516 boolean_t result;
2517
2518 result = FALSE;
2519
2520 /* look for matches on our queue of async IO requests that have completed */
2521 TAILQ_FOREACH( entryp, &procp->p_aio_doneq, aio_proc_link ) {
2522 if ( aiocbp == entryp->uaiocbp ) {
2523 result = TRUE;
2524 goto ExitThisRoutine;
2525 }
2526 }
2527
2528 /* look for matches on our queue of active async IO requests */
2529 TAILQ_FOREACH( entryp, &procp->p_aio_activeq, aio_proc_link ) {
2530 if ( aiocbp == entryp->uaiocbp ) {
2531 result = TRUE;
2532 goto ExitThisRoutine;
2533 }
2534 }
2535
2536 ExitThisRoutine:
2537 return( result );
2538
2539 } /* is_already_queued */
2540
2541
2542 static void
2543 free_lio_context(aio_lio_context* context)
2544 {
2545
2546 #if DEBUG
2547 OSDecrementAtomic(&lio_contexts_alloced);
2548 #endif /* DEBUG */
2549
2550 FREE( context, M_TEMP );
2551
2552 } /* free_lio_context */
2553
2554
2555 /*
2556 * aio initialization
2557 */
2558 __private_extern__ void
2559 aio_init( void )
2560 {
2561 int i;
2562
2563 aio_lock_grp_attr = lck_grp_attr_alloc_init();
2564 aio_proc_lock_grp = lck_grp_alloc_init("aio_proc", aio_lock_grp_attr);;
2565 aio_entry_lock_grp = lck_grp_alloc_init("aio_entry", aio_lock_grp_attr);;
2566 aio_queue_lock_grp = lck_grp_alloc_init("aio_queue", aio_lock_grp_attr);;
2567 aio_lock_attr = lck_attr_alloc_init();
2568
2569 lck_mtx_init(&aio_entry_mtx, aio_entry_lock_grp, aio_lock_attr);
2570 lck_mtx_init(&aio_proc_mtx, aio_proc_lock_grp, aio_lock_attr);
2571
2572 aio_anchor.aio_inflight_count = 0;
2573 aio_anchor.aio_done_count = 0;
2574 aio_anchor.aio_total_count = 0;
2575 aio_anchor.aio_num_workqs = AIO_NUM_WORK_QUEUES;
2576
2577 for (i = 0; i < AIO_NUM_WORK_QUEUES; i++) {
2578 aio_workq_init(&aio_anchor.aio_async_workqs[i]);
2579 }
2580
2581
2582 i = sizeof( aio_workq_entry );
2583 aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );
2584
2585 _aio_create_worker_threads( aio_worker_threads );
2586
2587 } /* aio_init */
2588
2589
2590 /*
2591 * aio worker threads created here.
2592 */
2593 __private_extern__ void
2594 _aio_create_worker_threads( int num )
2595 {
2596 int i;
2597
2598 /* create some worker threads to handle the async IO requests */
2599 for ( i = 0; i < num; i++ ) {
2600 thread_t myThread;
2601
2602 if ( KERN_SUCCESS != kernel_thread_start((thread_continue_t)aio_work_thread, NULL, &myThread) ) {
2603 printf( "%s - failed to create a work thread \n", __FUNCTION__ );
2604 }
2605 else
2606 thread_deallocate(myThread);
2607 }
2608
2609 return;
2610
2611 } /* _aio_create_worker_threads */
2612
2613 /*
2614 * Return the current activation utask
2615 */
2616 task_t
2617 get_aiotask(void)
2618 {
2619 return ((struct uthread *)get_bsdthread_info(current_thread()))->uu_aio_task;
2620 }
2621
2622
2623 /*
2624 * In the case of an aiocb from a
2625 * 32-bit process we need to expand some longs and pointers to the correct
2626 * sizes in order to let downstream code always work on the same type of
2627 * aiocb (in our case that is a user_aiocb)
2628 */
2629 static void
2630 do_munge_aiocb_user32_to_user( struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
2631 {
2632 the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2633 the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2634 the_user_aiocbp->aio_buf = CAST_USER_ADDR_T(my_aiocbp->aio_buf);
2635 the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2636 the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2637 the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
2638
2639 /* special case here. since we do not know if sigev_value is an */
2640 /* int or a ptr we do NOT cast the ptr to a user_addr_t. This */
2641 /* means if we send this info back to user space we need to remember */
2642 /* sigev_value was not expanded for the 32-bit case. */
2643 /* NOTE - this does NOT affect us since we don't support sigev_value */
2644 /* yet in the aio context. */
2645 //LP64
2646 the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2647 the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
2648 the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2649 my_aiocbp->aio_sigevent.sigev_value.sival_int;
2650 the_user_aiocbp->aio_sigevent.sigev_notify_function =
2651 CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function);
2652 the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2653 CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_attributes);
2654 }
2655
2656 /* Similar for 64-bit user process, so that we don't need to satisfy
2657 * the alignment constraints of the original user64_aiocb
2658 */
2659 static void
2660 do_munge_aiocb_user64_to_user( struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
2661 {
2662 the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2663 the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2664 the_user_aiocbp->aio_buf = my_aiocbp->aio_buf;
2665 the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2666 the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2667 the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
2668
2669 the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2670 the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
2671 the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2672 my_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int;
2673 the_user_aiocbp->aio_sigevent.sigev_notify_function =
2674 my_aiocbp->aio_sigevent.sigev_notify_function;
2675 the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2676 my_aiocbp->aio_sigevent.sigev_notify_attributes;
2677 }