/*
 * bsd/dev/i386/systemcalls.c
 * From apple/xnu (blob e8494ca4eb4b5c0742a8c4eab7d494b197bac71d)
 */
1 /*
2 * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 #include <kern/task.h>
29 #include <kern/thread.h>
30 #include <kern/assert.h>
31 #include <kern/clock.h>
32 #include <kern/locks.h>
33 #include <kern/sched_prim.h>
34 #include <kern/debug.h>
35 #include <mach/machine/thread_status.h>
36 #include <mach/thread_act.h>
37 #include <mach/branch_predicates.h>
38
39 #include <sys/kernel.h>
40 #include <sys/vm.h>
41 #include <sys/proc_internal.h>
42 #include <sys/syscall.h>
43 #include <sys/systm.h>
44 #include <sys/user.h>
45 #include <sys/errno.h>
46 #include <sys/kdebug.h>
47 #include <sys/sysent.h>
48 #include <sys/sysproto.h>
49 #include <sys/kauth.h>
50 #include <sys/systm.h>
51
52 #include <security/audit/audit.h>
53
54 #include <i386/seg.h>
55 #include <i386/machine_routines.h>
56 #include <mach/i386/syscall_sw.h>
57
58 #include <machine/pal_routines.h>
59
60 #if CONFIG_DTRACE
61 extern int32_t dtrace_systrace_syscall(struct proc *, void *, int *);
62 extern void dtrace_systrace_syscall_return(unsigned short, int, int *);
63 #endif
64
65 extern void unix_syscall(x86_saved_state_t *);
66 extern void unix_syscall64(x86_saved_state_t *);
67 extern void *find_user_regs(thread_t);
68
69 extern void x86_toggle_sysenter_arg_store(thread_t thread, boolean_t valid);
70 extern boolean_t x86_sysenter_arg_store_isvalid(thread_t thread);
71
72 /* dynamically generated at build time based on syscalls.master */
73 extern const char *syscallnames[];
74
75 /*
76 * This needs to be a single switch so that it's "all on" or "all off",
77 * rather than being turned on for some code paths and not others, as this
78 * has a tendency to introduce "blame the next guy" bugs.
79 */
80 #if DEBUG
81 #define FUNNEL_DEBUG 1 /* Check for funnel held on exit */
82 #endif
83
/*
 * Function: unix_syscall
 *
 *	Dispatch a BSD system call arriving from 32-bit user space
 *	(sysenter or int 0x80).  Decodes the call number from the saved
 *	EAX, copies in arguments from the user stack (unless the sysenter
 *	path already stashed them in the uthread), invokes the handler
 *	from the sysent table, and writes the result back into the saved
 *	register state before returning to user mode.
 *
 * Inputs:	state - pointer to i386 save area (must be a 32-bit state)
 *
 * Outputs:	none; exits via thread_exception_return(), never returns
 *		to its caller
 */
void
unix_syscall(x86_saved_state_t *state)
{
	thread_t		thread;
	void			*vt;
	unsigned int		code;
	struct sysent		*callp;

	int			error;
	vm_offset_t		params;
	struct proc		*p;
	struct uthread		*uthread;
	x86_saved_state32_t	*regs;
	boolean_t		args_in_uthread;
	boolean_t		is_vfork;

	assert(is_saved_state32(state));
	regs = saved_state32(state);
#if DEBUG
	/* Debug-only escape hatch: magic call number 0x800 is a no-op. */
	if (regs->eax == 0x800)
		thread_exception_return();
#endif
	thread = current_thread();
	uthread = get_bsdthread_info(thread);

	/* Get the appropriate proc; may be different from task's for vfork() */
	is_vfork = uthread->uu_flag & UT_VFORK;
	if (__improbable(is_vfork != 0))
		p = current_proc();
	else
		p = (struct proc *)get_bsdtask_info(current_task());

	/* Verify that we are not being called from a task without a proc */
	if (__improbable(p == NULL)) {
		regs->eax = EPERM;
		regs->efl |= EFL_CF;	/* carry flag set reports failure to user */
		task_terminate_internal(current_task());
		thread_exception_return();
		/* NOTREACHED */
	}

	code = regs->eax & I386_SYSCALL_NUMBER_MASK;
	DEBUG_KPRINT_SYSCALL_UNIX("unix_syscall: code=%d(%s) eip=%u\n",
							  code, syscallnames[code >= NUM_SYSENT ? 63 : code], (uint32_t)regs->eip);
	/*
	 * The sysenter trampoline may have pre-stored the arguments in the
	 * uthread (flagged via the arg-bytes bits of EAX); if that store is
	 * still valid we can skip the copyin below.
	 */
	args_in_uthread = ((regs->eax & I386_SYSCALL_ARG_BYTES_MASK) != 0) && x86_sysenter_arg_store_isvalid(thread);
	/* Arguments sit just above the return address on the user stack. */
	params = (vm_offset_t) (regs->uesp + sizeof (int));

	regs->efl &= ~(EFL_CF);		/* assume success; CF is re-set on error */

	/* Out-of-range call numbers are redirected to sysent[63] (nosys). */
	callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];

	if (__improbable(callp == sysent)) {
		/*
		 * Indirect system call (number 0): the real call number is
		 * the first word on the user stack; advance params past it.
		 */
		code = fuword(params);
		params += sizeof(int);
		callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
	}

	vt = (void *)uthread->uu_arg;	/* per-thread argument staging buffer */

	if (callp->sy_arg_bytes != 0) {
		sy_munge_t	*mungerp;

		assert((unsigned) callp->sy_arg_bytes <= sizeof (uthread->uu_arg));
		if (!args_in_uthread)
		{
			uint32_t nargs;
			nargs = callp->sy_arg_bytes;
			error = copyin((user_addr_t) params, (char *) vt, nargs);
			if (error) {
				regs->eax = error;
				regs->efl |= EFL_CF;
				thread_exception_return();
				/* NOTREACHED */
			}
		}

		/* code 180 (kdebug's own trace syscall) is excluded from tracing */
		if (__probable(code != 180)) {
			int *ip = (int *)vt;

			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
				BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
				*ip, *(ip+1), *(ip+2), *(ip+3), 0);
		}
		mungerp = callp->sy_arg_munge32;

		/*
		 * If non-NULL, then call the syscall argument munger to
		 * copy in arguments (see xnu/bsd/dev/{i386|x86_64}/munge.s); the
		 * first argument is NULL because we are munging in place
		 * after a copyin because the ABI currently doesn't use
		 * registers to pass system call arguments.
		 */
		if (mungerp != NULL)
			(*mungerp)(NULL, vt);
	} else
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
			0, 0, 0, 0, 0);

	/*
	 * Delayed binding of thread credential to process credential, if we
	 * are not running with an explicitly set thread credential.
	 */
	kauth_cred_uthread_update(uthread, p);

	uthread->uu_rval[0] = 0;
	/* Default rval[1] to the user's EDX so one-value returns leave it intact. */
	uthread->uu_rval[1] = regs->edx;
	uthread->uu_flag |= UT_NOTCANCELPT;	/* not at a cancelation point */


#ifdef JOE_DEBUG
	uthread->uu_iocount = 0;
	uthread->uu_vpindex = 0;
#endif

	AUDIT_SYSCALL_ENTER(code, p, uthread);
	error = (*(callp->sy_call))((void *) p, (void *) vt, &(uthread->uu_rval[0]));
	AUDIT_SYSCALL_EXIT(code, p, uthread, error);

#ifdef JOE_DEBUG
	if (uthread->uu_iocount)
		printf("system call returned with uu_iocount != 0\n");
#endif
#if CONFIG_DTRACE
	uthread->t_dtrace_errno = error;	/* expose errno to dtrace probes */
#endif /* CONFIG_DTRACE */

	if (__improbable(error == ERESTART)) {
		/*
		 * Move the user's pc back to repeat the syscall:
		 * 5 bytes for a sysenter, or 2 for an int 8x.
		 * The SYSENTER_TF_CS covers single-stepping over a sysenter
		 * - see debug trap handler in idt.s/idt64.s
		 */

		pal_syscall_restart(thread, state);
	}
	else if (error != EJUSTRETURN) {
		if (__improbable(error)) {
			regs->eax = error;
			regs->efl |= EFL_CF;	/* carry bit */
		} else { /* (not error) */
			regs->eax = uthread->uu_rval[0];
			regs->edx = uthread->uu_rval[1];
		}
	}

	DEBUG_KPRINT_SYSCALL_UNIX(
		"unix_syscall: error=%d retval=(%u,%u)\n",
		error, regs->eax, regs->edx);

	uthread->uu_flag &= ~UT_NOTCANCELPT;
#if FUNNEL_DEBUG
	/*
	 * if we're holding the funnel panic
	 */
	syscall_exit_funnelcheck();
#endif /* FUNNEL_DEBUG */

	if (__improbable(uthread->uu_lowpri_window)) {
		/*
		 * task is marked as a low priority I/O type
		 * and the I/O we issued while in this system call
		 * collided with normal I/O operations... we'll
		 * delay in order to mitigate the impact of this
		 * task on the normal operation of the system
		 */
		throttle_lowpri_io(TRUE);
	}
	if (__probable(code != 180))
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
			error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0);

	/*
	 * A successful execve() (when not in a vfork child) needs PAL
	 * fixup before returning to the freshly exec'd image.
	 */
	if (__improbable(!is_vfork && callp->sy_call == (sy_call_t *)execve && !error)) {
		pal_execve_return(thread);
	}

	thread_exception_return();
	/* NOTREACHED */
}
272
273
/*
 * Function: unix_syscall64
 *
 *	Dispatch a BSD system call arriving from 64-bit user space via
 *	the syscall instruction.  The first six arguments arrive in
 *	registers (starting at rdi); any additional arguments are copied
 *	in from the user stack.  Exits via thread_exception_return(),
 *	never returns to its caller.
 *
 * Inputs:	state - pointer to x86 save area (must be a 64-bit state)
 *
 * Outputs:	none
 */
void
unix_syscall64(x86_saved_state_t *state)
{
	thread_t	thread;
	unsigned int	code;
	struct sysent	*callp;
	void		*uargp;
	int		args_in_regs;
	int		error;
	struct proc	*p;
	struct uthread	*uthread;
	x86_saved_state64_t *regs;

	assert(is_saved_state64(state));
	regs = saved_state64(state);
#if	DEBUG
	/* Debug-only escape hatch: magic call number is a no-op. */
	if (regs->rax == 0x2000800)
		thread_exception_return();
#endif
	thread = current_thread();
	uthread = get_bsdthread_info(thread);

	/* Get the appropriate proc; may be different from task's for vfork() */
	if (__probable(!(uthread->uu_flag & UT_VFORK)))
		p = (struct proc *)get_bsdtask_info(current_task());
	else
		p = current_proc();

	/* Verify that we are not being called from a task without a proc */
	if (__improbable(p == NULL)) {
		regs->rax = EPERM;
		regs->isf.rflags |= EFL_CF;	/* carry flag set reports failure */
		task_terminate_internal(current_task());
		thread_exception_return();
		/* NOTREACHED */
	}
	/* Up to six arguments are delivered in registers (rdi...r9). */
	args_in_regs = 6;

	code = regs->rax & SYSCALL_NUMBER_MASK;
	DEBUG_KPRINT_SYSCALL_UNIX(
		"unix_syscall64: code=%d(%s) rip=%llx\n",
		code, syscallnames[code >= NUM_SYSENT ? 63 : code], regs->isf.rip);
	/* Out-of-range call numbers are redirected to sysent[63] (nosys). */
	callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
	uargp = (void *)(&regs->rdi);	/* args are contiguous in the save area */

	if (__improbable(callp == sysent)) {
		/*
		 * indirect system call... system call number
		 * passed as 'arg0'
		 */
		code = regs->rdi;
		callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
		uargp = (void *)(&regs->rsi);	/* real args start one reg later */
		args_in_regs = 5;
	}

	if (callp->sy_narg != 0) {
		/* code 180 (kdebug's own trace syscall) is excluded from tracing */
		if (code != 180) {
			uint64_t *ip = (uint64_t *)uargp;

			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
				BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
				(int)(*ip), (int)(*(ip+1)), (int)(*(ip+2)), (int)(*(ip+3)), 0);
		}
		assert(callp->sy_narg <= 8);

		if (__improbable(callp->sy_narg > args_in_regs)) {
			int copyin_count;

			/* Remaining args are on the user stack, above the return addr. */
			copyin_count = (callp->sy_narg - args_in_regs) * sizeof(uint64_t);

			error = copyin((user_addr_t)(regs->isf.rsp + sizeof(user_addr_t)), (char *)&regs->v_arg6, copyin_count);
			if (error) {
				regs->rax = error;
				regs->isf.rflags |= EFL_CF;
				thread_exception_return();
				/* NOTREACHED */
			}
		}
		/*
		 * XXX Turn 64 bit unsafe calls into nosys()
		 */
		if (__improbable(callp->sy_flags & UNSAFE_64BIT)) {
			callp = &sysent[63];
			goto unsafe;	/* skips only the else-branch trace below */
		}
	} else
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
			0, 0, 0, 0, 0);
unsafe:

	/*
	 * Delayed binding of thread credential to process credential, if we
	 * are not running with an explicitly set thread credential.
	 */
	kauth_cred_uthread_update(uthread, p);

	uthread->uu_rval[0] = 0;
	uthread->uu_rval[1] = 0;


	uthread->uu_flag |= UT_NOTCANCELPT;	/* not at a cancelation point */

#ifdef JOE_DEBUG
	uthread->uu_iocount = 0;
	uthread->uu_vpindex = 0;
#endif

	AUDIT_SYSCALL_ENTER(code, p, uthread);
	error = (*(callp->sy_call))((void *) p, uargp, &(uthread->uu_rval[0]));
	AUDIT_SYSCALL_EXIT(code, p, uthread, error);

#ifdef JOE_DEBUG
	if (uthread->uu_iocount)
		printf("system call returned with uu_iocount != 0\n");
#endif

#if CONFIG_DTRACE
	uthread->t_dtrace_errno = error;	/* expose errno to dtrace probes */
#endif /* CONFIG_DTRACE */

	if (__improbable(error == ERESTART)) {
		/*
		 * all system calls come through via the syscall instruction
		 * in 64 bit mode... its 2 bytes in length
		 * move the user's pc back to repeat the syscall:
		 */
		pal_syscall_restart( thread, state );
	}
	else if (error != EJUSTRETURN) {
		if (__improbable(error)) {
			regs->rax = error;
			regs->isf.rflags |= EFL_CF;	/* carry bit */
		} else { /* (not error) */

			/* Widen the 32-bit rval pair per the call's return type. */
			switch (callp->sy_return_type) {
			case _SYSCALL_RET_INT_T:
				regs->rax = uthread->uu_rval[0];
				regs->rdx = uthread->uu_rval[1];
				break;
			case _SYSCALL_RET_UINT_T:
				regs->rax = ((u_int)uthread->uu_rval[0]);
				regs->rdx = ((u_int)uthread->uu_rval[1]);
				break;
			case _SYSCALL_RET_OFF_T:
			case _SYSCALL_RET_ADDR_T:
			case _SYSCALL_RET_SIZE_T:
			case _SYSCALL_RET_SSIZE_T:
			case _SYSCALL_RET_UINT64_T:
				/* 64-bit result spans both halves of uu_rval */
				regs->rax = *((uint64_t *)(&uthread->uu_rval[0]));
				regs->rdx = 0;
				break;
			case _SYSCALL_RET_NONE:
				break;
			default:
				panic("unix_syscall: unknown return type");
				break;
			}
			regs->isf.rflags &= ~EFL_CF;
		}
	}

	DEBUG_KPRINT_SYSCALL_UNIX(
		"unix_syscall64: error=%d retval=(%llu,%llu)\n",
		error, regs->rax, regs->rdx);

	uthread->uu_flag &= ~UT_NOTCANCELPT;

#if FUNNEL_DEBUG
	/*
	 * if we're holding the funnel panic
	 */
	syscall_exit_funnelcheck();
#endif /* FUNNEL_DEBUG */

	if (__improbable(uthread->uu_lowpri_window)) {
		/*
		 * task is marked as a low priority I/O type
		 * and the I/O we issued while in this system call
		 * collided with normal I/O operations... we'll
		 * delay in order to mitigate the impact of this
		 * task on the normal operation of the system
		 */
		throttle_lowpri_io(TRUE);
	}
	if (__probable(code != 180))
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
			error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0);

	thread_exception_return();
	/* NOTREACHED */
}
468
469
/*
 * Function: unix_syscall_return
 *
 *	Complete a system call that did not return normally through
 *	unix_syscall()/unix_syscall64() — presumably resumed from a
 *	continuation (NOTE(review): confirm against callers).
 *	Reconstructs the call number from the saved user registers,
 *	posts dtrace/audit results, stores the return value into the
 *	saved state, and exits to user space.
 *
 * Inputs:	error - errno-style result of the completed call
 *		(0, errno, ERESTART, or EJUSTRETURN)
 *
 * Outputs:	none; exits via thread_exception_return(), never returns
 */
void
unix_syscall_return(int error)
{
	thread_t	thread;
	struct uthread	*uthread;
	struct proc *p;
	unsigned int code;
	vm_offset_t params;
	struct sysent *callp;

	thread = current_thread();
	uthread = get_bsdthread_info(thread);

	pal_register_cache_state(thread, DIRTY);	/* user regs will be modified */

	p = current_proc();

	if (proc_is64bit(p)) {
		x86_saved_state64_t *regs;

		regs = saved_state64(find_user_regs(thread));

		/* reconstruct code for tracing before blasting rax */
		code = regs->rax & SYSCALL_NUMBER_MASK;
		callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];

		if (callp == sysent)
			/*
			 * indirect system call... system call number
			 * passed as 'arg0'
			 */
			code = regs->rdi;

#if CONFIG_DTRACE
		if (callp->sy_call == dtrace_systrace_syscall)
			dtrace_systrace_syscall_return( code, error, uthread->uu_rval );
#endif /* CONFIG_DTRACE */
		AUDIT_SYSCALL_EXIT(code, p, uthread, error);

		if (error == ERESTART) {
			/*
			 * repeat the syscall
			 */
			pal_syscall_restart( thread, find_user_regs(thread) );
		}
		else if (error != EJUSTRETURN) {
			if (error) {
				regs->rax = error;
				regs->isf.rflags |= EFL_CF;	/* carry bit */
			} else { /* (not error) */

				/* Widen the rval pair per the call's return type. */
				switch (callp->sy_return_type) {
				case _SYSCALL_RET_INT_T:
					regs->rax = uthread->uu_rval[0];
					regs->rdx = uthread->uu_rval[1];
					break;
				case _SYSCALL_RET_UINT_T:
					regs->rax = ((u_int)uthread->uu_rval[0]);
					regs->rdx = ((u_int)uthread->uu_rval[1]);
					break;
				case _SYSCALL_RET_OFF_T:
				case _SYSCALL_RET_ADDR_T:
				case _SYSCALL_RET_SIZE_T:
				case _SYSCALL_RET_SSIZE_T:
				case _SYSCALL_RET_UINT64_T:
					/* 64-bit result spans both halves of uu_rval */
					regs->rax = *((uint64_t *)(&uthread->uu_rval[0]));
					regs->rdx = 0;
					break;
				case _SYSCALL_RET_NONE:
					break;
				default:
					panic("unix_syscall: unknown return type");
					break;
				}
				regs->isf.rflags &= ~EFL_CF;
			}
		}
		DEBUG_KPRINT_SYSCALL_UNIX(
			"unix_syscall_return: error=%d retval=(%llu,%llu)\n",
			error, regs->rax, regs->rdx);
	} else {
		x86_saved_state32_t	*regs;

		regs = saved_state32(find_user_regs(thread));

		regs->efl &= ~(EFL_CF);		/* assume success; CF re-set on error */
		/* reconstruct code for tracing before blasting eax */
		code = regs->eax & I386_SYSCALL_NUMBER_MASK;
		callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];

#if CONFIG_DTRACE
		if (callp->sy_call == dtrace_systrace_syscall)
			dtrace_systrace_syscall_return( code, error, uthread->uu_rval );
#endif /* CONFIG_DTRACE */
		AUDIT_SYSCALL_EXIT(code, p, uthread, error);

		if (callp == sysent) {
			/* Indirect call: real number is the first word on the stack. */
			params = (vm_offset_t) (regs->uesp + sizeof (int));
			code = fuword(params);
		}
		if (error == ERESTART) {
			pal_syscall_restart( thread, find_user_regs(thread) );
		}
		else if (error != EJUSTRETURN) {
			if (error) {
				regs->eax = error;
				regs->efl |= EFL_CF;	/* carry bit */
			} else { /* (not error) */
				regs->eax = uthread->uu_rval[0];
				regs->edx = uthread->uu_rval[1];
			}
		}
		DEBUG_KPRINT_SYSCALL_UNIX(
			"unix_syscall_return: error=%d retval=(%u,%u)\n",
			error, regs->eax, regs->edx);
	}


	uthread->uu_flag &= ~UT_NOTCANCELPT;

#if FUNNEL_DEBUG
	/*
	 * if we're holding the funnel panic
	 */
	syscall_exit_funnelcheck();
#endif /* FUNNEL_DEBUG */

	if (uthread->uu_lowpri_window) {
		/*
		 * task is marked as a low priority I/O type
		 * and the I/O we issued while in this system call
		 * collided with normal I/O operations... we'll
		 * delay in order to mitigate the impact of this
		 * task on the normal operation of the system
		 */
		throttle_lowpri_io(TRUE);
	}
	/* code 180 (kdebug's own trace syscall) is excluded from tracing */
	if (code != 180)
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
			error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0);

	thread_exception_return();
	/* NOTREACHED */
}
615
void
munge_wwwlww(
	__unused const void *in32,
	void	*out64)
{
	uint32_t	*src = (uint32_t *) out64;	/* packed 32-bit input view */
	uint64_t	*dst = (uint64_t *) out64;	/* expanded 64-bit output view */

	/*
	 * Expand seven packed 32-bit words (pattern w w w l w w) in place
	 * into six 64-bit argument slots, working from the highest slot
	 * downward so no source word is clobbered before it is read.
	 * The 64-bit 'l' argument is rebuilt from its low/high halves;
	 * on x86 (little-endian) this single 64-bit store lays down the
	 * same bytes as a pair of 32-bit stores would.
	 */
	dst[5] = src[6];				/* trailing w */
	dst[4] = src[5];				/* trailing w */
	dst[3] = ((uint64_t) src[4] << 32) | src[3];	/* l (hi:lo) */
	dst[2] = src[2];				/* leading w */
	dst[1] = src[1];				/* leading w */
	dst[0] = src[0];				/* leading w */
}
636
637
void
munge_wwlwww(
	__unused const void *in32,
	void	*out64)
{
	uint32_t	*src = (uint32_t *) out64;	/* packed 32-bit input view */
	uint64_t	*dst = (uint64_t *) out64;	/* expanded 64-bit output view */

	/*
	 * Expand seven packed 32-bit words (pattern w w l w w w) in place
	 * into six 64-bit argument slots, working from the highest slot
	 * downward so no source word is clobbered before it is read.
	 * The 64-bit 'l' argument is rebuilt from its low/high halves;
	 * on x86 (little-endian) this single 64-bit store lays down the
	 * same bytes as a pair of 32-bit stores would.
	 */
	dst[5] = src[6];				/* trailing w */
	dst[4] = src[5];				/* trailing w */
	dst[3] = src[4];				/* trailing w */
	dst[2] = ((uint64_t) src[3] << 32) | src[2];	/* l (hi:lo) */
	dst[1] = src[1];				/* leading w */
	dst[0] = src[0];				/* leading w */
}
658