/*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
#include <kern/clock.h>
#include <kern/locks.h>
#include <kern/sched_prim.h>
+#include <kern/debug.h>
#include <mach/machine/thread_status.h>
#include <mach/thread_act.h>
+#include <mach/branch_predicates.h>
#include <sys/kernel.h>
#include <sys/vm.h>
#include <sys/kauth.h>
#include <sys/systm.h>
-#include <bsm/audit_kernel.h>
+#include <security/audit/audit.h>
#include <i386/seg.h>
#include <i386/machine_routines.h>
#include <mach/i386/syscall_sw.h>
+#include <machine/pal_routines.h>
+
#if CONFIG_DTRACE
extern int32_t dtrace_systrace_syscall(struct proc *, void *, int *);
extern void dtrace_systrace_syscall_return(unsigned short, int, int *);
extern void unix_syscall(x86_saved_state_t *);
extern void unix_syscall64(x86_saved_state_t *);
extern void *find_user_regs(thread_t);
-extern void throttle_lowpri_io(int *lowpri_window, mount_t v_mount);
-extern void x86_toggle_sysenter_arg_store(thread_t thread, boolean_t valid);
-extern boolean_t x86_sysenter_arg_store_isvalid(thread_t thread);
+/* dynamically generated at build time based on syscalls.master */
+extern const char *syscallnames[];
+
+/*
+ * This needs to be a single switch so that it's "all on" or "all off",
+ * rather than being turned on for some code paths and not others, as this
+ * has a tendency to introduce "blame the next guy" bugs.
+ */
+#if DEBUG
+#define FUNNEL_DEBUG 1 /* Check for funnel held on exit */
+#endif
+
/*
* Function: unix_syscall
*
struct proc *p;
struct uthread *uthread;
x86_saved_state32_t *regs;
- boolean_t args_in_uthread;
+ boolean_t is_vfork;
assert(is_saved_state32(state));
regs = saved_state32(state);
uthread = get_bsdthread_info(thread);
/* Get the appropriate proc; may be different from task's for vfork() */
- if (!(uthread->uu_flag & UT_VFORK))
- p = (struct proc *)get_bsdtask_info(current_task());
- else
+ is_vfork = uthread->uu_flag & UT_VFORK;
+ if (__improbable(is_vfork != 0))
p = current_proc();
+ else
+ p = (struct proc *)get_bsdtask_info(current_task());
/* Verify that we are not being called from a task without a proc */
- if (p == NULL) {
+ if (__improbable(p == NULL)) {
regs->eax = EPERM;
regs->efl |= EFL_CF;
task_terminate_internal(current_task());
}
code = regs->eax & I386_SYSCALL_NUMBER_MASK;
- args_in_uthread = ((regs->eax & I386_SYSCALL_ARG_BYTES_MASK) != 0) && x86_sysenter_arg_store_isvalid(thread);
- params = (vm_offset_t) ((caddr_t)regs->uesp + sizeof (int));
+ DEBUG_KPRINT_SYSCALL_UNIX("unix_syscall: code=%d(%s) eip=%u\n",
+ code, syscallnames[code >= NUM_SYSENT ? 63 : code], (uint32_t)regs->eip);
+ params = (vm_offset_t) (regs->uesp + sizeof (int));
regs->efl &= ~(EFL_CF);
callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
- if (callp == sysent) {
+ if (__improbable(callp == sysent)) {
code = fuword(params);
params += sizeof(int);
callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
}
vt = (void *)uthread->uu_arg;
+ uthread->uu_ap = vt;
if (callp->sy_arg_bytes != 0) {
sy_munge_t *mungerp;
+ uint32_t nargs;
assert((unsigned) callp->sy_arg_bytes <= sizeof (uthread->uu_arg));
- if (!args_in_uthread)
- {
- uint32_t nargs;
- nargs = callp->sy_arg_bytes;
- error = copyin((user_addr_t) params, (char *) vt, nargs);
- if (error) {
- regs->eax = error;
- regs->efl |= EFL_CF;
- thread_exception_return();
- /* NOTREACHED */
- }
+ nargs = callp->sy_arg_bytes;
+ error = copyin((user_addr_t) params, (char *) vt, nargs);
+ if (error) {
+ regs->eax = error;
+ regs->efl |= EFL_CF;
+ thread_exception_return();
+ /* NOTREACHED */
}
- if (code != 180) {
+ if (__probable(code != 180)) {
int *ip = (int *)vt;
- KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
- *ip, *(ip+1), *(ip+2), *(ip+3), 0);
+ KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+ BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
+ *ip, *(ip+1), *(ip+2), *(ip+3), 0);
}
mungerp = callp->sy_arg_munge32;
/*
* If non-NULL, then call the syscall argument munger to
- * copy in arguments (see xnu/bsd/dev/i386/munge.s); the
+ * copy in arguments (see xnu/bsd/dev/{i386|x86_64}/munge.s); the
* first argument is NULL because we are munging in place
* after a copyin because the ABI currently doesn't use
* registers to pass system call arguments.
if (mungerp != NULL)
(*mungerp)(NULL, vt);
} else
- KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
+ KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+ BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
0, 0, 0, 0, 0);
/*
#ifdef JOE_DEBUG
if (uthread->uu_iocount)
- joe_debug("system call returned with uu_iocount != 0");
+ printf("system call returned with uu_iocount != 0\n");
#endif
#if CONFIG_DTRACE
uthread->t_dtrace_errno = error;
#endif /* CONFIG_DTRACE */
- if (error == ERESTART) {
+ if (__improbable(error == ERESTART)) {
/*
* Move the user's pc back to repeat the syscall:
* 5 bytes for a sysenter, or 2 for an int 8x.
* The SYSENTER_TF_CS covers single-stepping over a sysenter
* - see debug trap handler in idt.s/idt64.s
*/
- if (regs->cs == SYSENTER_CS || regs->cs == SYSENTER_TF_CS) {
- regs->eip -= 5;
- }
- else
- regs->eip -= 2;
+
+ pal_syscall_restart(thread, state);
}
else if (error != EJUSTRETURN) {
- if (error) {
+ if (__improbable(error)) {
regs->eax = error;
regs->efl |= EFL_CF; /* carry bit */
} else { /* (not error) */
}
}
+ DEBUG_KPRINT_SYSCALL_UNIX(
+ "unix_syscall: error=%d retval=(%u,%u)\n",
+ error, regs->eax, regs->edx);
+
uthread->uu_flag &= ~UT_NOTCANCELPT;
-#if DEBUG
+#if FUNNEL_DEBUG
/*
* panic if the funnel is still held on syscall exit
*/
syscall_exit_funnelcheck();
-#endif /* DEBUG */
- if (uthread->uu_lowpri_window && uthread->v_mount) {
+#endif /* FUNNEL_DEBUG */
+
+ if (__improbable(uthread->uu_lowpri_window)) {
/*
* task is marked as a low priority I/O type
* and the I/O we issued while in this system call
* delay in order to mitigate the impact of this
* task on the normal operation of the system
*/
- throttle_lowpri_io(&uthread->uu_lowpri_window,uthread->v_mount);
+ throttle_lowpri_io(1);
+ }
+ if (__probable(code != 180))
+ KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+ BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
+ error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0);
+
+ if (__improbable(!is_vfork && callp->sy_call == (sy_call_t *)execve && !error)) {
+ pal_execve_return(thread);
}
- if (code != 180)
- KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
- error, uthread->uu_rval[0], uthread->uu_rval[1], 0, 0);
thread_exception_return();
/* NOTREACHED */
assert(is_saved_state64(state));
regs = saved_state64(state);
-
+#if DEBUG
if (regs->rax == 0x2000800)
thread_exception_return();
-
+#endif
thread = current_thread();
uthread = get_bsdthread_info(thread);
/* Get the appropriate proc; may be different from task's for vfork() */
- if (!(uthread->uu_flag & UT_VFORK))
+ if (__probable(!(uthread->uu_flag & UT_VFORK)))
p = (struct proc *)get_bsdtask_info(current_task());
else
p = current_proc();
/* Verify that we are not being called from a task without a proc */
- if (p == NULL) {
+ if (__improbable(p == NULL)) {
regs->rax = EPERM;
regs->isf.rflags |= EFL_CF;
task_terminate_internal(current_task());
args_in_regs = 6;
code = regs->rax & SYSCALL_NUMBER_MASK;
+ DEBUG_KPRINT_SYSCALL_UNIX(
+ "unix_syscall64: code=%d(%s) rip=%llx\n",
+ code, syscallnames[code >= NUM_SYSENT ? 63 : code], regs->isf.rip);
callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
uargp = (void *)(&regs->rdi);
- if (callp == sysent) {
+ if (__improbable(callp == sysent)) {
/*
* indirect system call... system call number
* passed as 'arg0'
uargp = (void *)(&regs->rsi);
args_in_regs = 5;
}
+ uthread->uu_ap = uargp;
if (callp->sy_narg != 0) {
if (code != 180) {
uint64_t *ip = (uint64_t *)uargp;
- KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
- (int)(*ip), (int)(*(ip+1)), (int)(*(ip+2)), (int)(*(ip+3)), 0);
+ KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+ BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
+ (int)(*ip), (int)(*(ip+1)), (int)(*(ip+2)), (int)(*(ip+3)), 0);
}
assert(callp->sy_narg <= 8);
- if (callp->sy_narg > args_in_regs) {
+ if (__improbable(callp->sy_narg > args_in_regs)) {
int copyin_count;
copyin_count = (callp->sy_narg - args_in_regs) * sizeof(uint64_t);
/* NOTREACHED */
}
}
- /*
- * XXX Turn 64 bit unsafe calls into nosys()
- */
- if (callp->sy_flags & UNSAFE_64BIT) {
- callp = &sysent[63];
- goto unsafe;
- }
} else
- KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
- 0, 0, 0, 0, 0);
-unsafe:
+ KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+ BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
+ 0, 0, 0, 0, 0);
/*
* Delayed binding of thread credential to process credential, if we
uthread->uu_flag |= UT_NOTCANCELPT;
+#ifdef JOE_DEBUG
+ uthread->uu_iocount = 0;
+ uthread->uu_vpindex = 0;
+#endif
AUDIT_SYSCALL_ENTER(code, p, uthread);
error = (*(callp->sy_call))((void *) p, uargp, &(uthread->uu_rval[0]));
AUDIT_SYSCALL_EXIT(code, p, uthread, error);
+#ifdef JOE_DEBUG
+ if (uthread->uu_iocount)
+ printf("system call returned with uu_iocount != 0\n");
+#endif
+
#if CONFIG_DTRACE
uthread->t_dtrace_errno = error;
#endif /* CONFIG_DTRACE */
- if (error == ERESTART) {
+ if (__improbable(error == ERESTART)) {
/*
* all system calls come through via the syscall instruction
* in 64 bit mode... it's 2 bytes in length
* move the user's pc back to repeat the syscall:
*/
- regs->isf.rip -= 2;
+ pal_syscall_restart( thread, state );
}
else if (error != EJUSTRETURN) {
- if (error) {
+ if (__improbable(error)) {
regs->rax = error;
regs->isf.rflags |= EFL_CF; /* carry bit */
} else { /* (not error) */
case _SYSCALL_RET_ADDR_T:
case _SYSCALL_RET_SIZE_T:
case _SYSCALL_RET_SSIZE_T:
+ case _SYSCALL_RET_UINT64_T:
regs->rax = *((uint64_t *)(&uthread->uu_rval[0]));
regs->rdx = 0;
break;
}
}
-
+ DEBUG_KPRINT_SYSCALL_UNIX(
+ "unix_syscall64: error=%d retval=(%llu,%llu)\n",
+ error, regs->rax, regs->rdx);
+
uthread->uu_flag &= ~UT_NOTCANCELPT;
+#if FUNNEL_DEBUG
/*
* panic if the funnel is still held on syscall exit
*/
syscall_exit_funnelcheck();
+#endif /* FUNNEL_DEBUG */
- if (uthread->uu_lowpri_window && uthread->v_mount) {
+ if (__improbable(uthread->uu_lowpri_window)) {
/*
* task is marked as a low priority I/O type
* and the I/O we issued while in this system call
* delay in order to mitigate the impact of this
* task on the normal operation of the system
*/
- throttle_lowpri_io(&uthread->uu_lowpri_window,uthread->v_mount);
+ throttle_lowpri_io(1);
}
- if (code != 180)
- KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
- error, uthread->uu_rval[0], uthread->uu_rval[1], 0, 0);
+ if (__probable(code != 180))
+ KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+ BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
+ error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0);
thread_exception_return();
/* NOTREACHED */
thread = current_thread();
uthread = get_bsdthread_info(thread);
+ pal_register_cache_state(thread, DIRTY);
+
p = current_proc();
if (proc_is64bit(p)) {
if (callp->sy_call == dtrace_systrace_syscall)
dtrace_systrace_syscall_return( code, error, uthread->uu_rval );
#endif /* CONFIG_DTRACE */
+ AUDIT_SYSCALL_EXIT(code, p, uthread, error);
if (error == ERESTART) {
/*
- * all system calls come through via the syscall instruction
- * in 64 bit mode... its 2 bytes in length
- * move the user's pc back to repeat the syscall:
+ * repeat the syscall
*/
- regs->isf.rip -= 2;
+ pal_syscall_restart( thread, find_user_regs(thread) );
}
else if (error != EJUSTRETURN) {
if (error) {
case _SYSCALL_RET_ADDR_T:
case _SYSCALL_RET_SIZE_T:
case _SYSCALL_RET_SSIZE_T:
+ case _SYSCALL_RET_UINT64_T:
regs->rax = *((uint64_t *)(&uthread->uu_rval[0]));
regs->rdx = 0;
break;
regs->isf.rflags &= ~EFL_CF;
}
}
+ DEBUG_KPRINT_SYSCALL_UNIX(
+ "unix_syscall_return: error=%d retval=(%llu,%llu)\n",
+ error, regs->rax, regs->rdx);
} else {
x86_saved_state32_t *regs;
if (callp->sy_call == dtrace_systrace_syscall)
dtrace_systrace_syscall_return( code, error, uthread->uu_rval );
#endif /* CONFIG_DTRACE */
+ AUDIT_SYSCALL_EXIT(code, p, uthread, error);
if (callp == sysent) {
- params = (vm_offset_t) ((caddr_t)regs->uesp + sizeof (int));
+ params = (vm_offset_t) (regs->uesp + sizeof (int));
code = fuword(params);
}
if (error == ERESTART) {
- regs->eip -= ((regs->cs & 0xffff) == SYSENTER_CS) ? 5 : 2;
+ pal_syscall_restart( thread, find_user_regs(thread) );
}
else if (error != EJUSTRETURN) {
if (error) {
regs->edx = uthread->uu_rval[1];
}
}
+ DEBUG_KPRINT_SYSCALL_UNIX(
+ "unix_syscall_return: error=%d retval=(%u,%u)\n",
+ error, regs->eax, regs->edx);
}
uthread->uu_flag &= ~UT_NOTCANCELPT;
+#if FUNNEL_DEBUG
/*
* panic if the funnel is still held on syscall exit
*/
syscall_exit_funnelcheck();
+#endif /* FUNNEL_DEBUG */
- if (uthread->uu_lowpri_window && uthread->v_mount) {
+ if (uthread->uu_lowpri_window) {
/*
* task is marked as a low priority I/O type
* and the I/O we issued while in this system call
* delay in order to mitigate the impact of this
* task on the normal operation of the system
*/
- throttle_lowpri_io(&uthread->uu_lowpri_window,uthread->v_mount);
+ throttle_lowpri_io(1);
}
if (code != 180)
- KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
- error, uthread->uu_rval[0], uthread->uu_rval[1], 0, 0);
+ KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+ BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
+ error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0);
thread_exception_return();
/* NOTREACHED */
}
-void
-munge_wwwlww(
- __unused const void *in32,
- void *out64)
-{
- uint32_t *arg32;
- uint64_t *arg64;
-
- /* we convert in place in out64 */
- arg32 = (uint32_t *) out64;
- arg64 = (uint64_t *) out64;
-
- arg64[5] = arg32[6]; /* wwwlwW */
- arg64[4] = arg32[5]; /* wwwlWw */
- arg32[7] = arg32[4]; /* wwwLww (hi) */
- arg32[6] = arg32[3]; /* wwwLww (lo) */
- arg64[2] = arg32[2]; /* wwWlww */
- arg64[1] = arg32[1]; /* wWwlww */
- arg64[0] = arg32[0]; /* Wwwlww */
-}
-
-
-void
-munge_wwlwww(
- __unused const void *in32,
- void *out64)
-{
- uint32_t *arg32;
- uint64_t *arg64;
-
- /* we convert in place in out64 */
- arg32 = (uint32_t *) out64;
- arg64 = (uint64_t *) out64;
-
- arg64[5] = arg32[6]; /* wwlwwW */
- arg64[4] = arg32[5]; /* wwlwWw */
- arg64[3] = arg32[4]; /* wwlWww */
- arg32[5] = arg32[3]; /* wwLwww (hi) */
- arg32[4] = arg32[2]; /* wwLwww (lo) */
- arg64[1] = arg32[1]; /* wWlwww */
- arg64[0] = arg32[0]; /* Wwlwww */
-}
-
-#ifdef JOE_DEBUG
-joe_debug(char *p) {
-
- printf("%s\n", p);
-}
-#endif
-
-