X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/c0fea4742e91338fffdcf79f86a7c1d5e2b97eb1..HEAD:/bsd/dev/i386/systemcalls.c diff --git a/bsd/dev/i386/systemcalls.c b/bsd/dev/i386/systemcalls.c index 05c0b70b0..8a12ad5a3 100644 --- a/bsd/dev/i386/systemcalls.c +++ b/bsd/dev/i386/systemcalls.c @@ -1,23 +1,29 @@ /* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ #include #include @@ -25,6 +31,7 @@ #include #include #include +#include #include #include @@ -35,27 +42,40 @@ #include #include #include -#include #include #include #include #include #include +#include -#include +#include #include #include #include +#include + +#if CONFIG_MACF +#include +#endif + +#if CONFIG_DTRACE +extern int32_t dtrace_systrace_syscall(struct proc *, void *, int *); +extern void dtrace_systrace_syscall_return(unsigned short, int, int *); +#endif + extern void unix_syscall(x86_saved_state_t *); extern void unix_syscall64(x86_saved_state_t *); -extern void unix_syscall_return(int); extern void *find_user_regs(thread_t); -extern void IOSleep(int); -extern void exit_funnel_section(void); -extern void Debugger(const char * message); +/* dynamically generated at build time based on syscalls.master */ +extern const char *syscallnames[]; + +#define code_is_kdebug_trace(code) (((code) == SYS_kdebug_trace) || \ + ((code) == SYS_kdebug_trace64) || \ + ((code) == SYS_kdebug_trace_string)) /* * Function: unix_syscall @@ -64,64 +84,72 @@ extern void Debugger(const char * message); * * Outputs: none */ +__attribute__((noreturn)) void unix_syscall(x86_saved_state_t *state) { - thread_t thread; - void *vt; - unsigned short code; - struct sysent *callp; 
- int nargs; - int error; - int funnel_type; - vm_offset_t params; - struct proc *p; - struct uthread *uthread; - unsigned int cancel_enable; - x86_saved_state32_t *regs; + thread_t thread; + void *vt; + unsigned int code, syscode; + const struct sysent *callp; + + int error; + vm_offset_t params; + struct proc *p; + struct uthread *uthread; + x86_saved_state32_t *regs; + boolean_t is_vfork; + pid_t pid; assert(is_saved_state32(state)); regs = saved_state32(state); - - if (regs->eax == 0x800) +#if DEBUG + if (regs->eax == 0x800) { thread_exception_return(); - + } +#endif thread = current_thread(); uthread = get_bsdthread_info(thread); + uthread_reset_proc_refcount(uthread); + /* Get the approriate proc; may be different from task's for vfork() */ - if (!(uthread->uu_flag & UT_VFORK)) - p = (struct proc *)get_bsdtask_info(current_task()); - else + is_vfork = uthread->uu_flag & UT_VFORK; + if (__improbable(is_vfork != 0)) { p = current_proc(); - - /* Verify that we are not being called from a task without a proc */ - if (p == NULL) { - regs->eax = EPERM; - regs->efl |= EFL_CF; - task_terminate_internal(current_task()); - thread_exception_return(); - /* NOTREACHED */ + } else { + p = (struct proc *)get_bsdtask_info(current_task()); } - //printf("[scall : eax %x]", regs->eax); - code = regs->eax; - params = (vm_offset_t) ((caddr_t)regs->uesp + sizeof (int)); - callp = (code >= nsysent) ? &sysent[63] : &sysent[code]; + code = regs->eax & I386_SYSCALL_NUMBER_MASK; + syscode = (code < nsysent) ? code : SYS_invalid; + DEBUG_KPRINT_SYSCALL_UNIX("unix_syscall: code=%d(%s) eip=%u\n", + code, syscallnames[syscode], (uint32_t)regs->eip); + params = (vm_offset_t) (regs->uesp + sizeof(int)); + + regs->efl &= ~(EFL_CF); + + callp = &sysent[syscode]; - if (callp == sysent) { + if (__improbable(callp == sysent)) { code = fuword(params); - params += sizeof (int); - callp = (code >= nsysent) ? &sysent[63] : &sysent[code]; + params += sizeof(int); + syscode = (code < nsysent) ? 
code : SYS_invalid; + callp = &sysent[syscode]; } - vt = (void *)uthread->uu_arg; - nargs = callp->sy_narg * sizeof (syscall_arg_t); - if (nargs != 0) { - sy_munge_t *mungerp; + vt = (void *)uthread->uu_arg; - assert(nargs <= 8); + if (callp->sy_arg_bytes != 0) { +#if CONFIG_REQUIRES_U32_MUNGING + sy_munge_t *mungerp; +#else +#error U32 syscalls on x86_64 kernel requires munging +#endif + uint32_t nargs; + assert((unsigned) callp->sy_arg_bytes <= sizeof(uthread->uu_arg)); + nargs = callp->sy_arg_bytes; error = copyin((user_addr_t) params, (char *) vt, nargs); if (error) { regs->eax = error; @@ -129,280 +157,287 @@ unix_syscall(x86_saved_state_t *state) thread_exception_return(); /* NOTREACHED */ } - if (code != 180) { - int *ip = (int *)vt; - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, - *ip, *(ip+1), *(ip+2), *(ip+3), 0); + if (__probable(!code_is_kdebug_trace(code))) { + uint32_t *uip = vt; + KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, + uip[0], uip[1], uip[2], uip[3]); } + +#if CONFIG_REQUIRES_U32_MUNGING mungerp = callp->sy_arg_munge32; - /* - * If non-NULL, then call the syscall argument munger to - * copy in arguments (see xnu/bsd/dev/i386/munge.s); the - * first argument is NULL because we are munging in place - * after a copyin because the ABI currently doesn't use - * registers to pass system call arguments. - */ - if (mungerp != NULL) - (*mungerp)(NULL, vt); - } else - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, - 0, 0, 0, 0, 0); + if (mungerp != NULL) { + (*mungerp)(vt); + } +#endif + } else { + KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START); + } + /* * Delayed binding of thread credential to process credential, if we * are not running with an explicitly set thread credential. 
*/ - if (uthread->uu_ucred != p->p_ucred && - (uthread->uu_flag & UT_SETUID) == 0) { - kauth_cred_t old = uthread->uu_ucred; - proc_lock(p); - uthread->uu_ucred = p->p_ucred; - kauth_cred_ref(uthread->uu_ucred); - proc_unlock(p); - if (old != NOCRED) - kauth_cred_rele(old); - } + kauth_cred_uthread_update(uthread, p); uthread->uu_rval[0] = 0; - uthread->uu_rval[1] = regs->edx; - - cancel_enable = callp->sy_cancel; - - if (cancel_enable == _SYSCALL_CANCEL_NONE) { - uthread->uu_flag |= UT_NOTCANCELPT; - } else { - if ((uthread->uu_flag & (UT_CANCELDISABLE | UT_CANCEL | UT_CANCELED)) == UT_CANCEL) { - if (cancel_enable == _SYSCALL_CANCEL_PRE) { - /* system call cancelled; return to handle cancellation */ - regs->eax = (long long)EINTR; - regs->efl |= EFL_CF; - thread_exception_return(); - /* NOTREACHED */ - } else { - thread_abort_safely(thread); - } + uthread->uu_rval[1] = 0; + uthread->uu_flag |= UT_NOTCANCELPT; + uthread->syscall_code = code; + pid = proc_pid(p); + +#ifdef JOE_DEBUG + uthread->uu_iocount = 0; + uthread->uu_vpindex = 0; +#endif + +#if CONFIG_MACF + if (__improbable(p->syscall_filter_mask != NULL && !bitstr_test(p->syscall_filter_mask, syscode))) { + error = mac_proc_check_syscall_unix(p, syscode); + if (error) { + goto skip_syscall; } } - - funnel_type = (callp->sy_funnel & FUNNEL_MASK); - if (funnel_type == KERNEL_FUNNEL) - thread_funnel_set(kernel_flock, TRUE); - - if (KTRPOINT(p, KTR_SYSCALL)) - ktrsyscall(p, code, callp->sy_narg, vt); +#endif /* CONFIG_MACF */ AUDIT_SYSCALL_ENTER(code, p, uthread); error = (*(callp->sy_call))((void *) p, (void *) vt, &(uthread->uu_rval[0])); - AUDIT_SYSCALL_EXIT(error, p, uthread); - - if (error == ERESTART) { + AUDIT_SYSCALL_EXIT(code, p, uthread, error); + +#if CONFIG_MACF +skip_syscall: +#endif /* CONFIG_MACF */ + +#ifdef JOE_DEBUG + if (uthread->uu_iocount) { + printf("system call returned with uu_iocount != 0\n"); + } +#endif +#if CONFIG_DTRACE + uthread->t_dtrace_errno = error; +#endif /* CONFIG_DTRACE */ 
+ + if (__improbable(error == ERESTART)) { /* * Move the user's pc back to repeat the syscall: * 5 bytes for a sysenter, or 2 for an int 8x. * The SYSENTER_TF_CS covers single-stepping over a sysenter * - see debug trap handler in idt.s/idt64.s */ - if (regs->cs == SYSENTER_CS || regs->cs == SYSENTER_TF_CS) - regs->eip -= 5; - else - regs->eip -= 2; - } - else if (error != EJUSTRETURN) { - if (error) { - regs->eax = error; - regs->efl |= EFL_CF; /* carry bit */ + + pal_syscall_restart(thread, state); + } else if (error != EJUSTRETURN) { + if (__improbable(error)) { + regs->eax = error; + regs->efl |= EFL_CF; /* carry bit */ } else { /* (not error) */ - regs->eax = uthread->uu_rval[0]; - regs->edx = uthread->uu_rval[1]; - regs->efl &= ~EFL_CF; - } + /* + * We split retval across two registers, in case the + * syscall had a 64-bit return value, in which case + * eax/edx matches the function call ABI. + */ + regs->eax = uthread->uu_rval[0]; + regs->edx = uthread->uu_rval[1]; + } } - if (KTRPOINT(p, KTR_SYSRET)) - ktrsysret(p, code, error, uthread->uu_rval[0]); + DEBUG_KPRINT_SYSCALL_UNIX( + "unix_syscall: error=%d retval=(%u,%u)\n", + error, regs->eax, regs->edx); - if (cancel_enable == _SYSCALL_CANCEL_NONE) - uthread->uu_flag &= ~UT_NOTCANCELPT; + uthread->uu_flag &= ~UT_NOTCANCELPT; + uthread->syscall_code = 0; - /* - * if we're holding the funnel - * than drop it regardless of whether - * we took it on system call entry - */ - exit_funnel_section(); +#if DEBUG || DEVELOPMENT + kern_allocation_name_t + prior __assert_only = thread_set_allocation_name(NULL); + assertf(prior == NULL, "thread_set_allocation_name(\"%s\") not cleared", kern_allocation_get_name(prior)); +#endif /* DEBUG || DEVELOPMENT */ - if (uthread->uu_lowpri_delay) { - /* + if (__improbable(uthread->uu_lowpri_window)) { + /* * task is marked as a low priority I/O type * and the I/O we issued while in this system call * collided with normal I/O operations... 
we'll * delay in order to mitigate the impact of this * task on the normal operation of the system */ - IOSleep(uthread->uu_lowpri_delay); - uthread->uu_lowpri_delay = 0; + throttle_lowpri_io(1); + } + if (__probable(!code_is_kdebug_trace(code))) { + KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, + error, uthread->uu_rval[0], uthread->uu_rval[1], pid); + } + + if (__improbable(!is_vfork && callp->sy_call == (sy_call_t *)execve && !error)) { + pal_execve_return(thread); } - if (code != 180) - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, - error, uthread->uu_rval[0], uthread->uu_rval[1], 0, 0); + +#if PROC_REF_DEBUG + if (__improbable(uthread_get_proc_refcount(uthread) != 0)) { + panic("system call returned with uu_proc_refcount != 0"); + } +#endif thread_exception_return(); /* NOTREACHED */ } - +__attribute__((noreturn)) void unix_syscall64(x86_saved_state_t *state) { - thread_t thread; - unsigned short code; - struct sysent *callp; - void *uargp; - int args_in_regs; - int error; - int funnel_type; - struct proc *p; - struct uthread *uthread; - unsigned int cancel_enable; + thread_t thread; + void *vt; + unsigned int code, syscode; + const struct sysent *callp; + int args_in_regs; + boolean_t args_start_at_rdi; + int error; + struct proc *p; + struct uthread *uthread; x86_saved_state64_t *regs; + pid_t pid; assert(is_saved_state64(state)); regs = saved_state64(state); - - if (regs->rax == 0x2000800) +#if DEBUG + if (regs->rax == 0x2000800) { thread_exception_return(); - + } +#endif thread = current_thread(); uthread = get_bsdthread_info(thread); + uthread_reset_proc_refcount(uthread); + /* Get the approriate proc; may be different from task's for vfork() */ - if (!(uthread->uu_flag & UT_VFORK)) + if (__probable(!(uthread->uu_flag & UT_VFORK))) { p = (struct proc *)get_bsdtask_info(current_task()); - else + } else { p = current_proc(); + } /* Verify that we are not being called from a task without a proc */ - if (p == 
NULL) { + if (__improbable(p == NULL)) { regs->rax = EPERM; regs->isf.rflags |= EFL_CF; task_terminate_internal(current_task()); thread_exception_return(); /* NOTREACHED */ } - args_in_regs = 6; - code = regs->rax & SYSCALL_NUMBER_MASK; - callp = (code >= nsysent) ? &sysent[63] : &sysent[code]; - uargp = (void *)(&regs->rdi); + code = regs->rax & SYSCALL_NUMBER_MASK; + syscode = (code < nsysent) ? code : SYS_invalid; + DEBUG_KPRINT_SYSCALL_UNIX( + "unix_syscall64: code=%d(%s) rip=%llx\n", + code, syscallnames[syscode], regs->isf.rip); + callp = &sysent[syscode]; + + vt = (void *)uthread->uu_arg; - if (callp == sysent) { - /* + if (__improbable(callp == sysent)) { + /* * indirect system call... system call number * passed as 'arg0' */ - code = regs->rdi; - callp = (code >= nsysent) ? &sysent[63] : &sysent[code]; - uargp = (void *)(&regs->rsi); + code = regs->rdi; + syscode = (code < nsysent) ? code : SYS_invalid; + callp = &sysent[syscode]; + args_start_at_rdi = FALSE; args_in_regs = 5; + } else { + args_start_at_rdi = TRUE; + args_in_regs = 6; } if (callp->sy_narg != 0) { - if (code != 180) { - uint64_t *ip = (uint64_t *)uargp; + assert(callp->sy_narg <= 8); /* size of uu_arg */ + + args_in_regs = MIN(args_in_regs, callp->sy_narg); + memcpy(vt, args_start_at_rdi ? 
&regs->rdi : &regs->rsi, args_in_regs * sizeof(syscall_arg_t)); - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, - (int)(*ip), (int)(*(ip+1)), (int)(*(ip+2)), (int)(*(ip+3)), 0); + if (!code_is_kdebug_trace(code)) { + uint64_t *uip = vt; + + KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, + uip[0], uip[1], uip[2], uip[3]); } - assert(callp->sy_narg <= 8); - if (callp->sy_narg > args_in_regs) { - int copyin_count; + if (__improbable(callp->sy_narg > args_in_regs)) { + int copyin_count; - copyin_count = (callp->sy_narg - args_in_regs) * sizeof(uint64_t); + copyin_count = (callp->sy_narg - args_in_regs) * sizeof(syscall_arg_t); - error = copyin((user_addr_t)(regs->isf.rsp + sizeof(user_addr_t)), (char *)&regs->v_arg6, copyin_count); + error = copyin((user_addr_t)(regs->isf.rsp + sizeof(user_addr_t)), (char *)&uthread->uu_arg[args_in_regs], copyin_count); if (error) { - regs->rax = error; + regs->rax = error; regs->isf.rflags |= EFL_CF; thread_exception_return(); /* NOTREACHED */ } } - /* - * XXX Turn 64 bit unsafe calls into nosys() - */ - if (callp->sy_funnel & UNSAFE_64BIT) { - callp = &sysent[63]; - goto unsafe; - } - - } else - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, - 0, 0, 0, 0, 0); -unsafe: + } else { + KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START); + } /* * Delayed binding of thread credential to process credential, if we * are not running with an explicitly set thread credential. 
*/ - if (uthread->uu_ucred != p->p_ucred && - (uthread->uu_flag & UT_SETUID) == 0) { - kauth_cred_t old = uthread->uu_ucred; - proc_lock(p); - uthread->uu_ucred = p->p_ucred; - kauth_cred_ref(uthread->uu_ucred); - proc_unlock(p); - if (old != NOCRED) - kauth_cred_rele(old); - } + kauth_cred_uthread_update(uthread, p); uthread->uu_rval[0] = 0; uthread->uu_rval[1] = 0; - - cancel_enable = callp->sy_cancel; - - if (cancel_enable == _SYSCALL_CANCEL_NONE) { - uthread->uu_flag |= UT_NOTCANCELPT; - } else { - if ((uthread->uu_flag & (UT_CANCELDISABLE | UT_CANCEL | UT_CANCELED)) == UT_CANCEL) { - if (cancel_enable == _SYSCALL_CANCEL_PRE) { - /* system call cancelled; return to handle cancellation */ - regs->rax = EINTR; - regs->isf.rflags |= EFL_CF; - thread_exception_return(); - /* NOTREACHED */ - } else { - thread_abort_safely(thread); - } + uthread->uu_flag |= UT_NOTCANCELPT; + uthread->syscall_code = code; + pid = proc_pid(p); + +#ifdef JOE_DEBUG + uthread->uu_iocount = 0; + uthread->uu_vpindex = 0; +#endif + +#if CONFIG_MACF + if (__improbable(p->syscall_filter_mask != NULL && !bitstr_test(p->syscall_filter_mask, syscode))) { + error = mac_proc_check_syscall_unix(p, syscode); + if (error) { + goto skip_syscall; } } +#endif /* CONFIG_MACF */ - funnel_type = (callp->sy_funnel & FUNNEL_MASK); - if (funnel_type == KERNEL_FUNNEL) - thread_funnel_set(kernel_flock, TRUE); + AUDIT_SYSCALL_ENTER(code, p, uthread); + error = (*(callp->sy_call))((void *) p, vt, &(uthread->uu_rval[0])); + AUDIT_SYSCALL_EXIT(code, p, uthread, error); - if (KTRPOINT(p, KTR_SYSCALL)) - ktrsyscall(p, code, callp->sy_narg, uargp); +#if CONFIG_MACF +skip_syscall: +#endif /* CONFIG_MACF */ - AUDIT_SYSCALL_ENTER(code, p, uthread); - error = (*(callp->sy_call))((void *) p, uargp, &(uthread->uu_rval[0])); - AUDIT_SYSCALL_EXIT(error, p, uthread); - - if (error == ERESTART) { +#ifdef JOE_DEBUG + if (uthread->uu_iocount) { + printf("system call returned with uu_iocount != 0\n"); + } +#endif + +#if 
CONFIG_DTRACE + uthread->t_dtrace_errno = error; +#endif /* CONFIG_DTRACE */ + + if (__improbable(error == ERESTART)) { /* * all system calls come through via the syscall instruction * in 64 bit mode... its 2 bytes in length * move the user's pc back to repeat the syscall: */ - regs->isf.rip -= 2; - } - else if (error != EJUSTRETURN) { - if (error) { - regs->rax = error; - regs->isf.rflags |= EFL_CF; /* carry bit */ + pal_syscall_restart( thread, state ); + } else if (error != EJUSTRETURN) { + if (__improbable(error)) { + regs->rax = error; + regs->isf.rflags |= EFL_CF; /* carry bit */ } else { /* (not error) */ - switch (callp->sy_return_type) { case _SYSCALL_RET_INT_T: regs->rax = uthread->uu_rval[0]; @@ -416,7 +451,8 @@ unsafe: case _SYSCALL_RET_ADDR_T: case _SYSCALL_RET_SIZE_T: case _SYSCALL_RET_SSIZE_T: - regs->rax = *((uint64_t *)(&uthread->uu_rval[0])); + case _SYSCALL_RET_UINT64_T: + regs->rax = *((uint64_t *)(&uthread->uu_rval[0])); regs->rdx = 0; break; case _SYSCALL_RET_NONE: @@ -426,36 +462,42 @@ unsafe: break; } regs->isf.rflags &= ~EFL_CF; - } + } } - if (KTRPOINT(p, KTR_SYSRET)) - ktrsysret(p, code, error, uthread->uu_rval[0]); + DEBUG_KPRINT_SYSCALL_UNIX( + "unix_syscall64: error=%d retval=(%llu,%llu)\n", + error, regs->rax, regs->rdx); - if (cancel_enable == _SYSCALL_CANCEL_NONE) - uthread->uu_flag &= ~UT_NOTCANCELPT; + uthread->uu_flag &= ~UT_NOTCANCELPT; + uthread->syscall_code = 0; - /* - * if we're holding the funnel - * than drop it regardless of whether - * we took it on system call entry - */ - exit_funnel_section(); +#if DEBUG || DEVELOPMENT + kern_allocation_name_t + prior __assert_only = thread_set_allocation_name(NULL); + assertf(prior == NULL, "thread_set_allocation_name(\"%s\") not cleared", kern_allocation_get_name(prior)); +#endif /* DEBUG || DEVELOPMENT */ - if (uthread->uu_lowpri_delay) { - /* + if (__improbable(uthread->uu_lowpri_window)) { + /* * task is marked as a low priority I/O type * and the I/O we issued while in this 
system call * collided with normal I/O operations... we'll * delay in order to mitigate the impact of this * task on the normal operation of the system */ - IOSleep(uthread->uu_lowpri_delay); - uthread->uu_lowpri_delay = 0; + throttle_lowpri_io(1); + } + if (__probable(!code_is_kdebug_trace(code))) { + KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, + error, uthread->uu_rval[0], uthread->uu_rval[1], pid); + } + +#if PROC_REF_DEBUG + if (__improbable(uthread_get_proc_refcount(uthread))) { + panic("system call returned with uu_proc_refcount != 0"); } - if (code != 180) - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, - error, uthread->uu_rval[0], uthread->uu_rval[1], 0, 0); +#endif thread_exception_return(); /* NOTREACHED */ @@ -465,152 +507,130 @@ unsafe: void unix_syscall_return(int error) { - thread_t thread; - struct uthread *uthread; + thread_t thread; + struct uthread *uthread; struct proc *p; - unsigned short code; - vm_offset_t params; - struct sysent *callp; - unsigned int cancel_enable; + unsigned int code; + const struct sysent *callp; thread = current_thread(); uthread = get_bsdthread_info(thread); + pal_register_cache_state(thread, DIRTY); + p = current_proc(); if (proc_is64bit(p)) { - x86_saved_state64_t *regs; + x86_saved_state64_t *regs; regs = saved_state64(find_user_regs(thread)); - /* reconstruct code for tracing before blasting rax */ - code = regs->rax & SYSCALL_NUMBER_MASK; - callp = (code >= nsysent) ? &sysent[63] : &sysent[code]; + code = uthread->syscall_code; + callp = (code >= nsysent) ? &sysent[SYS_invalid] : &sysent[code]; - if (callp == sysent) - /* - * indirect system call... 
system call number - * passed as 'arg0' - */ - code = regs->rdi; +#if CONFIG_DTRACE + if (callp->sy_call == dtrace_systrace_syscall) { + dtrace_systrace_syscall_return( code, error, uthread->uu_rval ); + } +#endif /* CONFIG_DTRACE */ + AUDIT_SYSCALL_EXIT(code, p, uthread, error); if (error == ERESTART) { - /* - * all system calls come through via the syscall instruction - * in 64 bit mode... its 2 bytes in length - * move the user's pc back to repeat the syscall: + /* + * repeat the syscall */ - regs->isf.rip -= 2; - } - else if (error != EJUSTRETURN) { - if (error) { - regs->rax = error; - regs->isf.rflags |= EFL_CF; /* carry bit */ + pal_syscall_restart( thread, find_user_regs(thread)); + } else if (error != EJUSTRETURN) { + if (error) { + regs->rax = error; + regs->isf.rflags |= EFL_CF; /* carry bit */ } else { /* (not error) */ - - switch (callp->sy_return_type) { + switch (callp->sy_return_type) { case _SYSCALL_RET_INT_T: - regs->rax = uthread->uu_rval[0]; + regs->rax = uthread->uu_rval[0]; regs->rdx = uthread->uu_rval[1]; break; case _SYSCALL_RET_UINT_T: - regs->rax = ((u_int)uthread->uu_rval[0]); + regs->rax = ((u_int)uthread->uu_rval[0]); regs->rdx = ((u_int)uthread->uu_rval[1]); break; case _SYSCALL_RET_OFF_T: case _SYSCALL_RET_ADDR_T: case _SYSCALL_RET_SIZE_T: case _SYSCALL_RET_SSIZE_T: - regs->rax = *((uint64_t *)(&uthread->uu_rval[0])); + case _SYSCALL_RET_UINT64_T: + regs->rax = *((uint64_t *)(&uthread->uu_rval[0])); regs->rdx = 0; break; case _SYSCALL_RET_NONE: - break; + break; default: - panic("unix_syscall: unknown return type"); + panic("unix_syscall: unknown return type"); break; } regs->isf.rflags &= ~EFL_CF; - } + } } + DEBUG_KPRINT_SYSCALL_UNIX( + "unix_syscall_return: error=%d retval=(%llu,%llu)\n", + error, regs->rax, regs->rdx); } else { - x86_saved_state32_t *regs; + x86_saved_state32_t *regs; regs = saved_state32(find_user_regs(thread)); - /* reconstruct code for tracing before blasting eax */ - code = regs->eax; - callp = (code >= 
nsysent) ? &sysent[63] : &sysent[code]; + regs->efl &= ~(EFL_CF); - if (callp == sysent) { - params = (vm_offset_t) ((caddr_t)regs->uesp + sizeof (int)); - code = fuword(params); + code = uthread->syscall_code; + callp = (code >= nsysent) ? &sysent[SYS_invalid] : &sysent[code]; + +#if CONFIG_DTRACE + if (callp->sy_call == dtrace_systrace_syscall) { + dtrace_systrace_syscall_return( code, error, uthread->uu_rval ); } +#endif /* CONFIG_DTRACE */ + AUDIT_SYSCALL_EXIT(code, p, uthread, error); + if (error == ERESTART) { - regs->eip -= ((regs->cs & 0xffff) == SYSENTER_CS) ? 5 : 2; - } - else if (error != EJUSTRETURN) { - if (error) { - regs->eax = error; - regs->efl |= EFL_CF; /* carry bit */ + pal_syscall_restart( thread, find_user_regs(thread)); + } else if (error != EJUSTRETURN) { + if (error) { + regs->eax = error; + regs->efl |= EFL_CF; /* carry bit */ } else { /* (not error) */ - regs->eax = uthread->uu_rval[0]; + regs->eax = uthread->uu_rval[0]; regs->edx = uthread->uu_rval[1]; - regs->efl &= ~EFL_CF; - } + } } + DEBUG_KPRINT_SYSCALL_UNIX( + "unix_syscall_return: error=%d retval=(%u,%u)\n", + error, regs->eax, regs->edx); } - if (KTRPOINT(p, KTR_SYSRET)) - ktrsysret(p, code, error, uthread->uu_rval[0]); - cancel_enable = callp->sy_cancel; - if (cancel_enable == _SYSCALL_CANCEL_NONE) - uthread->uu_flag &= ~UT_NOTCANCELPT; + uthread->uu_flag &= ~UT_NOTCANCELPT; - /* - * if we're holding the funnel - * than drop it regardless of whether - * we took it on system call entry - */ - exit_funnel_section(); +#if DEBUG || DEVELOPMENT + kern_allocation_name_t + prior __assert_only = thread_set_allocation_name(NULL); + assertf(prior == NULL, "thread_set_allocation_name(\"%s\") not cleared", kern_allocation_get_name(prior)); +#endif /* DEBUG || DEVELOPMENT */ - if (uthread->uu_lowpri_delay) { - /* + if (uthread->uu_lowpri_window) { + /* * task is marked as a low priority I/O type * and the I/O we issued while in this system call * collided with normal I/O operations... 
we'll * delay in order to mitigate the impact of this * task on the normal operation of the system */ - IOSleep(uthread->uu_lowpri_delay); - uthread->uu_lowpri_delay = 0; + throttle_lowpri_io(1); + } + if (!code_is_kdebug_trace(code)) { + KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, + error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid); } - if (code != 180) - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, - error, uthread->uu_rval[0], uthread->uu_rval[1], 0, 0); thread_exception_return(); /* NOTREACHED */ } - -void -munge_wwwlww( - __unused const void *in32, - void *out64) -{ - uint32_t *arg32; - uint64_t *arg64; - - /* we convert in place in out64 */ - arg32 = (uint32_t *) out64; - arg64 = (uint64_t *) out64; - - arg64[5] = arg32[6]; /* wwwlwW */ - arg64[4] = arg32[5]; /* wwwlWw */ - arg32[7] = arg32[4]; /* wwwLww (hi) */ - arg32[6] = arg32[3]; /* wwwLww (lo) */ - arg64[2] = arg32[2]; /* wwWlww */ - arg64[1] = arg32[1]; /* wWwlww */ - arg64[0] = arg32[0]; /* Wwwlww */ -}