]> git.saurik.com Git - apple/xnu.git/blobdiff - bsd/dev/i386/systemcalls.c
xnu-2422.1.72.tar.gz
[apple/xnu.git] / bsd / dev / i386 / systemcalls.c
index f8c09f6fffb30e353ba287f1516db833b2db3f7b..77ecfba3a70c91585faafe684d09ad953578ebb7 100644 (file)
@@ -1,31 +1,29 @@
 /*
- * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
- * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
- * This file contains Original Code and/or Modifications of Original Code 
- * as defined in and that are subject to the Apple Public Source License 
- * Version 2.0 (the 'License'). You may not use this file except in 
- * compliance with the License.  The rights granted to you under the 
- * License may not be used to create, or enable the creation or 
- * redistribution of, unlawful or unlicensed copies of an Apple operating 
- * system, or to circumvent, violate, or enable the circumvention or 
- * violation of, any terms of an Apple operating system software license 
- * agreement.
- *
- * Please obtain a copy of the License at 
- * http://www.opensource.apple.com/apsl/ and read it before using this 
- * file.
- *
- * The Original Code and all software distributed under the License are 
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and 
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ * 
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ * 
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
  * limitations under the License.
- *
- * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
+ * 
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 #include <kern/task.h>
 #include <kern/thread.h>
 #include <kern/clock.h>
 #include <kern/locks.h>
 #include <kern/sched_prim.h>
+#include <kern/debug.h>
 #include <mach/machine/thread_status.h>
 #include <mach/thread_act.h>
+#include <mach/branch_predicates.h>
 
 #include <sys/kernel.h>
 #include <sys/vm.h>
 #include <sys/systm.h>
 #include <sys/user.h>
 #include <sys/errno.h>
-#include <sys/ktrace.h>
 #include <sys/kdebug.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/kauth.h>
 #include <sys/systm.h>
 
-#include <bsm/audit_kernel.h>
+#include <security/audit/audit.h>
 
 #include <i386/seg.h>
 #include <i386/machine_routines.h>
 #include <mach/i386/syscall_sw.h>
 
+#include <machine/pal_routines.h>
+
+#if CONFIG_DTRACE
+extern int32_t dtrace_systrace_syscall(struct proc *, void *, int *);
+extern void dtrace_systrace_syscall_return(unsigned short, int, int *);
+#endif
+
 extern void unix_syscall(x86_saved_state_t *);
 extern void unix_syscall64(x86_saved_state_t *);
-extern void unix_syscall_return(int);
 extern void *find_user_regs(thread_t);
-extern void IOSleep(int);
-extern void exit_funnel_section(void);
 
-extern void Debugger(const char      * message);
+/* dynamically generated at build time based on syscalls.master */
+extern const char *syscallnames[];
+
+/*
+ * This needs to be a single switch so that it's "all on" or "all off",
+ * rather than being turned on for some code paths and not others, as this
+ * has a tendency to introduce "blame the next guy" bugs.
+ */
+#if DEBUG
+#define        FUNNEL_DEBUG    1       /* Check for funnel held on exit */
+#endif
 
 /*
  * Function:   unix_syscall
@@ -75,36 +88,36 @@ extern void Debugger(const char      * message);
 void
 unix_syscall(x86_saved_state_t *state)
 {
-       thread_t        thread;
-       void            *vt; 
-       unsigned short  code;
-       struct sysent   *callp;
-       int             nargs;
-       int             error;
-       int             funnel_type;
-       vm_offset_t     params;
-       struct proc     *p;
-       struct uthread  *uthread;
-       unsigned int cancel_enable;
+       thread_t                thread;
+       void                    *vt;
+       unsigned int            code;
+       struct sysent           *callp;
+
+       int                     error;
+       vm_offset_t             params;
+       struct proc             *p;
+       struct uthread          *uthread;
        x86_saved_state32_t     *regs;
+       boolean_t               is_vfork;
 
        assert(is_saved_state32(state));
        regs = saved_state32(state);
-
+#if DEBUG
        if (regs->eax == 0x800)
                thread_exception_return();
-
+#endif
        thread = current_thread();
        uthread = get_bsdthread_info(thread);
 
        /* Get the approriate proc; may be different from task's for vfork() */
-       if (!(uthread->uu_flag & UT_VFORK))
-               p = (struct proc *)get_bsdtask_info(current_task());
-       else 
+       is_vfork = uthread->uu_flag & UT_VFORK;
+       if (__improbable(is_vfork != 0))
                p = current_proc();
+       else 
+               p = (struct proc *)get_bsdtask_info(current_task());
 
        /* Verify that we are not being called from a task without a proc */
-       if (p == NULL) {
+       if (__improbable(p == NULL)) {
                regs->eax = EPERM;
                regs->efl |= EFL_CF;
                task_terminate_internal(current_task());
@@ -112,24 +125,30 @@ unix_syscall(x86_saved_state_t *state)
                /* NOTREACHED */
        }
 
-       //printf("[scall : eax %x]",  regs->eax);
-       code = regs->eax;
-       params = (vm_offset_t) ((caddr_t)regs->uesp + sizeof (int));
-       callp = (code >= nsysent) ? &sysent[63] : &sysent[code];
+       code = regs->eax & I386_SYSCALL_NUMBER_MASK;
+       DEBUG_KPRINT_SYSCALL_UNIX("unix_syscall: code=%d(%s) eip=%u\n",
+                                                         code, syscallnames[code >= NUM_SYSENT ? 63 : code], (uint32_t)regs->eip);
+       params = (vm_offset_t) (regs->uesp + sizeof (int));
+
+       regs->efl &= ~(EFL_CF);
 
-       if (callp == sysent) {
+       callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
+
+       if (__improbable(callp == sysent)) {
                code = fuword(params);
-               params += sizeof (int);
-               callp = (code >= nsysent) ? &sysent[63] : &sysent[code];
+               params += sizeof(int);
+               callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
        }
+
        vt = (void *)uthread->uu_arg;
+       uthread->uu_ap = vt;
 
-       nargs = callp->sy_narg * sizeof (syscall_arg_t);
-       if (nargs != 0) {
+       if (callp->sy_arg_bytes != 0) {
                sy_munge_t      *mungerp;
+               uint32_t         nargs;
 
-               assert(nargs <= 8);
-
+               assert((unsigned) callp->sy_arg_bytes <= sizeof (uthread->uu_arg));
+               nargs = callp->sy_arg_bytes;
                error = copyin((user_addr_t) params, (char *) vt, nargs);
                if (error) {
                        regs->eax = error;
@@ -137,17 +156,19 @@ unix_syscall(x86_saved_state_t *state)
                        thread_exception_return();
                        /* NOTREACHED */
                }
-               if (code != 180) {
-                       int *ip = (int *)vt;
 
-                       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
-                                             *ip, *(ip+1), *(ip+2), *(ip+3), 0);
+               if (__probable(code != 180)) {
+                       int *ip = (int *)vt;
+
+                       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+                               BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
+                               *ip, *(ip+1), *(ip+2), *(ip+3), 0);
                }
                mungerp = callp->sy_arg_munge32;
 
                /*
                 * If non-NULL, then call the syscall argument munger to
-                * copy in arguments (see xnu/bsd/dev/i386/munge.s); the
+                * copy in arguments (see xnu/bsd/dev/{i386|x86_64}/munge.s); the
                 * first argument is NULL because we are munging in place
                 * after a copyin because the ABI currently doesn't use
                 * registers to pass system call arguments.
@@ -155,92 +176,71 @@ unix_syscall(x86_saved_state_t *state)
                if (mungerp != NULL)
                        (*mungerp)(NULL, vt);
        } else
-               KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
-                                     0, 0, 0, 0, 0);
+               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 
+                       BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
+                       0, 0, 0, 0, 0);
+
        /*
         * Delayed binding of thread credential to process credential, if we
         * are not running with an explicitly set thread credential.
         */
-       if (uthread->uu_ucred != p->p_ucred &&
-           (uthread->uu_flag & UT_SETUID) == 0) {
-               kauth_cred_t old = uthread->uu_ucred;
-               proc_lock(p);
-               uthread->uu_ucred = p->p_ucred;
-               kauth_cred_ref(uthread->uu_ucred);
-               proc_unlock(p);
-               if (old != NOCRED)
-                       kauth_cred_rele(old);
-       }
+       kauth_cred_uthread_update(uthread, p);
 
        uthread->uu_rval[0] = 0;
        uthread->uu_rval[1] = regs->edx;
+       uthread->uu_flag |= UT_NOTCANCELPT;
 
-       cancel_enable = callp->sy_cancel;
-       
-       if (cancel_enable == _SYSCALL_CANCEL_NONE) {
-               uthread->uu_flag |= UT_NOTCANCELPT;
-       } else {
-               if ((uthread->uu_flag & (UT_CANCELDISABLE | UT_CANCEL | UT_CANCELED)) == UT_CANCEL) {
-                       if (cancel_enable == _SYSCALL_CANCEL_PRE) {
-                                       /* system call cancelled; return to handle cancellation */
-                                       regs->eax = (long long)EINTR;
-                                       regs->efl |= EFL_CF;
-                                       thread_exception_return();
-                                       /* NOTREACHED */
-                       } else {
-                               thread_abort_safely(thread);
-                       }
-               }
-       }
-
-       funnel_type = (callp->sy_funnel & FUNNEL_MASK);
-       if (funnel_type == KERNEL_FUNNEL)
-               thread_funnel_set(kernel_flock, TRUE);
 
-       if (KTRPOINT(p, KTR_SYSCALL))
-               ktrsyscall(p, code, callp->sy_narg, vt);
+#ifdef JOE_DEBUG
+        uthread->uu_iocount = 0;
+        uthread->uu_vpindex = 0;
+#endif
 
        AUDIT_SYSCALL_ENTER(code, p, uthread);
        error = (*(callp->sy_call))((void *) p, (void *) vt, &(uthread->uu_rval[0]));
-       AUDIT_SYSCALL_EXIT(error, p, uthread);
-       
-       if (error == ERESTART) {
+        AUDIT_SYSCALL_EXIT(code, p, uthread, error);
+
+#ifdef JOE_DEBUG
+        if (uthread->uu_iocount)
+                printf("system call returned with uu_iocount != 0\n");
+#endif
+#if CONFIG_DTRACE
+       uthread->t_dtrace_errno = error;
+#endif /* CONFIG_DTRACE */
+
+       if (__improbable(error == ERESTART)) {
                /*
                 * Move the user's pc back to repeat the syscall:
                 * 5 bytes for a sysenter, or 2 for an int 8x.
                 * The SYSENTER_TF_CS covers single-stepping over a sysenter
                 * - see debug trap handler in idt.s/idt64.s
                 */
-               if (regs->cs == SYSENTER_CS || regs->cs == SYSENTER_TF_CS)
-                       regs->eip -= 5;
-               else
-                       regs->eip -= 2;
+
+               pal_syscall_restart(thread, state);
        }
        else if (error != EJUSTRETURN) {
-               if (error) {
+               if (__improbable(error)) {
                    regs->eax = error;
                    regs->efl |= EFL_CF;        /* carry bit */
                } else { /* (not error) */
                    regs->eax = uthread->uu_rval[0];
                    regs->edx = uthread->uu_rval[1];
-                   regs->efl &= ~EFL_CF;
                } 
        }
 
-       if (KTRPOINT(p, KTR_SYSRET))
-               ktrsysret(p, code, error, uthread->uu_rval[0]);
-
-       if (cancel_enable == _SYSCALL_CANCEL_NONE)
-                uthread->uu_flag &= ~UT_NOTCANCELPT;
+       DEBUG_KPRINT_SYSCALL_UNIX(
+               "unix_syscall: error=%d retval=(%u,%u)\n",
+               error, regs->eax, regs->edx);
 
+       uthread->uu_flag &= ~UT_NOTCANCELPT;
+#if FUNNEL_DEBUG
        /*
-        * if we're holding the funnel
-        * than drop it regardless of whether
-        * we took it on system call entry
+        * if we're holding the funnel, panic
         */
-       exit_funnel_section();
+       syscall_exit_funnelcheck();
+#endif /* FUNNEL_DEBUG */
 
-       if (uthread->uu_lowpri_delay) {
+       if (__improbable(uthread->uu_lowpri_window)) {
                /*
                 * task is marked as a low priority I/O type
                 * and the I/O we issued while in this system call
@@ -248,12 +248,16 @@ unix_syscall(x86_saved_state_t *state)
                 * delay in order to mitigate the impact of this
                 * task on the normal operation of the system
                 */
-               IOSleep(uthread->uu_lowpri_delay);
-               uthread->uu_lowpri_delay = 0;
+               throttle_lowpri_io(1);
+       }
+       if (__probable(code != 180))
+               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 
+                       BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
+                       error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0);
+
+       if (__improbable(!is_vfork && callp->sy_call == (sy_call_t *)execve && !error)) {
+               pal_execve_return(thread);
        }
-       if (code != 180)
-               KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
-                                     error, uthread->uu_rval[0], uthread->uu_rval[1], 0, 0);
 
        thread_exception_return();
        /* NOTREACHED */
@@ -264,34 +268,32 @@ void
 unix_syscall64(x86_saved_state_t *state)
 {
        thread_t        thread;
-       unsigned short  code;
+       unsigned int    code;
        struct sysent   *callp;
        void            *uargp;
        int             args_in_regs;
        int             error;
-       int             funnel_type;
        struct proc     *p;
        struct uthread  *uthread;
-       unsigned int cancel_enable;
        x86_saved_state64_t *regs;
 
        assert(is_saved_state64(state));
        regs = saved_state64(state);
-
+#if    DEBUG
        if (regs->rax == 0x2000800)
                thread_exception_return();
-
+#endif
        thread = current_thread();
        uthread = get_bsdthread_info(thread);
 
        /* Get the approriate proc; may be different from task's for vfork() */
-       if (!(uthread->uu_flag & UT_VFORK))
+       if (__probable(!(uthread->uu_flag & UT_VFORK)))
                p = (struct proc *)get_bsdtask_info(current_task());
        else 
                p = current_proc();
 
        /* Verify that we are not being called from a task without a proc */
-       if (p == NULL) {
+       if (__improbable(p == NULL)) {
                regs->rax = EPERM;
                regs->isf.rflags |= EFL_CF;
                task_terminate_internal(current_task());
@@ -301,113 +303,93 @@ unix_syscall64(x86_saved_state_t *state)
        args_in_regs = 6;
 
        code = regs->rax & SYSCALL_NUMBER_MASK;
-       callp = (code >= nsysent) ? &sysent[63] : &sysent[code];
+       DEBUG_KPRINT_SYSCALL_UNIX(
+               "unix_syscall64: code=%d(%s) rip=%llx\n",
+               code, syscallnames[code >= NUM_SYSENT ? 63 : code], regs->isf.rip);
+       callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
        uargp = (void *)(&regs->rdi);
 
-       if (callp == sysent) {
+       if (__improbable(callp == sysent)) {
                /*
                 * indirect system call... system call number
                 * passed as 'arg0'
                 */
                code = regs->rdi;
-               callp = (code >= nsysent) ? &sysent[63] : &sysent[code];
+               callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
                uargp = (void *)(&regs->rsi);
                args_in_regs = 5;
        }
+       uthread->uu_ap = uargp;
 
        if (callp->sy_narg != 0) {
                if (code != 180) {
-                       uint64_t *ip = (uint64_t *)uargp;
+                       uint64_t *ip = (uint64_t *)uargp;
 
-                       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
-                                             (int)(*ip), (int)(*(ip+1)), (int)(*(ip+2)), (int)(*(ip+3)), 0);
+                       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 
+                               BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
+                               (int)(*ip), (int)(*(ip+1)), (int)(*(ip+2)), (int)(*(ip+3)), 0);
                }
-               assert(callp->sy_narg <= 8);
+               assert(callp->sy_narg <= 8);
 
-               if (callp->sy_narg > args_in_regs) {
-                       int copyin_count;
+               if (__improbable(callp->sy_narg > args_in_regs)) {
+                       int copyin_count;
 
-                       copyin_count = (callp->sy_narg - args_in_regs) * sizeof(uint64_t);
+                       copyin_count = (callp->sy_narg - args_in_regs) * sizeof(uint64_t);
 
-                       error = copyin((user_addr_t)(regs->isf.rsp + sizeof(user_addr_t)), (char *)&regs->v_arg6, copyin_count);
+                       error = copyin((user_addr_t)(regs->isf.rsp + sizeof(user_addr_t)), (char *)&regs->v_arg6, copyin_count);
                        if (error) {
-                               regs->rax = error;
+                               regs->rax = error;
                                regs->isf.rflags |= EFL_CF;
                                thread_exception_return();
                                /* NOTREACHED */
                        }
                }
-               /*
-                * XXX Turn 64 bit unsafe calls into nosys()
-                */
-               if (callp->sy_funnel & UNSAFE_64BIT) {
-                       callp = &sysent[63];
-                       goto unsafe;
-               }
-
        } else
-               KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
-                                     0, 0, 0, 0, 0);
-unsafe:
+               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+                       BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
+                       0, 0, 0, 0, 0);
 
        /*
         * Delayed binding of thread credential to process credential, if we
         * are not running with an explicitly set thread credential.
         */
-       if (uthread->uu_ucred != p->p_ucred &&
-           (uthread->uu_flag & UT_SETUID) == 0) {
-               kauth_cred_t old = uthread->uu_ucred;
-               proc_lock(p);
-               uthread->uu_ucred = p->p_ucred;
-               kauth_cred_ref(uthread->uu_ucred);
-               proc_unlock(p);
-               if (old != NOCRED)
-                       kauth_cred_rele(old);
-       }
+       kauth_cred_uthread_update(uthread, p);
 
        uthread->uu_rval[0] = 0;
        uthread->uu_rval[1] = 0;
 
-       cancel_enable = callp->sy_cancel;
        
-       if (cancel_enable == _SYSCALL_CANCEL_NONE) {
-               uthread->uu_flag |= UT_NOTCANCELPT;
-       } else {
-               if ((uthread->uu_flag & (UT_CANCELDISABLE | UT_CANCEL | UT_CANCELED)) == UT_CANCEL) {
-                       if (cancel_enable == _SYSCALL_CANCEL_PRE) {
-                                       /* system call cancelled; return to handle cancellation */
-                                       regs->rax = EINTR;
-                                       regs->isf.rflags |= EFL_CF;
-                                       thread_exception_return();
-                                       /* NOTREACHED */
-                       } else {
-                               thread_abort_safely(thread);
-                       }
-               }
-       }
+       uthread->uu_flag |= UT_NOTCANCELPT;
 
-       funnel_type = (callp->sy_funnel & FUNNEL_MASK);
-       if (funnel_type == KERNEL_FUNNEL)
-               thread_funnel_set(kernel_flock, TRUE);
-
-       if (KTRPOINT(p, KTR_SYSCALL))
-               ktrsyscall(p, code, callp->sy_narg, uargp);
+#ifdef JOE_DEBUG
+        uthread->uu_iocount = 0;
+        uthread->uu_vpindex = 0;
+#endif
 
        AUDIT_SYSCALL_ENTER(code, p, uthread);
        error = (*(callp->sy_call))((void *) p, uargp, &(uthread->uu_rval[0]));
-       AUDIT_SYSCALL_EXIT(error, p, uthread);
+        AUDIT_SYSCALL_EXIT(code, p, uthread, error);
+
+#ifdef JOE_DEBUG
+        if (uthread->uu_iocount)
+               printf("system call returned with uu_iocount != 0\n");
+#endif
+
+#if CONFIG_DTRACE
+       uthread->t_dtrace_errno = error;
+#endif /* CONFIG_DTRACE */
        
-       if (error == ERESTART) {
+       if (__improbable(error == ERESTART)) {
                /*
                 * all system calls come through via the syscall instruction
                 * in 64 bit mode... its 2 bytes in length
                 * move the user's pc back to repeat the syscall:
                 */
-               regs->isf.rip -= 2;
+               pal_syscall_restart( thread, state );
        }
        else if (error != EJUSTRETURN) {
-               if (error) {
-                       regs->rax = error;
+               if (__improbable(error)) {
+                       regs->rax = error;
                        regs->isf.rflags |= EFL_CF;     /* carry bit */
                } else { /* (not error) */
 
@@ -424,6 +406,7 @@ unsafe:
                        case _SYSCALL_RET_ADDR_T:
                        case _SYSCALL_RET_SIZE_T:
                        case _SYSCALL_RET_SSIZE_T:
+                       case _SYSCALL_RET_UINT64_T:
                                regs->rax = *((uint64_t *)(&uthread->uu_rval[0]));
                                regs->rdx = 0;
                                break;
@@ -437,20 +420,20 @@ unsafe:
                } 
        }
 
-       if (KTRPOINT(p, KTR_SYSRET))
-               ktrsysret(p, code, error, uthread->uu_rval[0]);
-
-       if (cancel_enable == _SYSCALL_CANCEL_NONE)
-                uthread->uu_flag &= ~UT_NOTCANCELPT;
+       DEBUG_KPRINT_SYSCALL_UNIX(
+               "unix_syscall64: error=%d retval=(%llu,%llu)\n",
+               error, regs->rax, regs->rdx);
+       
+       uthread->uu_flag &= ~UT_NOTCANCELPT;
 
+#if FUNNEL_DEBUG       
        /*
-        * if we're holding the funnel
-        * than drop it regardless of whether
-        * we took it on system call entry
+        * if we're holding the funnel, panic
         */
-       exit_funnel_section();
+       syscall_exit_funnelcheck();
+#endif /* FUNNEL_DEBUG */
 
-       if (uthread->uu_lowpri_delay) {
+       if (__improbable(uthread->uu_lowpri_window)) {
                /*
                 * task is marked as a low priority I/O type
                 * and the I/O we issued while in this system call
@@ -458,12 +441,12 @@ unsafe:
                 * delay in order to mitigate the impact of this
                 * task on the normal operation of the system
                 */
-               IOSleep(uthread->uu_lowpri_delay);
-               uthread->uu_lowpri_delay = 0;
+               throttle_lowpri_io(1);
        }
-       if (code != 180)
-               KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
-                                     error, uthread->uu_rval[0], uthread->uu_rval[1], 0, 0);
+       if (__probable(code != 180))
+               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 
+                       BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
+                       error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0);
 
        thread_exception_return();
        /* NOTREACHED */
@@ -476,114 +459,128 @@ unix_syscall_return(int error)
        thread_t                thread;
        struct uthread          *uthread;
        struct proc *p;
-       unsigned short code;
+       unsigned int code;
        vm_offset_t params;
        struct sysent *callp;
-       unsigned int cancel_enable;
 
        thread = current_thread();
        uthread = get_bsdthread_info(thread);
 
+       pal_register_cache_state(thread, DIRTY);
+
        p = current_proc();
 
        if (proc_is64bit(p)) {
-               x86_saved_state64_t *regs;
+               x86_saved_state64_t *regs;
 
                regs = saved_state64(find_user_regs(thread));
 
-               /* reconstruct code for tracing before blasting rax */
+               /* reconstruct code for tracing before blasting rax */
                code = regs->rax & SYSCALL_NUMBER_MASK;
-               callp = (code >= nsysent) ? &sysent[63] : &sysent[code];
+               callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
 
                if (callp == sysent)
-                       /*
+                       /*
                         * indirect system call... system call number
                         * passed as 'arg0'
                         */
-                       code = regs->rdi;
+                       code = regs->rdi;
+
+#if CONFIG_DTRACE
+               if (callp->sy_call == dtrace_systrace_syscall)
+                       dtrace_systrace_syscall_return( code, error, uthread->uu_rval );
+#endif /* CONFIG_DTRACE */
+               AUDIT_SYSCALL_EXIT(code, p, uthread, error);
 
                if (error == ERESTART) {
-                       /*
-                        * all system calls come through via the syscall instruction
-                        * in 64 bit mode... its 2 bytes in length
-                        * move the user's pc back to repeat the syscall:
+                       /*
+                        * repeat the syscall
                         */
-                       regs->isf.rip -= 2;
+                       pal_syscall_restart( thread, find_user_regs(thread) );
                }
                else if (error != EJUSTRETURN) {
-                       if (error) {
-                               regs->rax = error;
+                       if (error) {
+                               regs->rax = error;
                                regs->isf.rflags |= EFL_CF;     /* carry bit */
                        } else { /* (not error) */
 
-                               switch (callp->sy_return_type) {
+                               switch (callp->sy_return_type) {
                                case _SYSCALL_RET_INT_T:
-                                       regs->rax = uthread->uu_rval[0];
+                                       regs->rax = uthread->uu_rval[0];
                                        regs->rdx = uthread->uu_rval[1];
                                        break;
                                case _SYSCALL_RET_UINT_T:
-                                       regs->rax = ((u_int)uthread->uu_rval[0]);
+                                       regs->rax = ((u_int)uthread->uu_rval[0]);
                                        regs->rdx = ((u_int)uthread->uu_rval[1]);
                                        break;
                                case _SYSCALL_RET_OFF_T:
                                case _SYSCALL_RET_ADDR_T:
                                case _SYSCALL_RET_SIZE_T:
                                case _SYSCALL_RET_SSIZE_T:
-                                       regs->rax = *((uint64_t *)(&uthread->uu_rval[0]));
+                               case _SYSCALL_RET_UINT64_T:
+                                       regs->rax = *((uint64_t *)(&uthread->uu_rval[0]));
                                        regs->rdx = 0;
                                        break;
                                case _SYSCALL_RET_NONE:
-                                       break;
+                                       break;
                                default:
-                                       panic("unix_syscall: unknown return type");
+                                       panic("unix_syscall: unknown return type");
                                        break;
                                }
                                regs->isf.rflags &= ~EFL_CF;
                        } 
                }
+               DEBUG_KPRINT_SYSCALL_UNIX(
+                       "unix_syscall_return: error=%d retval=(%llu,%llu)\n",
+                       error, regs->rax, regs->rdx);
        } else {
-               x86_saved_state32_t     *regs;
+               x86_saved_state32_t     *regs;
 
                regs = saved_state32(find_user_regs(thread));
 
+               regs->efl &= ~(EFL_CF);
                /* reconstruct code for tracing before blasting eax */
-               code = regs->eax;
-               callp = (code >= nsysent) ? &sysent[63] : &sysent[code];
+               code = regs->eax & I386_SYSCALL_NUMBER_MASK;
+               callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
+
+#if CONFIG_DTRACE
+               if (callp->sy_call == dtrace_systrace_syscall)
+                       dtrace_systrace_syscall_return( code, error, uthread->uu_rval );
+#endif /* CONFIG_DTRACE */
+               AUDIT_SYSCALL_EXIT(code, p, uthread, error);
 
                if (callp == sysent) {
-                       params = (vm_offset_t) ((caddr_t)regs->uesp + sizeof (int));
-                       code = fuword(params);
+                       params = (vm_offset_t) (regs->uesp + sizeof (int));
+                       code = fuword(params);
                }
                if (error == ERESTART) {
-                       regs->eip -= ((regs->cs & 0xffff) == SYSENTER_CS) ? 5 : 2;
+                       pal_syscall_restart( thread, find_user_regs(thread) );
                }
                else if (error != EJUSTRETURN) {
-                       if (error) {
-                               regs->eax = error;
+                       if (error) {
+                               regs->eax = error;
                                regs->efl |= EFL_CF;    /* carry bit */
                        } else { /* (not error) */
-                               regs->eax = uthread->uu_rval[0];
+                               regs->eax = uthread->uu_rval[0];
                                regs->edx = uthread->uu_rval[1];
-                               regs->efl &= ~EFL_CF;
                        } 
                }
+               DEBUG_KPRINT_SYSCALL_UNIX(
+                       "unix_syscall_return: error=%d retval=(%u,%u)\n",
+                       error, regs->eax, regs->edx);
        }
-       if (KTRPOINT(p, KTR_SYSRET))
-               ktrsysret(p, code, error, uthread->uu_rval[0]);
 
-       cancel_enable = callp->sy_cancel;
 
-       if (cancel_enable == _SYSCALL_CANCEL_NONE)
-                uthread->uu_flag &= ~UT_NOTCANCELPT;
+       uthread->uu_flag &= ~UT_NOTCANCELPT;
 
+#if FUNNEL_DEBUG       
        /*
-        * if we're holding the funnel
-        * than drop it regardless of whether
-        * we took it on system call entry
+        * if we're holding the funnel, panic
         */
-       exit_funnel_section();
+       syscall_exit_funnelcheck();
+#endif /* FUNNEL_DEBUG */
 
-       if (uthread->uu_lowpri_delay) {
+       if (uthread->uu_lowpri_window) {
                /*
                 * task is marked as a low priority I/O type
                 * and the I/O we issued while in this system call
@@ -591,34 +588,14 @@ unix_syscall_return(int error)
                 * delay in order to mitigate the impact of this
                 * task on the normal operation of the system
                 */
-               IOSleep(uthread->uu_lowpri_delay);
-               uthread->uu_lowpri_delay = 0;
+               throttle_lowpri_io(1);
        }
        if (code != 180)
-               KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
-                                     error, uthread->uu_rval[0], uthread->uu_rval[1], 0, 0);
+               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 
+                       BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
+                       error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0);
 
        thread_exception_return();
        /* NOTREACHED */
 }
 
-void
-munge_wwwlww(
-       __unused const void     *in32,
-       void                    *out64)
-{
-       uint32_t        *arg32;
-       uint64_t        *arg64;
-
-       /* we convert in place in out64 */
-       arg32 = (uint32_t *) out64;
-       arg64 = (uint64_t *) out64;
-
-       arg64[5] = arg32[6];    /* wwwlwW */
-       arg64[4] = arg32[5];    /* wwwlWw */
-       arg32[7] = arg32[4];    /* wwwLww (hi) */
-       arg32[6] = arg32[3];    /* wwwLww (lo) */
-       arg64[2] = arg32[2];    /* wwWlww */
-       arg64[1] = arg32[1];    /* wWwlww */
-       arg64[0] = arg32[0];    /* Wwwlww */
-}