X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/6d2010ae8f7a6078e10b361c6962983bab233e0f..527f99514973766e9c0382a4d8550dfb00f54939:/osfmk/x86_64/idt64.s?ds=inline diff --git a/osfmk/x86_64/idt64.s b/osfmk/x86_64/idt64.s index fe6cb1295..13f3f0218 100644 --- a/osfmk/x86_64/idt64.s +++ b/osfmk/x86_64/idt64.s @@ -27,7 +27,7 @@ */ #include #include -#include +#include #include #include #include @@ -66,32 +66,41 @@ * syscall - synchronous system call request * fatal - fatal traps */ - /* - * Handlers: + * Indices of handlers for each exception type. */ -#define HNDL_ALLINTRS EXT(hndl_allintrs) -#define HNDL_ALLTRAPS EXT(hndl_alltraps) -#define HNDL_SYSENTER EXT(hndl_sysenter) -#define HNDL_SYSCALL EXT(hndl_syscall) -#define HNDL_UNIX_SCALL EXT(hndl_unix_scall) -#define HNDL_MACH_SCALL EXT(hndl_mach_scall) -#define HNDL_MDEP_SCALL EXT(hndl_mdep_scall) -#define HNDL_DIAG_SCALL EXT(hndl_diag_scall) -#define HNDL_DOUBLE_FAULT EXT(hndl_double_fault) -#define HNDL_MACHINE_CHECK EXT(hndl_machine_check) - - -#if 1 -#define PUSH_FUNCTION(func) \ - sub $8, %rsp ;\ - push %rax ;\ - leaq func(%rip), %rax ;\ - movq %rax, 8(%rsp) ;\ - pop %rax -#else -#define PUSH_FUNCTION(func) pushq func -#endif +#define HNDL_ALLINTRS 0 +#define HNDL_ALLTRAPS 1 +#define HNDL_SYSENTER 2 +#define HNDL_SYSCALL 3 +#define HNDL_UNIX_SCALL 4 +#define HNDL_MACH_SCALL 5 +#define HNDL_MDEP_SCALL 6 +#define HNDL_DOUBLE_FAULT 7 +#define HNDL_MACHINE_CHECK 8 + +/* Begin double-mapped descriptor section */ + +.section __HIB, __desc +.globl EXT(idt64_hndl_table0) +EXT(idt64_hndl_table0): + .quad EXT(ks_dispatch) + .quad EXT(ks_64bit_return) + .quad 0 /* Populated with CPU shadow displacement*/ + .quad EXT(ks_return) + +EXT(idt64_hndl_table1): + .quad EXT(hndl_allintrs) + .quad EXT(hndl_alltraps) + .quad EXT(hndl_sysenter) + .quad EXT(hndl_syscall) + .quad EXT(hndl_unix_scall) + .quad EXT(hndl_mach_scall) + .quad EXT(hndl_mdep_scall) + .quad EXT(hndl_double_fault) + .quad EXT(hndl_machine_check) +.text + /* The wrapper for all non-special traps/interrupts */ /* Everything up to PUSH_FUNCTION is just to output @@ -102,12 +111,12 @@ push %rax ;\ POSTCODE2(0x6400+n) ;\ pop %rax ;\ - PUSH_FUNCTION(f) ;\ + pushq $(f) ;\ pushq $(n) ;\ jmp L_dispatch #else #define IDT_ENTRY_WRAPPER(n, f) \ - PUSH_FUNCTION(f) ;\ + pushq $(f) ;\ pushq $(n) ;\ jmp L_dispatch #endif @@ -133,98 +142,357 @@ /* A trap with a special-case handler, hence we don't need to define anything */ #define TRAP_SPC(n, f) -#define TRAP_IST(n, f) +#define TRAP_IST1(n, f) +#define TRAP_IST2(n, f) #define USER_TRAP_SPC(n, f) +/* Begin double-mapped text section */ +.section __HIB, __text /* Generate all the stubs */ #include "idt_table.h" +Entry(idt64_page_fault) + pushq $(HNDL_ALLTRAPS) + push $(T_PAGE_FAULT) + jmp L_dispatch + +Entry(idt64_debug) + push $0 /* error code */ + pushq $(HNDL_ALLTRAPS) + pushq $(T_DEBUG) + jmp L_dispatch +/* + * Legacy interrupt gate System call handlers. + * These are entered via a syscall interrupt. The system call number in %rax + * is saved to the error code slot in the stack frame. We then branch to the + * common state saving code. + */ + +#ifndef UNIX_INT +#error NO UNIX INT!!! 
+#endif +Entry(idt64_unix_scall) + pushq %rax /* save system call number */ + pushq $(HNDL_UNIX_SCALL) + pushq $(UNIX_INT) + jmp L_dispatch + +Entry(idt64_mach_scall) + pushq %rax /* save system call number */ + pushq $(HNDL_MACH_SCALL) + pushq $(MACH_INT) + jmp L_dispatch + +Entry(idt64_mdep_scall) + pushq %rax /* save system call number */ + pushq $(HNDL_MDEP_SCALL) + pushq $(MACHDEP_INT) + jmp L_dispatch + +/* + * For GP/NP/SS faults, we use the IST1 stack. + * For faults from user-space, we have to copy the machine state to the + * PCB stack and then dispatch as normal. + * For faults in kernel-space, we need to scrub for kernel exit faults and + * treat these as user-space faults. But for all other kernel-space faults + * we continue to run on the IST1 stack and we dispatch to handle the fault + * as fatal. + */ +Entry(idt64_gen_prot) + pushq $(HNDL_ALLTRAPS) + pushq $(T_GENERAL_PROTECTION) + jmp L_dispatch + +Entry(idt64_stack_fault) + pushq $(HNDL_ALLTRAPS) + pushq $(T_STACK_FAULT) + jmp L_dispatch + +Entry(idt64_segnp) + pushq $(HNDL_ALLTRAPS) + pushq $(T_SEGMENT_NOT_PRESENT) + jmp L_dispatch + +/* + * Fatal exception handlers: + */ +Entry(idt64_db_task_dbl_fault) + pushq $(HNDL_DOUBLE_FAULT) + pushq $(T_DOUBLE_FAULT) + jmp L_dispatch + +Entry(idt64_db_task_stk_fault) + pushq $(HNDL_DOUBLE_FAULT) + pushq $(T_STACK_FAULT) + jmp L_dispatch + +Entry(idt64_mc) + push $(0) /* Error */ + pushq $(HNDL_MACHINE_CHECK) + pushq $(T_MACHINE_CHECK) + jmp L_dispatch + +/* + * NMI + * This may or may not be fatal but extreme care is required + * because it may fall when control was already in another trampoline. + * + * We get here on IST2 stack which is used for NMIs only. + */ +Entry(idt64_nmi) + push %rax /* save RAX to ISF64_ERR */ + push %rcx /* save RCX to ISF64_TRAPFN */ + push %rdx /* save RDX to ISF64_TRAPNO */ + jmp L_dispatch + +Entry(idt64_double_fault) + pushq $(HNDL_DOUBLE_FAULT) + pushq $(T_DOUBLE_FAULT) + jmp L_dispatch + +Entry(hi64_syscall) +Entry(idt64_syscall) + swapgs + /* Use RAX as a temporary by shifting its contents into R11[32:63] + * The systemcall number is defined to be a 32-bit quantity, as is + * RFLAGS. + */ + shlq $32, %rax + or %rax, %r11 +.globl EXT(dblsyscall_patch_point) +EXT(dblsyscall_patch_point): +// movabsq $0x12345678ABCDEFFFULL, %rax + /* Generate offset to the double-mapped per-CPU data shadow + * into RAX + */ + leaq EXT(idt64_hndl_table0)(%rip), %rax + mov 16(%rax), %rax + mov %rsp, %gs:CPU_UBER_TMP(%rax) /* save user stack */ + mov %gs:CPU_ESTACK(%rax), %rsp /* switch stack to per-cpu estack */ + sub $(ISF64_SIZE), %rsp + + /* + * Synthesize an ISF frame on the exception stack + */ + movl $(USER_DS), ISF64_SS(%rsp) + mov %rcx, ISF64_RIP(%rsp) /* rip */ + + mov %gs:CPU_UBER_TMP(%rax), %rcx + mov %rcx, ISF64_RSP(%rsp) /* user stack --changed */ + + mov %r11, %rax + shrq $32, %rax /* Restore RAX */ + mov %r11d, %r11d /* Clear r11[32:63] */ + + mov %r11, ISF64_RFLAGS(%rsp) /* rflags */ + movl $(SYSCALL_CS), ISF64_CS(%rsp) /* cs - a pseudo-segment */ + mov %rax, ISF64_ERR(%rsp) /* err/rax - syscall code */ + movq $(HNDL_SYSCALL), ISF64_TRAPFN(%rsp) + movq $(T_SYSCALL), ISF64_TRAPNO(%rsp) /* trapno */ + swapgs + jmp L_dispatch /* this can only be 64-bit */ + +Entry(hi64_sysenter) +Entry(idt64_sysenter) + /* Synthesize an interrupt stack frame onto the + * exception stack. + */ + push $(USER_DS) /* ss */ + push %rcx /* uesp */ + pushf /* flags */ + /* + * Clear, among others, the Nested Task (NT) flags bit; + * this is zeroed by INT, but not by SYSENTER. 
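+	 * (Left set, NT would make a later IRET attempt a task return through
+	 * the TSS back-link.)  SYSENTER also enters with IF clear, so EFL_IF is
+	 * OR'ed back into the saved RFLAGS image below before dispatching.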
+ */ + push $0 + popf + push $(SYSENTER_CS) /* cs */ +L_sysenter_continue: + push %rdx /* eip */ + push %rax /* err/eax - syscall code */ + pushq $(HNDL_SYSENTER) + pushq $(T_SYSENTER) + orl $(EFL_IF), ISF64_RFLAGS(%rsp) + jmp L_dispatch + /* * Common dispatch point. * Determine what mode has been interrupted and save state accordingly. + * Here with: + * rsp from user-space: interrupt state in PCB, or + * from kernel-space: interrupt state in kernel or interrupt stack + * GSBASE from user-space: pthread area, or + * from kernel-space: cpu_data */ -L_dispatch: - cmpl $(KERNEL64_CS), ISF64_CS(%rsp) - je L_64bit_dispatch +L_dispatch: + pushq %rax + testb $3, 8+ISF64_CS(%rsp) + jz 1f + swapgs + leaq EXT(idt64_hndl_table0)(%rip), %rax + mov 16(%rax), %rax + + mov %gs:CPU_TASK_CR3(%rax), %rax + mov %rax, %cr3 +#if DEBUG + mov %rax, %gs:CPU_ENTRY_CR3 +#endif +1: + /* The text/data relationship here must be preserved in the doublemap, and the contents must be remapped */ + leaq EXT(idt64_hndl_table0)(%rip), %rax + /* Indirect branch to non-doublemapped trampolines */ + jmp *(%rax) +/* User return: register restoration and address space switch sequence */ +Entry(ks_64bit_return) + mov R64_R14(%r15), %r14 + mov R64_R13(%r15), %r13 + mov R64_R12(%r15), %r12 + mov R64_R11(%r15), %r11 + mov R64_R10(%r15), %r10 + mov R64_R9(%r15), %r9 + mov R64_R8(%r15), %r8 + mov R64_RSI(%r15), %rsi + mov R64_RDI(%r15), %rdi + mov R64_RBP(%r15), %rbp + mov R64_RDX(%r15), %rdx + mov R64_RCX(%r15), %rcx + mov R64_RBX(%r15), %rbx + mov R64_RAX(%r15), %rax + /* Switch to per-CPU exception stack */ + mov %gs:CPU_ESTACK, %rsp + + /* Synthesize interrupt stack frame from PCB savearea to exception stack */ + push R64_SS(%r15) + push R64_RSP(%r15) + push R64_RFLAGS(%r15) + push R64_CS(%r15) + push R64_RIP(%r15) + + mov R64_R15(%r15), %r15 + cmpq $(KERNEL64_CS), 8(%rsp) + jz 1f + /* Discover user cr3/ASID */ + push %rax + mov %gs:CPU_UCR3, %rax +#if DEBUG + mov %rax, %gs:CPU_EXIT_CR3 +#endif + mov %rax, %cr3 + /* Continue execution on the shared/doublemapped trampoline */ + pop %rax swapgs - +1: + cmpl $(SYSCALL_CS), 8(%rsp) /* test for exit via SYSRET */ + je L_sysret +EXT(ret64_iret): + iretq /* return from interrupt */ +L_sysret: /* - * Check for trap from EFI32, and restore cr3 and rsp if so. - * A trap from EFI32 is fatal. + * Here to restore rcx/r11/rsp and perform the sysret back to user-space. + * rcx user rip + * r11 user rflags + * rsp user stack pointer */ - cmpl $(KERNEL32_CS), ISF64_CS(%rsp) - jne L_dispatch_continue - push %rcx - mov EXT(pal_efi_saved_cr3)(%rip), %rcx - mov %rcx, %cr3 - leaq 0(%rip), %rcx - shr $32, %rcx /* splice the upper 32-bits of rip */ - shl $32, %rsp /* .. and the lower 32-bits of rsp */ - shrd $32, %rcx, %rsp /* to recover the full 64-bits of rsp */ pop %rcx + add $8, %rsp + pop %r11 + pop %rsp + sysretq /* return from system call */ +/* End of double-mapped TEXT */ +.text + +Entry(ks_dispatch) + popq %rax + cmpl $(KERNEL64_CS), ISF64_CS(%rsp) + je EXT(ks_dispatch_kernel) + + mov %rax, %gs:CPU_UBER_TMP + mov %gs:CPU_UBER_ISF, %rax + add $(ISF64_SIZE), %rax -L_dispatch_continue: + xchg %rsp, %rax +/* Memory to memory moves (aint x86 wonderful): + * Transfer the exception frame from the per-CPU exception stack to the + * 'PCB' stack programmed at cswitch. 
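+ * The eight pushes below emit SS, RSP, RFLAGS, CS, RIP, then the error
+ * code and the trapfn/trapno slots, so the frame rebuilt on the PCB stack
+ * has the same ISF64 layout as the one on the exception stack.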
+ */ + push ISF64_SS(%rax) + push ISF64_RSP(%rax) + push ISF64_RFLAGS(%rax) + push ISF64_CS(%rax) + push ISF64_RIP(%rax) + push ISF64_ERR(%rax) + push ISF64_TRAPFN(%rax) + push ISF64_TRAPNO(%rax) + mov %gs:CPU_UBER_TMP, %rax + jmp EXT(ks_dispatch_user) + +Entry (ks_return) + jmp . + +Entry(ks_dispatch_user) cmpl $(TASK_MAP_32BIT), %gs:CPU_TASK_MAP - je L_32bit_dispatch /* 32-bit user task */ - /* fall through to 64bit user dispatch */ + je L_dispatch_U32 /* 32-bit user task */ + +L_dispatch_U64: + subq $(ISS64_OFFSET), %rsp + mov %r15, R64_R15(%rsp) + mov %rsp, %r15 + mov %gs:CPU_KERNEL_STACK, %rsp + jmp L_dispatch_64bit + +Entry(ks_dispatch_kernel) + subq $(ISS64_OFFSET), %rsp + mov %r15, R64_R15(%rsp) + mov %rsp, %r15 /* * Here for 64-bit user task or kernel */ -L_64bit_dispatch: - subq $(ISS64_OFFSET), %rsp - movl $(SS_64), SS_FLAVOR(%rsp) +L_dispatch_64bit: + movl $(SS_64), SS_FLAVOR(%r15) - cld - /* * Save segment regs - for completeness since theyre not used. */ - mov %fs, R64_FS(%rsp) - mov %gs, R64_GS(%rsp) + movl %fs, R64_FS(%r15) + movl %gs, R64_GS(%r15) /* Save general-purpose registers */ - mov %rax, R64_RAX(%rsp) - mov %rcx, R64_RCX(%rsp) - mov %rbx, R64_RBX(%rsp) - mov %rbp, R64_RBP(%rsp) - mov %r11, R64_R11(%rsp) - mov %r12, R64_R12(%rsp) - mov %r13, R64_R13(%rsp) - mov %r14, R64_R14(%rsp) - mov %r15, R64_R15(%rsp) + mov %rax, R64_RAX(%r15) + mov %rbx, R64_RBX(%r15) + mov %rcx, R64_RCX(%r15) + mov %rdx, R64_RDX(%r15) + mov %rbp, R64_RBP(%r15) + mov %rdi, R64_RDI(%r15) + mov %rsi, R64_RSI(%r15) + mov %r8, R64_R8(%r15) + mov %r9, R64_R9(%r15) + mov %r10, R64_R10(%r15) + mov %r11, R64_R11(%r15) + mov %r12, R64_R12(%r15) + mov %r13, R64_R13(%r15) + mov %r14, R64_R14(%r15) /* cr2 is significant only for page-faults */ mov %cr2, %rax - mov %rax, R64_CR2(%rsp) - - /* Other registers (which may contain syscall args) */ - mov %rdi, R64_RDI(%rsp) /* arg0 .. */ - mov %rsi, R64_RSI(%rsp) - mov %rdx, R64_RDX(%rsp) - mov %r10, R64_R10(%rsp) - mov %r8, R64_R8(%rsp) - mov %r9, R64_R9(%rsp) /* .. arg5 */ + mov %rax, R64_CR2(%r15) - mov R64_TRAPNO(%rsp), %ebx /* %ebx := trapno for later */ - mov R64_TRAPFN(%rsp), %rdx /* %rdx := trapfn for later */ - mov R64_CS(%rsp), %esi /* %esi := cs for later */ + mov R64_TRAPNO(%r15), %ebx /* %ebx := trapno for later */ + mov R64_TRAPFN(%r15), %rdx /* %rdx := trapfn for later */ + mov R64_CS(%r15), %esi /* %esi := cs for later */ - jmp L_common_dispatch + jmp L_common_dispatch L_64bit_entry_reject: /* * Here for a 64-bit user attempting an invalid kernel entry. */ - pushq %rax - leaq HNDL_ALLTRAPS(%rip), %rax - movq %rax, ISF64_TRAPFN+8(%rsp) - popq %rax + movq $(HNDL_ALLTRAPS), ISF64_TRAPFN(%rsp) movq $(T_INVALID_OPCODE), ISF64_TRAPNO(%rsp) - jmp L_64bit_dispatch + jmp L_dispatch_U64 -L_32bit_entry_check: +Entry(ks_32bit_entry_check) /* * Check we're not a confused 64-bit user. 
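 * (A 64-bit task arriving through one of these legacy 32-bit entry points
 * is sent to L_64bit_entry_reject and raised as T_INVALID_OPCODE.)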
*/ @@ -232,103 +500,114 @@ L_32bit_entry_check: jne L_64bit_entry_reject /* fall through to 32-bit handler: */ -L_32bit_dispatch: /* 32-bit user task */ - subq $(ISC32_OFFSET), %rsp - movl $(SS_32), SS_FLAVOR(%rsp) +L_dispatch_U32: /* 32-bit user task */ + subq $(ISS64_OFFSET), %rsp + mov %rsp, %r15 + mov %gs:CPU_KERNEL_STACK, %rsp + movl $(SS_32), SS_FLAVOR(%r15) - cld /* * Save segment regs */ - mov %ds, R32_DS(%rsp) - mov %es, R32_ES(%rsp) - mov %fs, R32_FS(%rsp) - mov %gs, R32_GS(%rsp) + movl %ds, R32_DS(%r15) + movl %es, R32_ES(%r15) + movl %fs, R32_FS(%r15) + movl %gs, R32_GS(%r15) /* * Save general 32-bit registers */ - mov %eax, R32_EAX(%rsp) - mov %ebx, R32_EBX(%rsp) - mov %ecx, R32_ECX(%rsp) - mov %edx, R32_EDX(%rsp) - mov %ebp, R32_EBP(%rsp) - mov %esi, R32_ESI(%rsp) - mov %edi, R32_EDI(%rsp) + mov %eax, R32_EAX(%r15) + mov %ebx, R32_EBX(%r15) + mov %ecx, R32_ECX(%r15) + mov %edx, R32_EDX(%r15) + mov %ebp, R32_EBP(%r15) + mov %esi, R32_ESI(%r15) + mov %edi, R32_EDI(%r15) /* Unconditionally save cr2; only meaningful on page faults */ mov %cr2, %rax - mov %eax, R32_CR2(%rsp) + mov %eax, R32_CR2(%r15) /* * Copy registers already saved in the machine state * (in the interrupt stack frame) into the compat save area. */ - mov ISC32_RIP(%rsp), %eax - mov %eax, R32_EIP(%rsp) - mov ISC32_RFLAGS(%rsp), %eax - mov %eax, R32_EFLAGS(%rsp) - mov ISC32_CS(%rsp), %esi /* %esi := %cs for later */ - - mov %esi, R32_CS(%rsp) - mov ISC32_RSP(%rsp), %eax - mov %eax, R32_UESP(%rsp) - mov ISC32_SS(%rsp), %eax - mov %eax, R32_SS(%rsp) -L_32bit_dispatch_after_fault: - mov ISC32_TRAPNO(%rsp), %ebx /* %ebx := trapno for later */ - mov %ebx, R32_TRAPNO(%rsp) - mov ISC32_ERR(%rsp), %eax - mov %eax, R32_ERR(%rsp) - mov ISC32_TRAPFN(%rsp), %rdx /* %rdx := trapfn for later */ + mov R64_RIP(%r15), %eax + mov %eax, R32_EIP(%r15) + mov R64_RFLAGS(%r15), %eax + mov %eax, R32_EFLAGS(%r15) + mov R64_RSP(%r15), %eax + mov %eax, R32_UESP(%r15) + mov R64_SS(%r15), %eax + mov %eax, R32_SS(%r15) +L_dispatch_U32_after_fault: + mov R64_CS(%r15), %esi /* %esi := %cs for later */ + mov %esi, R32_CS(%r15) + mov R64_TRAPNO(%r15), %ebx /* %ebx := trapno for later */ + mov %ebx, R32_TRAPNO(%r15) + mov R64_ERR(%r15), %eax + mov %eax, R32_ERR(%r15) + mov R64_TRAPFN(%r15), %rdx /* %rdx := trapfn for later */ L_common_dispatch: + cld /* Ensure the direction flag is clear in the kernel */ + cmpl $0, EXT(pmap_smap_enabled)(%rip) + je 1f + clac /* Clear EFLAGS.AC if SMAP is present/enabled */ +1: /* - * On entering the kernel, we don't need to switch cr3 + * On entering the kernel, we typically don't switch CR3 * because the kernel shares the user's address space. - * But we mark the kernel's cr3 as "active". - * If, however, the invalid cr3 flag is set, we have to flush tlbs - * since the kernel's mapping was changed while we were in userspace. + * But we mark the kernel's cr3 as "active" for TLB coherency evaluation + * If, however, the CPU's invalid TLB flag is set, we have to invalidate the TLB + * since the kernel pagetables were changed while we were in userspace. * - * But: if global no_shared_cr3 is TRUE we do switch to the kernel's cr3 + * For threads with a mapped pagezero (some WINE games) on non-SMAP platforms, + * we switch to the kernel's address space on entry. Also, + * if the global no_shared_cr3 is TRUE we do switch to the kernel's cr3 * so that illicit accesses to userspace can be trapped. */ mov %gs:CPU_KERNEL_CR3, %rcx mov %rcx, %gs:CPU_ACTIVE_CR3 test $3, %esi /* user/kernel? 
*/ - jz 1f /* skip cr3 reload from kernel */ + jz 2f /* skip cr3 reload from kernel */ xor %rbp, %rbp + cmpl $0, %gs:CPU_PAGEZERO_MAPPED + jnz 11f cmpl $0, EXT(no_shared_cr3)(%rip) - je 1f + je 2f +11: + xor %eax, %eax + movw %gs:CPU_KERNEL_PCID, %ax + or %rax, %rcx mov %rcx, %cr3 /* load kernel cr3 */ - jmp 2f /* and skip tlb flush test */ -1: + jmp 4f /* and skip tlb flush test */ +2: mov %gs:CPU_ACTIVE_CR3+4, %rcx shr $32, %rcx testl %ecx, %ecx - jz 2f + jz 4f movl $0, %gs:CPU_TLB_INVALID - testl $(1<<16), %ecx /* Global? */ - jz 11f mov %cr4, %rcx /* RMWW CR4, for lack of an alternative*/ and $(~CR4_PGE), %rcx mov %rcx, %cr4 or $(CR4_PGE), %rcx mov %rcx, %cr4 - jmp 2f - -11: mov %cr3, %rcx - mov %rcx, %cr3 -2: +4: mov %gs:CPU_ACTIVE_THREAD, %rcx /* Get the active thread */ + testq %rcx, %rcx + je 5f + movl $-1, TH_IOTIER_OVERRIDE(%rcx) /* Reset IO tier override to -1 before handling trap */ cmpq $0, TH_PCB_IDS(%rcx) /* Is there a debug register state? */ - je 3f - mov $0, %rcx /* If so, reset DR7 (the control) */ + je 5f + xor %ecx, %ecx /* If so, reset DR7 (the control) */ mov %rcx, %dr7 -3: +5: incl %gs:hwIntCnt(,%ebx,4) // Bump the trap/intr count /* Dispatch the designated handler */ - jmp *%rdx + leaq EXT(idt64_hndl_table1)(%rip), %rax + jmp *(%rax, %rdx, 8) /* * Control is passed here to return to user. @@ -341,7 +620,7 @@ Entry(ret_to_user) mov %gs:CPU_ACTIVE_THREAD, %rdx movq TH_PCB_IDS(%rdx),%rax /* Obtain this thread's debug state */ - cmpq $0,%rax /* Is there a debug register context? */ + test %rax, %rax /* Is there a debug register context? */ je 2f /* branch if not */ cmpl $(TASK_MAP_32BIT), %gs:CPU_TASK_MAP /* Are we a 32-bit task? */ jne 1f @@ -369,68 +648,116 @@ Entry(ret_to_user) mov %rcx, %gs:CPU_DR7 2: /* - * On exiting the kernel there's no need to switch cr3 since we're + * On exiting the kernel there's typically no need to switch cr3 since we're * already running in the user's address space which includes the - * kernel. Nevertheless, we now mark the task's cr3 as active. - * But, if no_shared_cr3 is set, we do need to switch cr3 at this point. + * kernel. We now mark the task's cr3 as active, for TLB coherency. + * If the target address space has a pagezero mapping present, or + * if no_shared_cr3 is set, we do need to switch cr3 at this point. */ mov %gs:CPU_TASK_CR3, %rcx mov %rcx, %gs:CPU_ACTIVE_CR3 + cmpl $0, %gs:CPU_PAGEZERO_MAPPED + jnz L_cr3_switch_island movl EXT(no_shared_cr3)(%rip), %eax test %eax, %eax /* -no_shared_cr3 */ - jz 3f - mov %rcx, %cr3 -3: + jnz L_cr3_switch_island + +L_cr3_switch_return: mov %gs:CPU_DR7, %rax /* Is there a debug control register?*/ cmp $0, %rax je 4f mov %rax, %dr7 /* Set DR7 */ movq $0, %gs:CPU_DR7 4: - cmpl $(SS_64), SS_FLAVOR(%rsp) /* 64-bit state? */ + cmpl $(SS_64), SS_FLAVOR(%r15) /* 64-bit state? */ je L_64bit_return L_32bit_return: #if DEBUG_IDT64 - cmpl $(SS_32), SS_FLAVOR(%rsp) /* 32-bit state? */ + cmpl $(SS_32), SS_FLAVOR(%r15) /* 32-bit state? */ je 1f cli POSTCODE2(0x6432) - CCALL1(panic_idt64, %rsp) + CCALL1(panic_idt64, %r15) 1: #endif /* DEBUG_IDT64 */ /* * Restore registers into the machine state for iret. + * Here on fault stack and PCB address in R11. 
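+ * The 32-bit values are widened into the 64-bit ISF slots of the same
+ * save area, so the IRET (or sysexit) frame consumed on exit is built
+ * in place.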
*/ - movl R32_EIP(%rsp), %eax - movl %eax, ISC32_RIP(%rsp) - movl R32_EFLAGS(%rsp), %eax - movl %eax, ISC32_RFLAGS(%rsp) - movl R32_CS(%rsp), %eax - movl %eax, ISC32_CS(%rsp) - movl R32_UESP(%rsp), %eax - movl %eax, ISC32_RSP(%rsp) - movl R32_SS(%rsp), %eax - movl %eax, ISC32_SS(%rsp) - + movl R32_EIP(%r15), %eax + movl %eax, R64_RIP(%r15) + movl R32_EFLAGS(%r15), %eax + movl %eax, R64_RFLAGS(%r15) + movl R32_CS(%r15), %eax + movl %eax, R64_CS(%r15) + movl R32_UESP(%r15), %eax + movl %eax, R64_RSP(%r15) + movl R32_SS(%r15), %eax + movl %eax, R64_SS(%r15) + + /* Validate DS/ES/FS/GS segment selectors with the Load Access Rights instruction prior to restoration */ + /* Exempt "known good" statically configured selectors, e.g. USER_DS and 0 */ + cmpl $(USER_DS), R32_DS(%r15) + jz 22f + cmpl $0, R32_DS(%r15) + jz 22f + larw R32_DS(%r15), %ax + jz 22f + movl $(USER_DS), R32_DS(%r15) +22: + cmpl $(USER_DS), R32_ES(%r15) + jz 33f + cmpl $0, R32_ES(%r15) + jz 33f + larw R32_ES(%r15), %ax + jz 33f + movl $(USER_DS), R32_ES(%r15) +33: + cmpl $(USER_DS), R32_FS(%r15) + jz 44f + cmpl $0, R32_FS(%r15) + jz 44f + larw R32_FS(%r15), %ax + jz 44f + movl $(USER_DS), R32_FS(%r15) +44: + cmpl $(USER_CTHREAD), R32_GS(%r15) + jz 55f + cmpl $0, R32_GS(%r15) + jz 55f + larw R32_GS(%r15), %ax + jz 55f + movl $(USER_CTHREAD), R32_GS(%r15) +55: /* * Restore general 32-bit registers */ - movl R32_EAX(%rsp), %eax - movl R32_EBX(%rsp), %ebx - movl R32_ECX(%rsp), %ecx - movl R32_EDX(%rsp), %edx - movl R32_EBP(%rsp), %ebp - movl R32_ESI(%rsp), %esi - movl R32_EDI(%rsp), %edi + movl R32_EAX(%r15), %eax + movl R32_EBX(%r15), %ebx + movl R32_ECX(%r15), %ecx + movl R32_EDX(%r15), %edx + movl R32_EBP(%r15), %ebp + movl R32_ESI(%r15), %esi + movl R32_EDI(%r15), %edi /* - * Restore segment registers. We make take an exception here but - * we've got enough space left in the save frame area to absorb - * a hardware frame plus the trapfn and trapno + * Restore segment registers. A segment exception taken here will + * push state on the IST1 stack and will not affect the "PCB stack". */ + mov %r15, %rsp /* Set the PCB as the stack */ swapgs + + xor %r8, %r8 + xor %r9, %r9 + xor %r10, %r10 + xor %r11, %r11 + xor %r12, %r12 + xor %r13, %r13 + xor %r14, %r14 + xor %r15, %r15 + EXT(ret32_set_ds): movw R32_DS(%rsp), %ds EXT(ret32_set_es): @@ -441,12 +768,12 @@ EXT(ret32_set_gs): movw R32_GS(%rsp), %gs /* pop compat frame + trapno, trapfn and error */ - add $(ISC32_OFFSET)+8+8+8, %rsp - cmp $(SYSENTER_CS),ISF64_CS-8-8-8(%rsp) + add $(ISS64_OFFSET)+8+8+8, %rsp + cmpl $(SYSENTER_CS),ISF64_CS-8-8-8(%rsp) /* test for fast entry/exit */ - je L_fast_exit + je L_fast_exit EXT(ret32_iret): - iretq /* return from interrupt */ + iretq /* return from interrupt */ L_fast_exit: pop %rdx /* user return eip */ @@ -455,204 +782,40 @@ L_fast_exit: popf /* flags - carry denotes failure */ pop %rcx /* user return esp */ sti /* interrupts enabled after sysexit */ - sysexit /* 32-bit sysexit */ + sysexitl /* 32-bit sysexit */ + +L_cr3_switch_island: + xor %eax, %eax + movw %gs:CPU_ACTIVE_PCID, %ax + or %rax, %rcx + mov %rcx, %cr3 + jmp L_cr3_switch_return ret_to_kernel: #if DEBUG_IDT64 - cmpl $(SS_64), SS_FLAVOR(%rsp) /* 64-bit state? */ + cmpl $(SS_64), SS_FLAVOR(%r15) /* 64-bit state? 
*/ je 1f cli POSTCODE2(0x6464) - CCALL1(panic_idt64, %rsp) + CCALL1(panic_idt64, %r15) hlt 1: - cmpl $(KERNEL64_CS), R64_CS(%rsp) + cmpl $(KERNEL64_CS), R64_CS(%r15) je 2f - CCALL1(panic_idt64, %rsp) + CCALL1(panic_idt64, %r15) hlt 2: #endif L_64bit_return: - testb $3, R64_CS(%rsp) /* returning to user-space? */ - jz 1f - swapgs -1: - - /* - * Restore general 64-bit registers - */ - mov R64_R15(%rsp), %r15 - mov R64_R14(%rsp), %r14 - mov R64_R13(%rsp), %r13 - mov R64_R12(%rsp), %r12 - mov R64_R11(%rsp), %r11 - mov R64_R10(%rsp), %r10 - mov R64_R9(%rsp), %r9 - mov R64_R8(%rsp), %r8 - mov R64_RSI(%rsp), %rsi - mov R64_RDI(%rsp), %rdi - mov R64_RBP(%rsp), %rbp - mov R64_RDX(%rsp), %rdx - mov R64_RBX(%rsp), %rbx - mov R64_RCX(%rsp), %rcx - mov R64_RAX(%rsp), %rax - - add $(ISS64_OFFSET)+24, %rsp /* pop saved state frame + - trapno + trapfn and error */ - cmpl $(SYSCALL_CS),ISF64_CS-24(%rsp) - /* test for fast entry/exit */ - je L_sysret -.globl _dump_iretq -EXT(ret64_iret): - iretq /* return from interrupt */ - -L_sysret: - /* - * Here to load rcx/r11/rsp and perform the sysret back to user-space. - * rcx user rip - * r1 user rflags - * rsp user stack pointer - */ - mov ISF64_RIP-24(%rsp), %rcx - mov ISF64_RFLAGS-24(%rsp), %r11 - mov ISF64_RSP-24(%rsp), %rsp - sysretq /* return from systen call */ - - - -/* - * System call handlers. - * These are entered via a syscall interrupt. The system call number in %rax - * is saved to the error code slot in the stack frame. We then branch to the - * common state saving code. - */ - -#ifndef UNIX_INT -#error NO UNIX INT!!! -#endif -Entry(idt64_unix_scall) - swapgs /* switch to kernel gs (cpu_data) */ - pushq %rax /* save system call number */ - PUSH_FUNCTION(HNDL_UNIX_SCALL) - pushq $(UNIX_INT) - jmp L_32bit_entry_check - - -Entry(idt64_mach_scall) - swapgs /* switch to kernel gs (cpu_data) */ - pushq %rax /* save system call number */ - PUSH_FUNCTION(HNDL_MACH_SCALL) - pushq $(MACH_INT) - jmp L_32bit_entry_check - - -Entry(idt64_mdep_scall) - swapgs /* switch to kernel gs (cpu_data) */ - pushq %rax /* save system call number */ - PUSH_FUNCTION(HNDL_MDEP_SCALL) - pushq $(MACHDEP_INT) - jmp L_32bit_entry_check - - -Entry(idt64_diag_scall) - swapgs /* switch to kernel gs (cpu_data) */ - push %rax /* save system call number */ - PUSH_FUNCTION(HNDL_DIAG_SCALL) - pushq $(DIAG_INT) - jmp L_32bit_entry_check - -Entry(hi64_syscall) -Entry(idt64_syscall) -L_syscall_continue: - swapgs /* Kapow! get per-cpu data area */ - mov %rsp, %gs:CPU_UBER_TMP /* save user stack */ - mov %gs:CPU_UBER_ISF, %rsp /* switch stack to pcb */ - /* - * Save values in the ISF frame in the PCB - * to cons up the saved machine state. + * Restore general 64-bit registers. + * Here on fault stack and PCB address in R15. */ - movl $(USER_DS), ISF64_SS(%rsp) - movl $(SYSCALL_CS), ISF64_CS(%rsp) /* cs - a pseudo-segment */ - mov %r11, ISF64_RFLAGS(%rsp) /* rflags */ - mov %rcx, ISF64_RIP(%rsp) /* rip */ - mov %gs:CPU_UBER_TMP, %rcx - mov %rcx, ISF64_RSP(%rsp) /* user stack */ - mov %rax, ISF64_ERR(%rsp) /* err/rax - syscall code */ - movq $(T_SYSCALL), ISF64_TRAPNO(%rsp) /* trapno */ - leaq HNDL_SYSCALL(%rip), %r11; - movq %r11, ISF64_TRAPFN(%rsp) - jmp L_64bit_dispatch /* this can only be a 64-bit task */ - -/* - * sysenter entry point - * Requires user code to set up: - * edx: user instruction pointer (return address) - * ecx: user stack pointer - * on which is pushed stub ret addr and saved ebx - * Return to user-space is made using sysexit. 
- * Note: sysenter/sysexit cannot be used for calls returning a value in edx, - * or requiring ecx to be preserved. - */ -Entry(hi64_sysenter) -Entry(idt64_sysenter) - movq (%rsp), %rsp - /* - * Push values on to the PCB stack - * to cons up the saved machine state. - */ - push $(USER_DS) /* ss */ - push %rcx /* uesp */ - pushf /* flags */ - /* - * Clear, among others, the Nested Task (NT) flags bit; - * this is zeroed by INT, but not by SYSENTER. - */ - push $0 - popf - push $(SYSENTER_CS) /* cs */ -L_sysenter_continue: - swapgs /* switch to kernel gs (cpu_data) */ - push %rdx /* eip */ - push %rax /* err/eax - syscall code */ - PUSH_FUNCTION(HNDL_SYSENTER) - pushq $(T_SYSENTER) - orl $(EFL_IF), ISF64_RFLAGS(%rsp) - jmp L_32bit_entry_check - - -Entry(idt64_page_fault) - PUSH_FUNCTION(HNDL_ALLTRAPS) - push $(T_PAGE_FAULT) - push %rax /* save %rax temporarily */ - leaq EXT(idt64_unix_scall_copy_args)(%rip), %rax - cmp %rax, 8+ISF64_RIP(%rsp) /* fault during copy args? */ - je 1f /* - yes, handle copy arg fault */ - testb $3, 8+ISF64_CS(%rsp) /* was trap from kernel? */ - jz L_kernel_trap /* - yes, handle with care */ - pop %rax /* restore %rax, swapgs, and continue */ - swapgs - jmp L_dispatch_continue -1: - add $(8+ISF64_SIZE), %rsp /* remove entire intr stack frame */ - jmp L_copy_args_continue /* continue system call entry */ - - -/* - * Debug trap. Check for single-stepping across system call into - * kernel. If this is the case, taking the debug trap has turned - * off single-stepping - save the flags register with the trace - * bit set. - */ -Entry(idt64_debug) - push $0 /* error code */ - PUSH_FUNCTION(HNDL_ALLTRAPS) - pushq $(T_DEBUG) - - testb $3, ISF64_CS(%rsp) - jnz L_dispatch + leaq EXT(idt64_hndl_table0)(%rip), %rax + jmp *8(%rax) +Entry(ks_idt64_debug_kernel) /* * trap came from kernel mode */ @@ -661,7 +824,7 @@ Entry(idt64_debug) lea EXT(idt64_sysenter)(%rip), %rax cmp %rax, ISF64_RIP+8(%rsp) pop %rax - jne L_dispatch + jne EXT(ks_dispatch_kernel) /* * Interrupt stack frame has been pushed on the temporary stack. * We have to switch to pcb stack and patch up the saved state. @@ -676,59 +839,39 @@ Entry(idt64_debug) mov ISF64_ERR(%rcx),%rcx /* restore %rcx */ jmp L_sysenter_continue /* continue sysenter entry */ +Entry(ks_trap_check_kernel_exit) + testb $3,ISF64_CS(%rsp) + jz L_kernel_gpf -Entry(idt64_double_fault) - PUSH_FUNCTION(HNDL_DOUBLE_FAULT) - pushq $(T_DOUBLE_FAULT) - + /* Here for fault from user-space. Copy interrupt state to PCB. */ + swapgs push %rax - leaq EXT(idt64_syscall)(%rip), %rax - cmp %rax, ISF64_RIP+8(%rsp) + mov %rcx, %gs:CPU_UBER_TMP /* save user RCX */ + mov %gs:CPU_UBER_ISF, %rcx /* PCB stack addr */ + mov ISF64_SS+8(%rsp), %rax + mov %rax, ISF64_SS(%rcx) + mov ISF64_RSP+8(%rsp), %rax + mov %rax, ISF64_RSP(%rcx) + mov ISF64_RFLAGS+8(%rsp), %rax + mov %rax, ISF64_RFLAGS(%rcx) + mov ISF64_CS+8(%rsp), %rax + mov %rax, ISF64_CS(%rcx) + mov ISF64_RIP+8(%rsp), %rax + mov %rax, ISF64_RIP(%rcx) + mov ISF64_ERR+8(%rsp), %rax + mov %rax, ISF64_ERR(%rcx) + mov ISF64_TRAPFN+8(%rsp), %rax + mov %rax, ISF64_TRAPFN(%rcx) + mov ISF64_TRAPNO+8(%rsp), %rax + mov %rax, ISF64_TRAPNO(%rcx) pop %rax - jne L_64bit_dispatch - - mov ISF64_RSP(%rsp), %rsp - jmp L_syscall_continue - + mov %gs:CPU_UBER_TMP, %rsp /* user RCX into RSP */ + xchg %rcx, %rsp /* to PCB stack with user RCX */ + jmp EXT(ks_dispatch_user) -/* - * General protection or segment-not-present fault. 
- * Check for a GP/NP fault in the kernel_return - * sequence; if there, report it as a GP/NP fault on the user's instruction. - * - * rsp-> 0 ISF64_TRAPNO: trap code (NP or GP) - * 8 ISF64_TRAPFN: trap function - * 16 ISF64_ERR: segment number in error (error code) - * 24 ISF64_RIP: rip - * 32 ISF64_CS: cs - * 40 ISF64_RFLAGS: rflags - * 48 ISF64_RIP: rsp - * 56 ISF64_SS: ss - * 64: old registers (trap is from kernel) - */ -Entry(idt64_gen_prot) - PUSH_FUNCTION(HNDL_ALLTRAPS) - pushq $(T_GENERAL_PROTECTION) - jmp trap_check_kernel_exit /* check for kernel exit sequence */ - -Entry(idt64_stack_fault) - PUSH_FUNCTION(HNDL_ALLTRAPS) - pushq $(T_STACK_FAULT) - jmp trap_check_kernel_exit /* check for kernel exit sequence */ - -Entry(idt64_segnp) - PUSH_FUNCTION(HNDL_ALLTRAPS) - pushq $(T_SEGMENT_NOT_PRESENT) - /* indicate fault type */ -trap_check_kernel_exit: - testb $3,ISF64_CS(%rsp) - jnz L_dispatch - /* - * trap was from kernel mode, - * so check for the kernel exit sequence - */ +L_kernel_gpf: + /* Here for GPF from kernel_space. Check for recoverable cases. */ push %rax - leaq EXT(ret32_iret)(%rip), %rax cmp %rax, 8+ISF64_RIP(%rsp) je L_fault_iret @@ -747,34 +890,35 @@ trap_check_kernel_exit: leaq EXT(ret32_set_gs)(%rip), %rax cmp %rax, 8+ISF64_RIP(%rsp) je L_32bit_fault_set_seg + jmp EXT(ks_kernel_trap) + /* Fall through */ - leaq EXT(idt64_unix_scall_copy_args)(%rip), %rax - cmp %rax, 8+ISF64_RIP(%rsp) - cmove 8+ISF64_RSP(%rsp), %rsp - je L_copy_args_continue - - /* fall through */ - -L_kernel_trap: +Entry(ks_kernel_trap) /* * Here after taking an unexpected trap from kernel mode - perhaps * while running in the trampolines hereabouts. * Note: %rax has been pushed on stack. * Make sure we're not on the PCB stack, if so move to the kernel stack. * This is likely a fatal condition. - * But first, try to ensure we have the kernel gs base active... + * But first, ensure we have the kernel gs base active... */ - movq %gs:CPU_THIS, %rax /* get gs_base into %rax */ - test %rax, %rax /* test sign bit (MSB) */ - js 1f /* -ve kernel addr, no swap */ - swapgs /* +ve user addr, swap */ + push %rcx + push %rdx + mov $(MSR_IA32_GS_BASE), %ecx + rdmsr /* read kernel gsbase */ + test $0x80000000, %edx /* test MSB of address */ + jne 1f + swapgs /* so swap */ 1: + pop %rdx + pop %rcx + movq %gs:CPU_UBER_ISF, %rax /* PCB stack addr */ subq %rsp, %rax cmpq $(PAGE_SIZE), %rax /* current stack in PCB? */ jb 2f /* - yes, deal with it */ pop %rax /* - no, restore %rax */ - jmp L_64bit_dispatch + jmp EXT(ks_dispatch_kernel) 2: /* * Here if %rsp is in the PCB @@ -791,49 +935,74 @@ L_kernel_trap: pushq 8+ISF64_TRAPFN(%rax) pushq 8+ISF64_TRAPNO(%rax) movq (%rax), %rax - jmp L_64bit_dispatch + jmp EXT(ks_dispatch_kernel) + /* * GP/NP fault on IRET: CS or SS is in error. - * Note that the user ss is originally 16-byte aligned, we'd popped the - * stack back to contain just the rip/cs/rflags/rsp/ss before issuing the iret. - * On taking the GP/NP fault on the iret instruction, the stack is 16-byte - * aligned before pushed the interrupt frame. Hence, an 8-byte padding exists. + * User GSBASE is active. 
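+ * (The IRET back to user mode faulted, so hardware pushed this frame on
+ * IST1; the swapgs is left to L_dispatch, which sees the user CS in the
+ * synthesized frame.)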
+ * On IST1 stack containing: + * (rax saved above, which is immediately popped) + * 0 ISF64_TRAPNO: trap code (NP or GP) + * 8 ISF64_TRAPFN: trap function + * 16 ISF64_ERR: segment number in error (error code) + * 24 ISF64_RIP: kernel RIP + * 32 ISF64_CS: kernel CS + * 40 ISF64_RFLAGS: kernel RFLAGS + * 48 ISF64_RSP: kernel RSP + * 56 ISF64_SS: kernel SS + * On the PCB stack, pointed to by the kernel's RSP is: + * 0 user RIP + * 8 user CS + * 16 user RFLAGS + * 24 user RSP + * 32 user SS * - * on SP is - * (- rax saved above, which is immediately popped) + * We need to move the kernel's TRAPNO, TRAPFN and ERR to the PCB and handle + * as a user fault with: * 0 ISF64_TRAPNO: trap code (NP or GP) * 8 ISF64_TRAPFN: trap function * 16 ISF64_ERR: segment number in error (error code) - * 24 ISF64_RIP: rip - * 32 ISF64_CS: cs - * 40 ISF64_RFLAGS: rflags - * 48 ISF64_RSP: rsp --> new trapno - * 56 ISF64_SS: ss --> new trapfn - * 64 pad --> new errcode - * 72 user rip - * 80 user cs - * 88 user rflags - * 96 user rsp - * 104 user ss (16-byte aligned) + * 24 user RIP + * 32 user CS + * 40 user RFLAGS + * 48 user RSP + * 56 user SS */ L_fault_iret: pop %rax /* recover saved %rax */ mov %rax, ISF64_RIP(%rsp) /* save rax (we don`t need saved rip) */ - mov ISF64_TRAPNO(%rsp), %rax - mov %rax, ISF64_TRAPNO(%rsp)/* put in user trap number */ - mov ISF64_TRAPFN(%rsp), %rax - mov %rax, ISF64_SS(%rsp) /* put in user trap function */ - mov ISF64_ERR(%rsp), %rax /* get error code */ - mov %rax, 8+ISF64_SS(%rsp) /* put in user errcode */ - mov ISF64_RIP(%rsp), %rax /* restore rax */ - add $(ISF64_RSP),%rsp /* reset to new trapfn */ + mov ISF64_RSP(%rsp), %rax + xchg %rax, %rsp /* switch to PCB stack */ + push ISF64_ERR(%rax) + push ISF64_TRAPFN(%rax) + push ISF64_TRAPNO(%rax) + mov ISF64_RIP(%rax), %rax /* restore rax */ /* now treat as fault from user */ jmp L_dispatch /* * Fault restoring a segment register. All of the saved state is still * on the stack untouched since we haven't yet moved the stack pointer. 
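+ * The faulting instruction is one of the ret32_set_{ds,es,fs,gs} selector
+ * loads on the 32-bit return path, so the R32 save area addressed by the
+ * saved RSP is still fully populated.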
+ * On IST1 stack containing: + * (rax saved above, which is immediately popped) + * 0 ISF64_TRAPNO: trap code (NP or GP) + * 8 ISF64_TRAPFN: trap function + * 16 ISF64_ERR: segment number in error (error code) + * 24 ISF64_RIP: kernel RIP + * 32 ISF64_CS: kernel CS + * 40 ISF64_RFLAGS: kernel RFLAGS + * 48 ISF64_RSP: kernel RSP + * 56 ISF64_SS: kernel SS + * On the PCB stack, pointed to by the kernel's RSP is: + * 0 user trap code + * 8 user trap function + * 16 user err + * 24 user RIP + * 32 user CS + * 40 user RFLAGS + * 48 user RSP + * 56 user SS */ L_32bit_fault_set_seg: swapgs @@ -842,42 +1011,91 @@ L_32bit_fault_set_seg: mov ISF64_TRAPFN(%rsp), %rcx mov ISF64_ERR(%rsp), %rdx mov ISF64_RSP(%rsp), %rsp /* reset stack to saved state */ - mov %rax,ISC32_TRAPNO(%rsp) - mov %rcx,ISC32_TRAPFN(%rsp) - mov %rdx,ISC32_ERR(%rsp) + mov %rax,R64_TRAPNO(%rsp) + mov %rcx,R64_TRAPFN(%rsp) + mov %rdx,R64_ERR(%rsp) /* now treat as fault from user */ /* except that all the state is */ /* already saved - we just have to */ /* move the trapno and error into */ /* the compatibility frame */ - jmp L_32bit_dispatch_after_fault + jmp L_dispatch_U32_after_fault -/* - * Fatal exception handlers: - */ -Entry(idt64_db_task_dbl_fault) - PUSH_FUNCTION(HNDL_DOUBLE_FAULT) - pushq $(T_DOUBLE_FAULT) - jmp L_dispatch +Entry(ks_idt64_nmi_kernel) + /* From user-space: copy interrupt state to user PCB */ + swapgs + mov %gs:CPU_UBER_ISF, %rcx /* PCB stack addr */ + add $(ISF64_SIZE), %rcx /* adjust to base of ISF */ + swapgs /* swap back for L_dispatch */ + jmp 4f /* Copy state to PCB */ -Entry(idt64_db_task_stk_fault) - PUSH_FUNCTION(HNDL_DOUBLE_FAULT) - pushq $(T_STACK_FAULT) - jmp L_dispatch +1: + /* + * From kernel-space: + * Determine whether the kernel or user GS is set. + * Set the kernel and ensure that we'll swap back correctly at IRET. + */ + mov $(MSR_IA32_GS_BASE), %ecx + rdmsr /* read kernel gsbase */ + test $0x80000000, %edx /* test MSB of address */ + jne 2f + swapgs /* so swap */ + movl $1, ISF64_CS+4(%rsp) /* and set flag in CS slot */ +2: + /* + * Determine whether we're on the kernel or interrupt stack + * when the NMI hit. + */ + mov ISF64_RSP(%rsp), %rcx + mov %gs:CPU_KERNEL_STACK, %rax + xor %rcx, %rax + and EXT(kernel_stack_mask)(%rip), %rax + test %rax, %rax /* are we on the kernel stack? */ + je 3f /* yes */ + + mov %gs:CPU_INT_STACK_TOP, %rax + dec %rax /* intr stack top is byte above max */ + xor %rcx, %rax + and EXT(kernel_stack_mask)(%rip), %rax + test %rax, %rax /* are we on the interrupt stack? */ + je 3f /* yes */ + + mov %gs:CPU_KERNEL_STACK, %rcx +3: + /* 16-byte-align kernel/interrupt stack for state push */ + and $0xFFFFFFFFFFFFFFF0, %rcx -Entry(idt64_mc) - push $(0) /* Error */ - PUSH_FUNCTION(HNDL_MACHINE_CHECK) - pushq $(T_MACHINE_CHECK) - jmp L_dispatch +4: + /* + * Copy state from NMI stack (RSP) to the save area (RCX) which is + * the PCB for user or kernel/interrupt stack from kernel. 
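+ * RSP and RCX are exchanged below so that the pushes rebuild the frame on
+ * the chosen save area while the original NMI frame stays addressable
+ * through RCX.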
+ * ISF64_ERR(RSP) saved RAX + * ISF64_TRAPFN(RSP) saved RCX + * ISF64_TRAPNO(RSP) saved RDX + */ + xchg %rsp, %rcx /* set for pushes */ + push ISF64_SS(%rcx) + push ISF64_RSP(%rcx) + push ISF64_RFLAGS(%rcx) + push ISF64_CS(%rcx) + push ISF64_RIP(%rcx) + push $(0) /* error code 0 */ + push $(HNDL_ALLINTRS) /* trapfn allintrs */ + push $(T_NMI) /* trapno T_NMI */ + mov ISF64_ERR(%rcx), %rax + mov ISF64_TRAPNO(%rcx), %rdx + mov ISF64_TRAPFN(%rcx), %rcx + jmp L_dispatch -/* All 'exceptions' enter hndl_alltraps: - * rsp -> x86_saved_state_t - * esi cs at trap +/* All 'exceptions' enter hndl_alltraps, with: + * r15 x86_saved_state_t address + * rsp kernel stack if user-space, otherwise interrupt or kernel stack + * esi cs at trap * * The rest of the state is set up as: + * both rsp and r15 are 16-byte aligned * interrupts disabled * direction flag cleared */ @@ -890,61 +1108,58 @@ Entry(hndl_alltraps) /* Check for active vtimers in the current task */ mov %gs:CPU_ACTIVE_THREAD, %rcx + movl $-1, TH_IOTIER_OVERRIDE(%rcx) /* Reset IO tier override to -1 before handling trap/exception */ mov TH_TASK(%rcx), %rbx TASK_VTIMER_CHECK(%rbx, %rcx) - movq %rsp, %rdi /* also pass it as arg0 */ - movq %gs:CPU_KERNEL_STACK,%rsp /* switch to kernel stack */ - - CCALL(user_trap) /* call user trap routine */ + CCALL1(user_trap, %r15) /* call user trap routine */ /* user_trap() unmasks interrupts */ cli /* hold off intrs - critical section */ xorl %ecx, %ecx /* don't check if we're in the PFZ */ -#define CLI cli -#define STI sti Entry(return_from_trap) - movq %gs:CPU_ACTIVE_THREAD,%rsp - movq TH_PCB_ISS(%rsp), %rsp /* switch back to PCB stack */ + movq %gs:CPU_ACTIVE_THREAD,%r15 /* Get current thread */ + movl $-1, TH_IOTIER_OVERRIDE(%r15) /* Reset IO tier override to -1 before returning to userspace */ + cmpl $0, TH_RWLOCK_COUNT(%r15) /* Check if current thread has pending RW locks held */ + jz 1f + xorq %rbp, %rbp /* clear framepointer */ + mov %r15, %rdi /* Set RDI to current thread */ + CCALL(lck_rw_clear_promotions_x86) /* Clear promotions if needed */ +1: + movq TH_PCB_ISS(%r15), %r15 /* PCB stack */ movl %gs:CPU_PENDING_AST,%eax testl %eax,%eax - je EXT(return_to_user) /* branch if no AST */ + je EXT(return_to_user) /* branch if no AST */ L_return_from_trap_with_ast: - movq %rsp, %r13 - movq %gs:CPU_KERNEL_STACK, %rsp - testl %ecx, %ecx /* see if we need to check for an EIP in the PFZ */ je 2f /* no, go handle the AST */ - cmpl $(SS_64), SS_FLAVOR(%r13) /* are we a 64-bit task? */ + cmpl $(SS_64), SS_FLAVOR(%r15) /* are we a 64-bit task? */ je 1f /* no... 32-bit user mode */ - movl R32_EIP(%r13), %edi + movl R32_EIP(%r15), %edi xorq %rbp, %rbp /* clear framepointer */ CCALL(commpage_is_in_pfz32) testl %eax, %eax je 2f /* not in the PFZ... go service AST */ - movl %eax, R32_EBX(%r13) /* let the PFZ know we've pended an AST */ - movq %r13, %rsp /* switch back to PCB stack */ + movl %eax, R32_EBX(%r15) /* let the PFZ know we've pended an AST */ jmp EXT(return_to_user) 1: - movq R64_RIP(%r13), %rdi + movq R64_RIP(%r15), %rdi xorq %rbp, %rbp /* clear framepointer */ CCALL(commpage_is_in_pfz64) testl %eax, %eax je 2f /* not in the PFZ... 
go service AST */ - movl %eax, R64_RBX(%r13) /* let the PFZ know we've pended an AST */ - movq %r13, %rsp /* switch back to PCB stack */ + movl %eax, R64_RBX(%r15) /* let the PFZ know we've pended an AST */ jmp EXT(return_to_user) -2: - STI /* interrupts always enabled on return to user mode */ +2: - xor %edi, %edi /* zero %rdi */ xorq %rbp, %rbp /* clear framepointer */ - CCALL(i386_astintr) /* take the AST */ + CCALL(ast_taken_user) /* handle all ASTs (enables interrupts, may return via continuation) */ - CLI + cli + mov %rsp, %r15 /* AST changes stack, saved state */ xorl %ecx, %ecx /* don't check if we're in the PFZ */ jmp EXT(return_from_trap) /* and check again (rare) */ @@ -952,25 +1167,25 @@ L_return_from_trap_with_ast: * Trap from kernel mode. No need to switch stacks. * Interrupts must be off here - we will set them to state at time of trap * as soon as it's safe for us to do so and not recurse doing preemption + * */ -hndl_kerntrap: trap_from_kernel: - - movq %rsp, %rdi /* saved state addr */ - pushq R64_RIP(%rsp) /* Simulate a CALL from fault point */ + movq %r15, %rdi /* saved state addr */ + pushq R64_RIP(%r15) /* Simulate a CALL from fault point */ pushq %rbp /* Extend framepointer chain */ movq %rsp, %rbp CCALLWITHSP(kernel_trap) /* to kernel trap routine */ popq %rbp addq $8, %rsp + mov %rsp, %r15 /* DTrace slides stack/saved-state */ cli movl %gs:CPU_PENDING_AST,%eax /* get pending asts */ testl $(AST_URGENT),%eax /* any urgent preemption? */ je ret_to_kernel /* no, nothing to do */ - cmpl $(T_PREEMPT),R64_TRAPNO(%rsp) + cmpl $(T_PREEMPT),R64_TRAPNO(%r15) je ret_to_kernel /* T_PREEMPT handled in kernel_trap() */ - testl $(EFL_IF),R64_RFLAGS(%rsp) /* interrupts disabled? */ + testl $(EFL_IF),R64_RFLAGS(%r15) /* interrupts disabled? */ je ret_to_kernel cmpl $0,%gs:CPU_PREEMPTION_LEVEL /* preemption disabled? */ jne ret_to_kernel @@ -981,15 +1196,19 @@ trap_from_kernel: testq %rcx,%rcx /* are we on the kernel stack? 
*/ jne ret_to_kernel /* no, skip it */ - CCALL1(i386_astintr, $1) /* take the AST */ + CCALL(ast_taken_kernel) /* take the AST */ + + mov %rsp, %r15 /* AST changes stack, saved state */ jmp ret_to_kernel /* * All interrupts on all tasks enter here with: - * rsp-> x86_saved_state_t + * r15 x86_saved_state_t + * rsp kernel or interrupt stack * esi cs at trap * + * both rsp and r15 are 16-byte aligned * interrupts disabled * direction flag cleared */ @@ -1010,9 +1229,9 @@ Entry(hndl_allintrs) orl $(CR0_TS),%eax /* or in TS bit */ mov %rax,%cr0 /* set cr0 */ - subq $8, %rsp /* for 16-byte stack alignment */ pushq %rcx /* save pointer to old stack */ - movq %rcx,%gs:CPU_INT_STATE /* save intr state */ + pushq %gs:CPU_INT_STATE /* save previous intr state */ + movq %r15,%gs:CPU_INT_STATE /* set intr state */ TIME_INT_ENTRY /* do timing */ @@ -1024,15 +1243,9 @@ Entry(hndl_allintrs) incl %gs:CPU_PREEMPTION_LEVEL incl %gs:CPU_INTERRUPT_LEVEL - movq %gs:CPU_INT_STATE, %rdi - - CCALL(interrupt) /* call generic interrupt routine */ - - cli /* just in case we returned with intrs enabled */ - xor %rax,%rax - movq %rax,%gs:CPU_INT_STATE /* clear intr state pointer */ + CCALL1(interrupt, %r15) /* call generic interrupt routine */ - .globl EXT(return_to_iret) +.globl EXT(return_to_iret) LEXT(return_to_iret) /* (label for kdb_kintr and hardclock) */ decl %gs:CPU_INTERRUPT_LEVEL @@ -1040,6 +1253,9 @@ LEXT(return_to_iret) /* (label for kdb_kintr and hardclock) */ TIME_INT_EXIT /* do timing */ + popq %gs:CPU_INT_STATE /* reset/clear intr state pointer */ + popq %rsp /* switch back to old stack */ + movq %gs:CPU_ACTIVE_THREAD,%rax movq TH_PCB_FPS(%rax),%rax /* get pcb's ifps */ cmpq $0,%rax /* Is there a context */ @@ -1054,24 +1270,22 @@ LEXT(return_to_iret) /* (label for kdb_kintr and hardclock) */ orl $(CR0_TS),%eax /* or in TS bit */ mov %rax,%cr0 /* set cr0 */ 2: - popq %rsp /* switch back to old stack */ - /* Load interrupted code segment into %eax */ - movl R32_CS(%rsp),%eax /* assume 32-bit state */ - cmpl $(SS_64),SS_FLAVOR(%rsp)/* 64-bit? */ + movl R32_CS(%r15),%eax /* assume 32-bit state */ + cmpl $(SS_64),SS_FLAVOR(%r15)/* 64-bit? */ #if DEBUG_IDT64 jne 4f - movl R64_CS(%rsp),%eax /* 64-bit user mode */ + movl R64_CS(%r15),%eax /* 64-bit user mode */ jmp 3f 4: - cmpl $(SS_32),SS_FLAVOR(%rsp) + cmpl $(SS_32),SS_FLAVOR(%r15) je 3f POSTCODE2(0x6431) - CCALL1(panic_idt64, %rsp) + CCALL1(panic_idt64, %r15) hlt #else jne 3f - movl R64_CS(%rsp),%eax /* 64-bit user mode */ + movl R64_CS(%r15),%eax /* 64-bit user mode */ #endif 3: testb $3,%al /* user mode, */ @@ -1088,20 +1302,14 @@ LEXT(return_to_iret) /* (label for kdb_kintr and hardclock) */ cmpl $0,%gs:CPU_PREEMPTION_LEVEL /* preemption disabled? */ jne ret_to_kernel /* yes, skip it */ - movq %gs:CPU_KERNEL_STACK,%rax - movq %rsp,%rcx - xorq %rax,%rcx - andq EXT(kernel_stack_mask)(%rip),%rcx - testq %rcx,%rcx /* are we on the kernel stack? */ - jne ret_to_kernel /* no, skip it */ - /* * Take an AST from kernel space. We don't need (and don't want) * to do as much as the case where the interrupt came from user * space. 
*/ - CCALL1(i386_astintr, $1) + CCALL(ast_taken_kernel) + mov %rsp, %r15 /* AST changes stack, saved state */ jmp ret_to_kernel @@ -1112,17 +1320,18 @@ int_from_intstack: incl %gs:CPU_PREEMPTION_LEVEL incl %gs:CPU_INTERRUPT_LEVEL incl %gs:CPU_NESTED_ISTACK - mov %rsp, %rdi /* x86_saved_state */ - CCALL(interrupt) + + push %gs:CPU_INT_STATE + mov %r15, %gs:CPU_INT_STATE + + CCALL1(interrupt, %r15) + + pop %gs:CPU_INT_STATE decl %gs:CPU_INTERRUPT_LEVEL decl %gs:CPU_PREEMPTION_LEVEL decl %gs:CPU_NESTED_ISTACK -#if DEBUG_IDT64 - CCALL1(panic_idt64, %rsp) - POSTCODE2(0x6411) - hlt -#endif + jmp ret_to_kernel /* @@ -1146,7 +1355,10 @@ ast_from_interrupt_user: * 32bit Tasks * System call entries via INTR_GATE or sysenter: * - * rsp -> x86_saved_state32_t + * r15 x86_saved_state32_t + * rsp kernel stack + * + * both rsp and r15 are 16-byte aligned * interrupts disabled * direction flag cleared */ @@ -1156,38 +1368,15 @@ Entry(hndl_sysenter) * We can be here either for a mach syscall or a unix syscall, * as indicated by the sign of the code: */ - movl R32_EAX(%rsp),%eax + movl R32_EAX(%r15),%eax testl %eax,%eax js EXT(hndl_mach_scall) /* < 0 => mach */ /* > 0 => unix */ Entry(hndl_unix_scall) -/* If the caller (typically LibSystem) has recorded the cumulative size of - * the arguments in EAX, copy them over from the user stack directly. - * We recover from exceptions inline--if the copy loop doesn't complete - * due to an exception, we fall back to copyin from compatibility mode. - * We can potentially extend this mechanism to mach traps as well (DRK). - */ - testl $(I386_SYSCALL_ARG_BYTES_MASK), %eax - jz L_copy_args_continue - movl %eax, %ecx - mov %gs:CPU_UBER_ARG_STORE_VALID, %rbx - shrl $(I386_SYSCALL_ARG_DWORDS_SHIFT), %ecx - andl $(I386_SYSCALL_ARG_DWORDS_MASK), %ecx - mov %gs:CPU_UBER_ARG_STORE, %rdi - mov ISC32_RSP(%rsp), %rsi - add $4, %rsi - movl $0, (%rbx) - -EXT(idt64_unix_scall_copy_args): - rep movsl - movl $1, (%rbx) -L_copy_args_continue: TIME_TRAP_UENTRY - movq %gs:CPU_KERNEL_STACK,%rdi - xchgq %rdi,%rsp /* switch to kernel stack */ movq %gs:CPU_ACTIVE_THREAD,%rcx /* get current thread */ movq TH_TASK(%rcx),%rbx /* point to current task */ incl TH_SYSCALLS_UNIX(%rcx) /* increment call count */ @@ -1197,7 +1386,7 @@ L_copy_args_continue: sti - CCALL(unix_syscall) + CCALL1(unix_syscall, %r15) /* * always returns through thread_exception_return */ @@ -1206,8 +1395,6 @@ L_copy_args_continue: Entry(hndl_mach_scall) TIME_TRAP_UENTRY - movq %gs:CPU_KERNEL_STACK,%rdi - xchgq %rdi,%rsp /* switch to kernel stack */ movq %gs:CPU_ACTIVE_THREAD,%rcx /* get current thread */ movq TH_TASK(%rcx),%rbx /* point to current task */ incl TH_SYSCALLS_MACH(%rcx) /* increment call count */ @@ -1217,7 +1404,7 @@ Entry(hndl_mach_scall) sti - CCALL(mach_call_munger) + CCALL1(mach_call_munger, %r15) /* * always returns through thread_exception_return */ @@ -1226,9 +1413,6 @@ Entry(hndl_mach_scall) Entry(hndl_mdep_scall) TIME_TRAP_UENTRY - movq %gs:CPU_KERNEL_STACK,%rdi - xchgq %rdi,%rsp /* switch to kernel stack */ - /* Check for active vtimers in the current task */ movq %gs:CPU_ACTIVE_THREAD,%rcx /* get current thread */ movq TH_TASK(%rcx),%rbx /* point to current task */ @@ -1236,44 +1420,19 @@ Entry(hndl_mdep_scall) sti - CCALL(machdep_syscall) + CCALL1(machdep_syscall, %r15) /* * always returns through thread_exception_return */ - -Entry(hndl_diag_scall) - TIME_TRAP_UENTRY - - movq %gs:CPU_KERNEL_STACK,%rdi - xchgq %rdi,%rsp /* switch to kernel stack */ - - /* Check for active vtimers in the current 
task */ - movq %gs:CPU_ACTIVE_THREAD,%rcx /* get current thread */ - movq TH_TASK(%rcx),%rbx /* point to current task */ - TASK_VTIMER_CHECK(%rbx,%rcx) - - pushq %rdi /* push pcb stack */ - - CCALL(diagCall) // Call diagnostics - - cli // Disable interruptions just in case - cmpl $0,%eax // What kind of return is this? - je 1f // - branch if bad (zero) - popq %rsp // Get back the pcb stack - jmp EXT(return_to_user) // Normal return, do not check asts... -1: - CCALL3(i386_exception, $EXC_SYSCALL, $0x6000, $1) - // pass what would be the diag syscall - // error return - cause an exception - /* no return */ - - /* * 64bit Tasks * System call entries via syscall only: * - * rsp -> x86_saved_state64_t + * r15 x86_saved_state64_t + * rsp kernel stack + * + * both rsp and r15 are 16-byte aligned * interrupts disabled * direction flag cleared */ @@ -1281,9 +1440,8 @@ Entry(hndl_diag_scall) Entry(hndl_syscall) TIME_TRAP_UENTRY - movq %gs:CPU_KERNEL_STACK,%rdi - xchgq %rdi,%rsp /* switch to kernel stack */ movq %gs:CPU_ACTIVE_THREAD,%rcx /* get current thread */ + movl $-1, TH_IOTIER_OVERRIDE(%rcx) /* Reset IO tier override to -1 before handling syscall */ movq TH_TASK(%rcx),%rbx /* point to current task */ /* Check for active vtimers in the current task */ @@ -1293,7 +1451,7 @@ Entry(hndl_syscall) * We can be here either for a mach, unix machdep or diag syscall, * as indicated by the syscall class: */ - movl R64_RAX(%rdi), %eax /* syscall number/class */ + movl R64_RAX(%r15), %eax /* syscall number/class */ movl %eax, %edx andl $(SYSCALL_CLASS_MASK), %edx /* syscall class */ cmpl $(SYSCALL_CLASS_MACH<