X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/fe8ab488e9161c46dd9885d58fc52996dc0249ff..refs/heads/master:/osfmk/i386/mp_desc.c diff --git a/osfmk/i386/mp_desc.c b/osfmk/i386/mp_desc.c index 4b4306ad8..f56db9d51 100644 --- a/osfmk/i386/mp_desc.c +++ b/osfmk/i386/mp_desc.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2012 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,34 +22,34 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* * @OSF_COPYRIGHT@ */ -/* +/* * Mach Operating System * Copyright (c) 1991,1990 Carnegie Mellon University * All Rights Reserved. - * + * * Permission to use, copy, modify and distribute this software and its * documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. - * + * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * + * * Carnegie Mellon requests users of this software to return to - * + * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 - * + * * any improvements or extensions that they make and grant Carnegie Mellon * the rights to redistribute these changes. */ @@ -58,8 +58,8 @@ */ #include -#include #include +#include #include #include #include @@ -72,27 +72,31 @@ #include #include #include -#if defined(__i386__) || defined(__x86_64__) +#include #include -#endif /* i386 */ #if CONFIG_MCA #include #endif #include +#if MONOTONIC +#include +#endif /* MONOTONIC */ +#include + #define K_INTR_GATE (ACC_P|ACC_PL_K|ACC_INTR_GATE) #define U_INTR_GATE (ACC_P|ACC_PL_U|ACC_INTR_GATE) // Declare macros that will declare the externs -#define TRAP(n, name) extern void *name ; -#define TRAP_ERR(n, name) extern void *name ; -#define TRAP_SPC(n, name) extern void *name ; -#define TRAP_IST1(n, name) extern void *name ; -#define TRAP_IST2(n, name) extern void *name ; -#define INTERRUPT(n) extern void *_intr_ ## n ; -#define USER_TRAP(n, name) extern void *name ; -#define USER_TRAP_SPC(n, name) extern void *name ; +#define TRAP(n, name) extern void *name ; +#define TRAP_ERR(n, name) extern void *name ; +#define TRAP_SPC(n, name) extern void *name ; +#define TRAP_IST1(n, name) extern void *name ; +#define TRAP_IST2(n, name) extern void *name ; +#define INTERRUPT(n) extern void *_intr_ ## n ; +#define USER_TRAP(n, name) extern void *name ; +#define USER_TRAP_SPC(n, name) extern void *name ; // Include the table to declare the externs #include "../x86_64/idt_table.h" @@ -107,88 +111,102 @@ #undef USER_TRAP #undef USER_TRAP_SPC -#define TRAP(n, name) \ - [n] = { \ - (uintptr_t)&name, \ - KERNEL64_CS, \ - 0, \ - K_INTR_GATE, \ - 0 \ +#define TRAP(n, name) \ + [n] = { \ + (uintptr_t)&name, \ + KERNEL64_CS, \ + 0, \ + K_INTR_GATE, \ + 0 \ }, #define TRAP_ERR TRAP #define TRAP_SPC TRAP #define TRAP_IST1(n, name) \ - [n] = { \ - (uintptr_t)&name, \ - KERNEL64_CS, \ - 1, \ - K_INTR_GATE, \ - 0 \ + [n] = { \ + (uintptr_t)&name, \ + KERNEL64_CS, \ + 1, \ + K_INTR_GATE, \ + 0 \ }, #define TRAP_IST2(n, name) \ - [n] = { \ - (uintptr_t)&name, \ - KERNEL64_CS, \ - 2, \ - K_INTR_GATE, \ - 0 \ + [n] = { \ + (uintptr_t)&name, \ + KERNEL64_CS, \ + 2, \ + K_INTR_GATE, \ + 0 \ }, #define INTERRUPT(n) \ - [n] = { \ - (uintptr_t)&_intr_ ## n,\ - KERNEL64_CS, \ - 0, \ - K_INTR_GATE, \ - 0 \ + [n] = { \ + (uintptr_t)&_intr_ ## n,\ + KERNEL64_CS, \ + 0, \ + K_INTR_GATE, \ + 0 \ }, #define USER_TRAP(n, name) \ - [n] = { \ - (uintptr_t)&name, \ - KERNEL64_CS, \ - 0, \ - U_INTR_GATE, \ - 0 \ + [n] = { \ + (uintptr_t)&name, \ + KERNEL64_CS, \ + 0, \ + U_INTR_GATE, \ + 0 \ }, #define USER_TRAP_SPC USER_TRAP // Declare the table using the macros we just set up struct fake_descriptor64 master_idt64[IDTSZ] - __attribute__ ((section("__HIB,__desc"))) - __attribute__ ((aligned(PAGE_SIZE))) = { +__attribute__ ((section("__HIB,__desc"))) +__attribute__ ((aligned(PAGE_SIZE))) = { #include "../x86_64/idt_table.h" }; /* * First cpu`s interrupt stack. */ -extern uint32_t low_intstack[]; /* bottom */ -extern uint32_t low_eintstack[]; /* top */ +extern uint32_t low_intstack[]; /* bottom */ +extern uint32_t low_eintstack[]; /* top */ /* * Per-cpu data area pointers. - * The master cpu (cpu 0) has its data area statically allocated; - * others are allocated dynamically and this array is updated at runtime. */ -static cpu_data_t cpu_data_master = { - .cpu_this = &cpu_data_master, - .cpu_nanotime = &pal_rtc_nanotime_info, - .cpu_int_stack_top = (vm_offset_t) low_eintstack, +cpu_data_t cpshadows[MAX_CPUS] __attribute__((aligned(64))) __attribute__((section("__HIB, __desc"))); +cpu_data_t scdatas[MAX_CPUS] __attribute__((aligned(64))) = { + [0].cpu_this = &scdatas[0], + [0].cpu_nanotime = &pal_rtc_nanotime_info, + [0].cpu_int_stack_top = (vm_offset_t) low_eintstack, + [0].cd_shadow = &cpshadows[0] }; -cpu_data_t *cpu_data_ptr[MAX_CPUS] = { [0] = &cpu_data_master }; +cpu_data_t *cpu_data_master = &scdatas[0]; + +cpu_data_t *cpu_data_ptr[MAX_CPUS] = {[0] = &scdatas[0] }; -decl_simple_lock_data(,ncpus_lock); /* protects real_ncpus */ -unsigned int real_ncpus = 1; -unsigned int max_ncpus = MAX_CPUS; +SECURITY_READ_ONLY_LATE(struct percpu_base) percpu_base; + +decl_simple_lock_data(, ncpus_lock); /* protects real_ncpus */ +unsigned int real_ncpus = 1; +unsigned int max_ncpus = MAX_CPUS; +unsigned int max_cpus_from_firmware = 0; extern void hi64_sysenter(void); extern void hi64_syscall(void); +typedef struct { + struct real_descriptor pcldts[LDTSZ]; +} cldt_t; + +cpu_desc_table64_t scdtables[MAX_CPUS] __attribute__((aligned(64))) __attribute__((section("__HIB, __desc"))); +cpu_fault_stack_t scfstks[MAX_CPUS] __attribute__((aligned(64))) __attribute__((section("__HIB, __desc"))); + +cldt_t *dyn_ldts; + /* * Multiprocessor i386/i486 systems use a separate copy of the * GDT, IDT, LDT, and kernel TSS per processor. The first three @@ -202,43 +220,6 @@ extern void hi64_syscall(void); * Allocate and initialize the per-processor descriptor tables. */ -struct fake_descriptor ldt_desc_pattern = { - (unsigned int) 0, - LDTSZ_MIN * sizeof(struct fake_descriptor) - 1, - 0, - ACC_P|ACC_PL_K|ACC_LDT -}; - -struct fake_descriptor tss_desc_pattern = { - (unsigned int) 0, - sizeof(struct i386_tss) - 1, - 0, - ACC_P|ACC_PL_K|ACC_TSS -}; - -struct fake_descriptor cpudata_desc_pattern = { - (unsigned int) 0, - sizeof(cpu_data_t)-1, - SZ_32, - ACC_P|ACC_PL_K|ACC_DATA_W -}; - -#if NCOPY_WINDOWS > 0 -struct fake_descriptor userwindow_desc_pattern = { - (unsigned int) 0, - ((NBPDE * NCOPY_WINDOWS) / PAGE_SIZE) - 1, - SZ_32 | SZ_G, - ACC_P|ACC_PL_U|ACC_DATA_W -}; -#endif - -struct fake_descriptor physwindow_desc_pattern = { - (unsigned int) 0, - PAGE_SIZE - 1, - SZ_32, - ACC_P|ACC_PL_K|ACC_DATA_W -}; - /* * This is the expanded, 64-bit variant of the kernel LDT descriptor. * When switching to 64-bit mode this replaces KERNEL_LDT entry @@ -246,11 +227,11 @@ struct fake_descriptor physwindow_desc_pattern = { * in the uber-space remapping window on the kernel. */ struct fake_descriptor64 kernel_ldt_desc64 = { - 0, - LDTSZ_MIN*sizeof(struct fake_descriptor)-1, - 0, - ACC_P|ACC_PL_K|ACC_LDT, - 0 + .offset64 = 0, + .lim_or_seg = LDTSZ_MIN * sizeof(struct fake_descriptor) - 1, + .size_or_IST = 0, + .access = ACC_P | ACC_PL_K | ACC_LDT, + .reserved = 0 }; /* @@ -258,11 +239,11 @@ struct fake_descriptor64 kernel_ldt_desc64 = { * It is follows pattern of the KERNEL_LDT. */ struct fake_descriptor64 kernel_tss_desc64 = { - 0, - sizeof(struct x86_64_tss)-1, - 0, - ACC_P|ACC_PL_K|ACC_TSS, - 0 + .offset64 = 0, + .lim_or_seg = sizeof(struct x86_64_tss) - 1, + .size_or_IST = 0, + .access = ACC_P | ACC_PL_K | ACC_TSS, + .reserved = 0 }; /* @@ -296,8 +277,8 @@ struct fake_descriptor64 kernel_tss_desc64 = { * bytes 6..7 offset 31..16 */ void -fix_desc(void *d, int num_desc) { - //early_kprintf("fix_desc(%x, %x)\n", d, num_desc); +fix_desc(void *d, int num_desc) +{ uint8_t *desc = (uint8_t*) d; do { @@ -306,30 +287,29 @@ fix_desc(void *d, int num_desc) { uint16_t selector; uint8_t wordcount; uint8_t acc; - + offset = *((uint32_t*)(desc)); - selector = *((uint32_t*)(desc+4)); + selector = *((uint32_t*)(desc + 4)); wordcount = desc[6] >> 4; acc = desc[7]; *((uint16_t*)desc) = offset & 0xFFFF; - *((uint16_t*)(desc+2)) = selector; + *((uint16_t*)(desc + 2)) = selector; desc[4] = wordcount; desc[5] = acc; - *((uint16_t*)(desc+6)) = offset >> 16; - + *((uint16_t*)(desc + 6)) = offset >> 16; } else { /* descriptor */ uint32_t base; uint16_t limit; uint8_t acc1, acc2; base = *((uint32_t*)(desc)); - limit = *((uint16_t*)(desc+4)); + limit = *((uint16_t*)(desc + 4)); acc2 = desc[6]; acc1 = desc[7]; *((uint16_t*)(desc)) = limit; - *((uint16_t*)(desc+2)) = base & 0xFFFF; + *((uint16_t*)(desc + 2)) = base & 0xFFFF; desc[4] = (base >> 16) & 0xFF; desc[5] = acc1; desc[6] = acc2; @@ -342,15 +322,15 @@ fix_desc(void *d, int num_desc) { void fix_desc64(void *descp, int count) { - struct fake_descriptor64 *fakep; + struct fake_descriptor64 *fakep; union { - struct real_gate64 gate; - struct real_descriptor64 desc; - } real; - int i; + struct real_gate64 gate; + struct real_descriptor64 desc; + } real; + int i; fakep = (struct fake_descriptor64 *) descp; - + for (i = 0; i < count; i++, fakep++) { /* * Construct the real decriptor locally. @@ -368,10 +348,10 @@ fix_desc64(void *descp, int count) real.gate.selector16 = fakep->lim_or_seg & 0xFFFF; real.gate.IST = fakep->size_or_IST & 0x7; real.gate.access8 = fakep->access; - real.gate.offset_high16 = (uint16_t)((fakep->offset64>>16) & 0xFFFF); - real.gate.offset_top32 = (uint32_t)(fakep->offset64>>32); + real.gate.offset_high16 = (uint16_t)((fakep->offset64 >> 16) & 0xFFFF); + real.gate.offset_top32 = (uint32_t)(fakep->offset64 >> 32); break; - default: /* Otherwise */ + default: /* Otherwise */ real.desc.limit_low16 = fakep->lim_or_seg & 0xFFFF; real.desc.base_low16 = (uint16_t)(fakep->offset64 & 0xFFFF); real.desc.base_med8 = (uint8_t)((fakep->offset64 >> 16) & 0xFF); @@ -379,7 +359,7 @@ fix_desc64(void *descp, int count) real.desc.limit_high4 = (fakep->lim_or_seg >> 16) & 0xFF; real.desc.granularity4 = fakep->size_or_IST; real.desc.base_high8 = (uint8_t)((fakep->offset64 >> 24) & 0xFF); - real.desc.base_top32 = (uint32_t)(fakep->offset64>>32); + real.desc.base_top32 = (uint32_t)(fakep->offset64 >> 32); } /* @@ -389,52 +369,38 @@ fix_desc64(void *descp, int count) } } -static void -cpu_gdt_alias(vm_map_offset_t gdt, vm_map_offset_t alias) -{ - pt_entry_t *pte = NULL; - - /* Require page alignment */ - assert(page_aligned(gdt)); - assert(page_aligned(alias)); - - pte = pmap_pte(kernel_pmap, alias); - pmap_store_pte(pte, kvtophys(gdt) | INTEL_PTE_REF - | INTEL_PTE_MOD - | INTEL_PTE_WIRED - | INTEL_PTE_VALID - | INTEL_PTE_WRITE - | INTEL_PTE_NX); - - /* TLB flush unneccessry because target processor isn't running yet */ -} - - +extern unsigned mldtsz; void -cpu_desc_init64(cpu_data_t *cdp) +cpu_desc_init(cpu_data_t *cdp) { - cpu_desc_index_t *cdi = &cdp->cpu_desc_index; + cpu_desc_index_t *cdi = &cdp->cpu_desc_index; - if (cdp == &cpu_data_master) { + if (cdp == cpu_data_master) { /* - * Master CPU uses the tables built at boot time. - * Just set the index pointers to the low memory space. + * Populate the double-mapped 'u' and base 'b' fields in the + * KTSS with I/G/LDT and sysenter stack data. */ - cdi->cdi_ktss = (void *)&master_ktss64; - cdi->cdi_sstk = (vm_offset_t) &master_sstk.top; - cdi->cdi_gdt.ptr = (void *)MASTER_GDT_ALIAS; - cdi->cdi_idt.ptr = (void *)MASTER_IDT_ALIAS; - cdi->cdi_ldt = (struct fake_descriptor *) master_ldt; + cdi->cdi_ktssu = (void *)DBLMAP(&master_ktss64); + cdi->cdi_ktssb = (void *)&master_ktss64; + cdi->cdi_sstku = (vm_offset_t) DBLMAP(&master_sstk.top); + cdi->cdi_sstkb = (vm_offset_t) &master_sstk.top; + + cdi->cdi_gdtu.ptr = (void *)DBLMAP((uintptr_t) &master_gdt); + cdi->cdi_gdtb.ptr = (void *)&master_gdt; + cdi->cdi_idtu.ptr = (void *)DBLMAP((uintptr_t) &master_idt64); + cdi->cdi_idtb.ptr = (void *)((uintptr_t) &master_idt64); + cdi->cdi_ldtu = (struct real_descriptor *)DBLMAP((uintptr_t)&master_ldt[0]); + cdi->cdi_ldtb = &master_ldt[0]; /* Replace the expanded LDTs and TSS slots in the GDT */ - kernel_ldt_desc64.offset64 = (uintptr_t) &master_ldt; + kernel_ldt_desc64.offset64 = (uintptr_t) cdi->cdi_ldtu; *(struct fake_descriptor64 *) &master_gdt[sel_idx(KERNEL_LDT)] = - kernel_ldt_desc64; + kernel_ldt_desc64; *(struct fake_descriptor64 *) &master_gdt[sel_idx(USER_LDT)] = - kernel_ldt_desc64; - kernel_tss_desc64.offset64 = (uintptr_t) &master_ktss64; + kernel_ldt_desc64; + kernel_tss_desc64.offset64 = (uintptr_t) DBLMAP(&master_ktss64); *(struct fake_descriptor64 *) &master_gdt[sel_idx(KERNEL_TSS)] = - kernel_tss_desc64; + kernel_tss_desc64; /* Fix up the expanded descriptors for 64-bit. */ fix_desc64((void *) &master_idt64, IDTSZ); @@ -444,77 +410,74 @@ cpu_desc_init64(cpu_data_t *cdp) /* * Set the NMI/fault stacks as IST2/IST1 in the 64-bit TSS - * Note: this will be dynamically re-allocated in VM later. */ master_ktss64.ist2 = (uintptr_t) low_eintstack; - master_ktss64.ist1 = (uintptr_t) low_eintstack - - sizeof(x86_64_intr_stack_frame_t); + master_ktss64.ist1 = (uintptr_t) low_eintstack - sizeof(x86_64_intr_stack_frame_t); + } else if (cdi->cdi_ktssu == NULL) { /* Skipping re-init on wake */ + cpu_desc_table64_t *cdt = (cpu_desc_table64_t *) cdp->cpu_desc_tablep; - } else if (cdi->cdi_ktss == NULL) { /* Skipping re-init on wake */ - cpu_desc_table64_t *cdt = (cpu_desc_table64_t *) cdp->cpu_desc_tablep; + cdi->cdi_idtu.ptr = (void *)DBLMAP((uintptr_t) &master_idt64); - /* - * Per-cpu GDT, IDT, KTSS descriptors are allocated in kernel - * heap (cpu_desc_table). - * LDT descriptors are mapped into a separate area. - * GDT descriptors are addressed by alias to avoid sgdt leaks to user-space. - */ - cdi->cdi_idt.ptr = (void *)MASTER_IDT_ALIAS; - cdi->cdi_gdt.ptr = (void *)CPU_GDT_ALIAS(cdp->cpu_number); - cdi->cdi_ktss = (void *)&cdt->ktss; - cdi->cdi_sstk = (vm_offset_t)&cdt->sstk.top; - cdi->cdi_ldt = cdp->cpu_ldtp; - - /* Make the virtual alias address for the GDT */ - cpu_gdt_alias((vm_map_offset_t) &cdt->gdt, - (vm_map_offset_t) cdi->cdi_gdt.ptr); + cdi->cdi_ktssu = (void *)DBLMAP(&cdt->ktss); + cdi->cdi_ktssb = (void *)(&cdt->ktss); + cdi->cdi_sstku = (vm_offset_t)DBLMAP(&cdt->sstk.top); + cdi->cdi_sstkb = (vm_offset_t)(&cdt->sstk.top); + cdi->cdi_ldtu = (void *)LDTALIAS(cdp->cpu_ldtp); + cdi->cdi_ldtb = (void *)(cdp->cpu_ldtp); /* * Copy the tables */ bcopy((char *)master_gdt, (char *)cdt->gdt, sizeof(master_gdt)); - bcopy((char *)master_ldt, (char *)cdp->cpu_ldtp, sizeof(master_ldt)); + bcopy((char *)master_ldt, (char *)cdp->cpu_ldtp, mldtsz); bcopy((char *)&master_ktss64, (char *)&cdt->ktss, sizeof(struct x86_64_tss)); - + cdi->cdi_gdtu.ptr = (void *)DBLMAP(cdt->gdt); + cdi->cdi_gdtb.ptr = (void *)(cdt->gdt); /* * Fix up the entries in the GDT to point to * this LDT and this TSS. + * Note reuse of global 'kernel_ldt_desc64, which is not + * concurrency-safe. Higher level synchronization is expected */ - kernel_ldt_desc64.offset64 = (uintptr_t) cdi->cdi_ldt; + kernel_ldt_desc64.offset64 = (uintptr_t) cdi->cdi_ldtu; *(struct fake_descriptor64 *) &cdt->gdt[sel_idx(KERNEL_LDT)] = - kernel_ldt_desc64; + kernel_ldt_desc64; fix_desc64(&cdt->gdt[sel_idx(KERNEL_LDT)], 1); - kernel_ldt_desc64.offset64 = (uintptr_t) cdi->cdi_ldt; + kernel_ldt_desc64.offset64 = (uintptr_t) cdi->cdi_ldtu; *(struct fake_descriptor64 *) &cdt->gdt[sel_idx(USER_LDT)] = - kernel_ldt_desc64; + kernel_ldt_desc64; fix_desc64(&cdt->gdt[sel_idx(USER_LDT)], 1); - kernel_tss_desc64.offset64 = (uintptr_t) cdi->cdi_ktss; + kernel_tss_desc64.offset64 = (uintptr_t) cdi->cdi_ktssu; *(struct fake_descriptor64 *) &cdt->gdt[sel_idx(KERNEL_TSS)] = - kernel_tss_desc64; + kernel_tss_desc64; fix_desc64(&cdt->gdt[sel_idx(KERNEL_TSS)], 1); /* Set (zeroed) fault stack as IST1, NMI intr stack IST2 */ - bzero((void *) cdt->fstk, sizeof(cdt->fstk)); - cdt->ktss.ist2 = (unsigned long)cdt->fstk + sizeof(cdt->fstk); - cdt->ktss.ist1 = cdt->ktss.ist2 - - sizeof(x86_64_intr_stack_frame_t); + uint8_t *cfstk = &scfstks[cdp->cpu_number].fstk[0]; + cdt->fstkp = cfstk; + bzero((void *) cfstk, FSTK_SZ); + cdt->ktss.ist2 = DBLMAP((uint64_t)cdt->fstkp + FSTK_SZ); + cdt->ktss.ist1 = cdt->ktss.ist2 - sizeof(x86_64_intr_stack_frame_t); } /* Require that the top of the sysenter stack is 16-byte aligned */ - if ((cdi->cdi_sstk % 16) != 0) - panic("cpu_desc_init64() sysenter stack not 16-byte aligned"); + if ((cdi->cdi_sstku % 16) != 0) { + panic("cpu_desc_init() sysenter stack not 16-byte aligned"); + } } - - void -cpu_desc_load64(cpu_data_t *cdp) +cpu_desc_load(cpu_data_t *cdp) { - cpu_desc_index_t *cdi = &cdp->cpu_desc_index; + cpu_desc_index_t *cdi = &cdp->cpu_desc_index; + + postcode(CPU_DESC_LOAD_ENTRY); /* Stuff the kernel per-cpu data area address into the MSRs */ + postcode(CPU_DESC_LOAD_GS_BASE); wrmsr64(MSR_IA32_GS_BASE, (uintptr_t) cdp); + postcode(CPU_DESC_LOAD_KERNEL_GS_BASE); wrmsr64(MSR_IA32_KERNEL_GS_BASE, (uintptr_t) cdp); /* @@ -525,28 +488,34 @@ cpu_desc_load64(cpu_data_t *cdp) gdt_desc_p(KERNEL_TSS)->access &= ~ACC_TSS_BUSY; /* Load the GDT, LDT, IDT and TSS */ - cdi->cdi_gdt.size = sizeof(struct real_descriptor)*GDTSZ - 1; - cdi->cdi_idt.size = 0x1000 + cdp->cpu_number; - lgdt((uintptr_t *) &cdi->cdi_gdt); - lidt((uintptr_t *) &cdi->cdi_idt); + cdi->cdi_gdtb.size = sizeof(struct real_descriptor) * GDTSZ - 1; + cdi->cdi_gdtu.size = cdi->cdi_gdtb.size; + cdi->cdi_idtb.size = 0x1000 + cdp->cpu_number; + cdi->cdi_idtu.size = cdi->cdi_idtb.size; + + postcode(CPU_DESC_LOAD_GDT); + lgdt((uintptr_t *) &cdi->cdi_gdtu); + postcode(CPU_DESC_LOAD_IDT); + lidt((uintptr_t *) &cdi->cdi_idtu); + postcode(CPU_DESC_LOAD_LDT); lldt(KERNEL_LDT); + postcode(CPU_DESC_LOAD_TSS); set_tr(KERNEL_TSS); -#if GPROF // Hack to enable mcount to work on K64 - __asm__ volatile("mov %0, %%gs" : : "rm" ((unsigned short)(KERNEL_DS))); -#endif + postcode(CPU_DESC_LOAD_EXIT); } - /* * Set MSRs for sysenter/sysexit and syscall/sysret for 64-bit. */ -static void -fast_syscall_init64(__unused cpu_data_t *cdp) +void +cpu_syscall_init(cpu_data_t *cdp) { - wrmsr64(MSR_IA32_SYSENTER_CS, SYSENTER_CS); - wrmsr64(MSR_IA32_SYSENTER_EIP, (uintptr_t) hi64_sysenter); - wrmsr64(MSR_IA32_SYSENTER_ESP, current_sstk()); +#pragma unused(cdp) + + wrmsr64(MSR_IA32_SYSENTER_CS, SYSENTER_CS); + wrmsr64(MSR_IA32_SYSENTER_EIP, DBLMAP((uintptr_t) hi64_sysenter)); + wrmsr64(MSR_IA32_SYSENTER_ESP, current_cpu_datap()->cpu_desc_index.cdi_sstku); /* Enable syscall/sysret */ wrmsr64(MSR_IA32_EFER, rdmsr64(MSR_IA32_EFER) | MSR_IA32_EFER_SCE); @@ -555,9 +524,8 @@ fast_syscall_init64(__unused cpu_data_t *cdp) * Note USER_CS because sysret uses this + 16 when returning to * 64-bit code. */ - wrmsr64(MSR_IA32_LSTAR, (uintptr_t) hi64_syscall); - wrmsr64(MSR_IA32_STAR, (((uint64_t)USER_CS) << 48) | - (((uint64_t)KERNEL64_CS) << 32)); + wrmsr64(MSR_IA32_LSTAR, DBLMAP((uintptr_t) hi64_syscall)); + wrmsr64(MSR_IA32_STAR, (((uint64_t)USER_CS) << 48) | (((uint64_t)KERNEL64_CS) << 32)); /* * Emulate eflags cleared by sysenter but note that * we also clear the trace trap to avoid the complications @@ -565,50 +533,86 @@ fast_syscall_init64(__unused cpu_data_t *cdp) * is also cleared to avoid a spurious "task switch" * should we choose to return via an IRET. */ - wrmsr64(MSR_IA32_FMASK, EFL_DF|EFL_IF|EFL_TF|EFL_NT); - + wrmsr64(MSR_IA32_FMASK, EFL_DF | EFL_IF | EFL_TF | EFL_NT); } +extern vm_offset_t dyn_dblmap(vm_offset_t, vm_offset_t); +uint64_t ldt_alias_offset; +__startup_func +static void +cpu_data_startup_init(void) +{ + int flags = KMA_GUARD_FIRST | KMA_GUARD_LAST | KMA_PERMANENT | + KMA_ZERO | KMA_KOBJECT; + uint32_t cpus = max_cpus_from_firmware; + vm_size_t size = percpu_section_size() * cpus; + kern_return_t kr; + + percpu_base.size = percpu_section_size(); + if (cpus == 0) { + panic("percpu: max_cpus_from_firmware not yet initialized"); + } + if (cpus == 1) { + percpu_base.start = VM_MAX_KERNEL_ADDRESS; + return; + } + + kr = kmem_alloc_flags(kernel_map, &percpu_base.start, + round_page(size) + 2 * PAGE_SIZE, VM_KERN_MEMORY_CPU, flags); + if (kr != KERN_SUCCESS) { + panic("percpu: kmem_alloc failed (%d)", kr); + } + + percpu_base.start += PAGE_SIZE - percpu_section_start(); + percpu_base.end = percpu_base.start + size - 1; +} +STARTUP(PERCPU, STARTUP_RANK_FIRST, cpu_data_startup_init); cpu_data_t * cpu_data_alloc(boolean_t is_boot_cpu) { - int ret; - cpu_data_t *cdp; + int ret; + cpu_data_t *cdp; if (is_boot_cpu) { assert(real_ncpus == 1); cdp = cpu_datap(0); if (cdp->cpu_processor == NULL) { simple_lock_init(&ncpus_lock, 0); - cdp->cpu_processor = cpu_processor_alloc(TRUE); -#if NCOPY_WINDOWS > 0 - cdp->cpu_pmap = pmap_cpu_alloc(TRUE); -#endif + cdp->cpu_processor = PERCPU_GET_MASTER(processor); } return cdp; } + boolean_t do_ldt_alloc = FALSE; + simple_lock(&ncpus_lock, LCK_GRP_NULL); + int cnum = real_ncpus; + real_ncpus++; + if (dyn_ldts == NULL) { + do_ldt_alloc = TRUE; + } + simple_unlock(&ncpus_lock); + /* * Allocate per-cpu data: */ - ret = kmem_alloc(kernel_map, (vm_offset_t *) &cdp, sizeof(cpu_data_t)); - if (ret != KERN_SUCCESS) { - printf("cpu_data_alloc() failed, ret=%d\n", ret); - goto abort; - } + + cdp = &scdatas[cnum]; bzero((void*) cdp, sizeof(cpu_data_t)); cdp->cpu_this = cdp; + cdp->cpu_number = cnum; + cdp->cd_shadow = &cpshadows[cnum]; + cdp->cpu_pcpu_base = percpu_base.start + (cnum - 1) * percpu_section_size(); + cdp->cpu_processor = PERCPU_GET_WITH_BASE(cdp->cpu_pcpu_base, processor); /* * Allocate interrupt stack: */ - ret = kmem_alloc(kernel_map, - (vm_offset_t *) &cdp->cpu_int_stack_top, - INTSTACK_SIZE); + ret = kmem_alloc(kernel_map, + (vm_offset_t *) &cdp->cpu_int_stack_top, + INTSTACK_SIZE, VM_KERN_MEMORY_CPU); if (ret != KERN_SUCCESS) { - printf("cpu_data_alloc() int stack failed, ret=%d\n", ret); - goto abort; + panic("cpu_data_alloc() int stack failed, ret=%d\n", ret); } bzero((void*) cdp->cpu_int_stack_top, INTSTACK_SIZE); cdp->cpu_int_stack_top += INTSTACK_SIZE; @@ -616,37 +620,49 @@ cpu_data_alloc(boolean_t is_boot_cpu) /* * Allocate descriptor table: */ - ret = kmem_alloc(kernel_map, - (vm_offset_t *) &cdp->cpu_desc_tablep, - sizeof(cpu_desc_table64_t)); - if (ret != KERN_SUCCESS) { - printf("cpu_data_alloc() desc_table failed, ret=%d\n", ret); - goto abort; - } + cdp->cpu_desc_tablep = (struct cpu_desc_table *) &scdtables[cnum]; /* * Allocate LDT */ - ret = kmem_alloc(kernel_map, - (vm_offset_t *) &cdp->cpu_ldtp, - sizeof(struct real_descriptor) * LDTSZ); - if (ret != KERN_SUCCESS) { - printf("cpu_data_alloc() ldt failed, ret=%d\n", ret); - goto abort; + if (do_ldt_alloc) { + boolean_t do_ldt_free = FALSE; + vm_offset_t sldtoffset = 0; + /* + * Allocate LDT + */ + vm_offset_t ldtalloc = 0, ldtallocsz = round_page_64(MAX_CPUS * sizeof(struct real_descriptor) * LDTSZ); + ret = kmem_alloc(kernel_map, (vm_offset_t *) &ldtalloc, ldtallocsz, VM_KERN_MEMORY_CPU); + if (ret != KERN_SUCCESS) { + panic("cpu_data_alloc() ldt failed, kmem_alloc=%d\n", ret); + } + + simple_lock(&ncpus_lock, LCK_GRP_NULL); + if (dyn_ldts == NULL) { + dyn_ldts = (cldt_t *)ldtalloc; + } else { + do_ldt_free = TRUE; + } + simple_unlock(&ncpus_lock); + + if (do_ldt_free) { + kmem_free(kernel_map, ldtalloc, ldtallocsz); + } else { + /* CPU registration and startup are expected to execute + * serially, as invoked by the platform driver. + * Create trampoline alias of LDT region. + */ + sldtoffset = dyn_dblmap(ldtalloc, ldtallocsz); + ldt_alias_offset = sldtoffset; + } } + cdp->cpu_ldtp = &dyn_ldts[cnum].pcldts[0]; #if CONFIG_MCA /* Machine-check shadow register allocation. */ mca_cpu_alloc(cdp); #endif - simple_lock(&ncpus_lock); - - cpu_data_ptr[real_ncpus] = cdp; - cdp->cpu_number = real_ncpus; - real_ncpus++; - simple_unlock(&ncpus_lock); - /* * Before this cpu has been assigned a real thread context, * we give it a fake, unique, non-zero thread id which the locking @@ -656,197 +672,104 @@ cpu_data_alloc(boolean_t is_boot_cpu) * started. */ cdp->cpu_active_thread = (thread_t) (uintptr_t) cdp->cpu_number; - + cdp->cpu_NMI_acknowledged = TRUE; cdp->cpu_nanotime = &pal_rtc_nanotime_info; kprintf("cpu_data_alloc(%d) %p desc_table: %p " - "ldt: %p " - "int_stack: 0x%lx-0x%lx\n", - cdp->cpu_number, cdp, cdp->cpu_desc_tablep, cdp->cpu_ldtp, - (long)(cdp->cpu_int_stack_top - INTSTACK_SIZE), (long)(cdp->cpu_int_stack_top)); + "ldt: %p " + "int_stack: 0x%lx-0x%lx\n", + cdp->cpu_number, cdp, cdp->cpu_desc_tablep, cdp->cpu_ldtp, + (long)(cdp->cpu_int_stack_top - INTSTACK_SIZE), (long)(cdp->cpu_int_stack_top)); + cpu_data_ptr[cnum] = cdp; return cdp; - -abort: - if (cdp) { - if (cdp->cpu_desc_tablep) - kfree((void *) cdp->cpu_desc_tablep, - sizeof(cpu_desc_table64_t)); - if (cdp->cpu_int_stack_top) - kfree((void *) (cdp->cpu_int_stack_top - INTSTACK_SIZE), - INTSTACK_SIZE); - kfree((void *) cdp, sizeof(*cdp)); - } - return NULL; } boolean_t valid_user_data_selector(uint16_t selector) { - sel_t sel = selector_to_sel(selector); - - if (selector == 0) - return (TRUE); - - if (sel.ti == SEL_LDT) - return (TRUE); - else if (sel.index < GDTSZ) { - if ((gdt_desc_p(selector)->access & ACC_PL_U) == ACC_PL_U) - return (TRUE); - } - - return (FALSE); -} + sel_t sel = selector_to_sel(selector); -boolean_t -valid_user_code_selector(uint16_t selector) -{ - sel_t sel = selector_to_sel(selector); - - if (selector == 0) - return (FALSE); - - if (sel.ti == SEL_LDT) { - if (sel.rpl == USER_PRIV) - return (TRUE); - } - else if (sel.index < GDTSZ && sel.rpl == USER_PRIV) { - if ((gdt_desc_p(selector)->access & ACC_PL_U) == ACC_PL_U) - return (TRUE); - } - - return (FALSE); -} + if (selector == 0) { + return TRUE; + } -boolean_t -valid_user_stack_selector(uint16_t selector) -{ - sel_t sel = selector_to_sel(selector); - - if (selector == 0) - return (FALSE); - - if (sel.ti == SEL_LDT) { - if (sel.rpl == USER_PRIV) - return (TRUE); - } - else if (sel.index < GDTSZ && sel.rpl == USER_PRIV) { - if ((gdt_desc_p(selector)->access & ACC_PL_U) == ACC_PL_U) - return (TRUE); - } - - return (FALSE); + if (sel.ti == SEL_LDT) { + return TRUE; + } else if (sel.index < GDTSZ) { + if ((gdt_desc_p(selector)->access & ACC_PL_U) == ACC_PL_U) { + return TRUE; + } + } + return FALSE; } boolean_t -valid_user_segment_selectors(uint16_t cs, - uint16_t ss, - uint16_t ds, - uint16_t es, - uint16_t fs, - uint16_t gs) -{ - return valid_user_code_selector(cs) && - valid_user_stack_selector(ss) && - valid_user_data_selector(ds) && - valid_user_data_selector(es) && - valid_user_data_selector(fs) && - valid_user_data_selector(gs); -} - -#if NCOPY_WINDOWS > 0 - -static vm_offset_t user_window_base = 0; - -void -cpu_userwindow_init(int cpu) +valid_user_code_selector(uint16_t selector) { - cpu_data_t *cdp = cpu_data_ptr[cpu]; - vm_offset_t user_window; - vm_offset_t vaddr; - int num_cpus; - - num_cpus = ml_get_max_cpus(); - - if (cpu >= num_cpus) - panic("cpu_userwindow_init: cpu > num_cpus"); - - if (user_window_base == 0) { + sel_t sel = selector_to_sel(selector); - if (vm_allocate(kernel_map, &vaddr, - (NBPDE * NCOPY_WINDOWS * num_cpus) + NBPDE, - VM_FLAGS_ANYWHERE) != KERN_SUCCESS) - panic("cpu_userwindow_init: " - "couldn't allocate user map window"); - - /* - * window must start on a page table boundary - * in the virtual address space - */ - user_window_base = (vaddr + (NBPDE - 1)) & ~(NBPDE - 1); - - /* - * get rid of any allocation leading up to our - * starting boundary - */ - vm_deallocate(kernel_map, vaddr, user_window_base - vaddr); + if (selector == 0) { + return FALSE; + } - /* - * get rid of tail that we don't need + if (sel.ti == SEL_LDT) { + if (sel.rpl == USER_PRIV) { + return TRUE; + } + } else if (sel.index < GDTSZ && sel.rpl == USER_PRIV) { + if ((gdt_desc_p(selector)->access & ACC_PL_U) == ACC_PL_U) { + return TRUE; + } + /* Explicitly validate the system code selectors + * even if not instantaneously privileged, + * since they are dynamically re-privileged + * at context switch */ - user_window = user_window_base + - (NBPDE * NCOPY_WINDOWS * num_cpus); - - vm_deallocate(kernel_map, user_window, - (vaddr + - ((NBPDE * NCOPY_WINDOWS * num_cpus) + NBPDE)) - - user_window); + if ((selector == USER_CS) || (selector == USER64_CS)) { + return TRUE; + } } - user_window = user_window_base + (cpu * NCOPY_WINDOWS * NBPDE); - - cdp->cpu_copywindow_base = user_window; - /* - * Abuse this pdp entry, the pdp now actually points to - * an array of copy windows addresses. - */ - cdp->cpu_copywindow_pdp = pmap_pde(kernel_pmap, user_window); - + return FALSE; } -void -cpu_physwindow_init(int cpu) +boolean_t +valid_user_stack_selector(uint16_t selector) { - cpu_data_t *cdp = cpu_data_ptr[cpu]; - vm_offset_t phys_window = cdp->cpu_physwindow_base; - - if (phys_window == 0) { - if (vm_allocate(kernel_map, &phys_window, - PAGE_SIZE, VM_FLAGS_ANYWHERE) - != KERN_SUCCESS) - panic("cpu_physwindow_init: " - "couldn't allocate phys map window"); + sel_t sel = selector_to_sel(selector); - /* - * make sure the page that encompasses the - * pte pointer we're interested in actually - * exists in the page table - */ - pmap_expand(kernel_pmap, phys_window, PMAP_EXPAND_OPTIONS_NONE); + if (selector == 0) { + return FALSE; + } - cdp->cpu_physwindow_base = phys_window; - cdp->cpu_physwindow_ptep = vtopte(phys_window); + if (sel.ti == SEL_LDT) { + if (sel.rpl == USER_PRIV) { + return TRUE; + } + } else if (sel.index < GDTSZ && sel.rpl == USER_PRIV) { + if ((gdt_desc_p(selector)->access & ACC_PL_U) == ACC_PL_U) { + return TRUE; + } } + + return FALSE; } -#endif /* NCOPY_WINDOWS > 0 */ -/* - * Load the segment descriptor tables for the current processor. - */ -void -cpu_mode_init(cpu_data_t *cdp) +boolean_t +valid_user_segment_selectors(uint16_t cs, + uint16_t ss, + uint16_t ds, + uint16_t es, + uint16_t fs, + uint16_t gs) { - fast_syscall_init64(cdp); + return valid_user_code_selector(cs) && + valid_user_stack_selector(ss) && + valid_user_data_selector(ds) && + valid_user_data_selector(es) && + valid_user_data_selector(fs) && + valid_user_data_selector(gs); } /* @@ -857,23 +780,19 @@ cpu_mode_init(cpu_data_t *cdp) void cpu_data_realloc(void) { - int ret; - vm_offset_t istk; - vm_offset_t fstk; - cpu_data_t *cdp; - boolean_t istate; + int ret; + vm_offset_t istk; + cpu_data_t *cdp; + boolean_t istate; - ret = kmem_alloc(kernel_map, &istk, INTSTACK_SIZE); + ret = kmem_alloc(kernel_map, &istk, INTSTACK_SIZE, VM_KERN_MEMORY_CPU); if (ret != KERN_SUCCESS) { panic("cpu_data_realloc() stack alloc, ret=%d\n", ret); } bzero((void*) istk, INTSTACK_SIZE); istk += INTSTACK_SIZE; - ret = kmem_alloc(kernel_map, (vm_offset_t *) &cdp, sizeof(cpu_data_t)); - if (ret != KERN_SUCCESS) { - panic("cpu_data_realloc() cpu data alloc, ret=%d\n", ret); - } + cdp = &scdatas[0]; /* Copy old contents into new area and make fix-ups */ assert(cpu_number() == 0); @@ -881,28 +800,25 @@ cpu_data_realloc(void) cdp->cpu_this = cdp; cdp->cpu_int_stack_top = istk; timer_call_queue_init(&cdp->rtclock_timer.queue); + cdp->cpu_desc_tablep = (struct cpu_desc_table *) &scdtables[0]; + cpu_desc_table64_t *cdt = (cpu_desc_table64_t *) cdp->cpu_desc_tablep; - /* Allocate the separate fault stack */ - ret = kmem_alloc(kernel_map, &fstk, PAGE_SIZE); - if (ret != KERN_SUCCESS) { - panic("cpu_data_realloc() fault stack alloc, ret=%d\n", ret); - } - bzero((void*) fstk, PAGE_SIZE); - fstk += PAGE_SIZE; + uint8_t *cfstk = &scfstks[cdp->cpu_number].fstk[0]; + cdt->fstkp = cfstk; + cfstk += FSTK_SZ; /* * With interrupts disabled commmit the new areas. */ istate = ml_set_interrupts_enabled(FALSE); cpu_data_ptr[0] = cdp; - master_ktss64.ist2 = (uintptr_t) fstk; - master_ktss64.ist1 = (uintptr_t) fstk - - sizeof(x86_64_intr_stack_frame_t); + master_ktss64.ist2 = DBLMAP((uintptr_t) cfstk); + master_ktss64.ist1 = DBLMAP((uintptr_t) cfstk - sizeof(x86_64_intr_stack_frame_t)); wrmsr64(MSR_IA32_GS_BASE, (uintptr_t) cdp); wrmsr64(MSR_IA32_KERNEL_GS_BASE, (uintptr_t) cdp); (void) ml_set_interrupts_enabled(istate); kprintf("Reallocated master cpu data: %p," - " interrupt stack: %p, fault stack: %p\n", - (void *) cdp, (void *) istk, (void *) fstk); + " interrupt stack: %p, fault stack: %p\n", + (void *) cdp, (void *) istk, (void *) cfstk); }