X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/6d2010ae8f7a6078e10b361c6962983bab233e0f..527f99514973766e9c0382a4d8550dfb00f54939:/osfmk/i386/mp_desc.c diff --git a/osfmk/i386/mp_desc.c b/osfmk/i386/mp_desc.c index 2421dc734..788e71663 100644 --- a/osfmk/i386/mp_desc.c +++ b/osfmk/i386/mp_desc.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -62,26 +62,29 @@ #include #include #include -#include #include #include #include #include -#include +#include #include #include #include #include +#include +#include #if CONFIG_MCA #include #endif #include -#include +#if MONOTONIC +#include +#endif /* MONOTONIC */ +#include -#ifdef __x86_64__ #define K_INTR_GATE (ACC_P|ACC_PL_K|ACC_INTR_GATE) #define U_INTR_GATE (ACC_P|ACC_PL_U|ACC_INTR_GATE) @@ -89,7 +92,8 @@ #define TRAP(n, name) extern void *name ; #define TRAP_ERR(n, name) extern void *name ; #define TRAP_SPC(n, name) extern void *name ; -#define TRAP_IST(n, name) extern void *name ; +#define TRAP_IST1(n, name) extern void *name ; +#define TRAP_IST2(n, name) extern void *name ; #define INTERRUPT(n) extern void *_intr_ ## n ; #define USER_TRAP(n, name) extern void *name ; #define USER_TRAP_SPC(n, name) extern void *name ; @@ -101,7 +105,8 @@ #undef TRAP #undef TRAP_ERR #undef TRAP_SPC -#undef TRAP_IST +#undef TRAP_IST1 +#undef TRAP_IST2 #undef INTERRUPT #undef USER_TRAP #undef USER_TRAP_SPC @@ -118,7 +123,7 @@ #define TRAP_ERR TRAP #define TRAP_SPC TRAP -#define TRAP_IST(n, name) \ +#define TRAP_IST1(n, name) \ [n] = { \ (uintptr_t)&name, \ KERNEL64_CS, \ @@ -127,6 +132,15 @@ 0 \ }, +#define TRAP_IST2(n, name) \ + [n] = { \ + (uintptr_t)&name, \ + KERNEL64_CS, \ + 2, \ + K_INTR_GATE, \ + 0 \ + }, + #define INTERRUPT(n) \ [n] = { \ (uintptr_t)&_intr_ ## n,\ @@ -146,68 +160,49 @@ }, #define USER_TRAP_SPC USER_TRAP - // Declare the table using the macros we just set up -struct fake_descriptor64 master_idt64[IDTSZ] __attribute__ ((aligned (4096))) = { +struct fake_descriptor64 master_idt64[IDTSZ] + __attribute__ ((section("__HIB,__desc"))) + __attribute__ ((aligned(PAGE_SIZE))) = { #include "../x86_64/idt_table.h" }; -#endif - -/* - * The i386 needs an interrupt stack to keep the PCB stack from being - * overrun by interrupts. All interrupt stacks MUST lie at lower addresses - * than any thread`s kernel stack. - */ /* * First cpu`s interrupt stack. */ -extern uint32_t low_intstack[]; /* bottom */ +extern uint32_t low_intstack[]; /* bottom */ extern uint32_t low_eintstack[]; /* top */ /* * Per-cpu data area pointers. - * The master cpu (cpu 0) has its data area statically allocated; - * others are allocated dynamically and this array is updated at runtime. */ -cpu_data_t cpu_data_master = { - .cpu_this = &cpu_data_master, - .cpu_nanotime = &pal_rtc_nanotime_info, - .cpu_int_stack_top = (vm_offset_t) low_eintstack, -#ifdef __i386__ - .cpu_is64bit = FALSE, -#else - .cpu_is64bit = TRUE -#endif +cpu_data_t cpshadows[MAX_CPUS] __attribute__((aligned(64))) __attribute__((section("__HIB, __desc"))); +cpu_data_t scdatas[MAX_CPUS] __attribute__((aligned(64))) = { + [0].cpu_this = &scdatas[0], + [0].cpu_nanotime = &pal_rtc_nanotime_info, + [0].cpu_int_stack_top = (vm_offset_t) low_eintstack, + [0].cd_shadow = &cpshadows[0] }; -cpu_data_t *cpu_data_ptr[MAX_CPUS] = { [0] = &cpu_data_master }; +cpu_data_t *cpu_data_master = &scdatas[0]; + +cpu_data_t *cpu_data_ptr[MAX_CPUS] = { [0] = &scdatas[0] }; decl_simple_lock_data(,ncpus_lock); /* protects real_ncpus */ unsigned int real_ncpus = 1; unsigned int max_ncpus = MAX_CPUS; -#ifdef __i386__ -extern void *hi_remap_text; -#define HI_TEXT(lo_text) \ - (((uint32_t)&lo_text - (uint32_t)&hi_remap_text) + HIGH_MEM_BASE) - -extern void hi_sysenter(void); +extern void hi64_sysenter(void); +extern void hi64_syscall(void); typedef struct { - uint16_t length; - uint32_t offset[2]; -} __attribute__((__packed__)) table_descriptor64_t; + struct real_descriptor pcldts[LDTSZ]; +} cldt_t; -extern table_descriptor64_t gdtptr64; -extern table_descriptor64_t idtptr64; -#endif -extern void hi64_sysenter(void); -extern void hi64_syscall(void); +cpu_desc_table64_t scdtables[MAX_CPUS] __attribute__((aligned(64))) __attribute__((section("__HIB, __desc"))); +cpu_fault_stack_t scfstks[MAX_CPUS] __attribute__((aligned(64))) __attribute__((section("__HIB, __desc"))); -#if defined(__x86_64__) && !defined(UBER64) -#define UBER64(x) ((uintptr_t)x) -#endif +cldt_t *dyn_ldts; /* * Multiprocessor i386/i486 systems use a separate copy of the @@ -222,41 +217,6 @@ extern void hi64_syscall(void); * Allocate and initialize the per-processor descriptor tables. */ -struct fake_descriptor ldt_desc_pattern = { - (unsigned int) 0, - LDTSZ_MIN * sizeof(struct fake_descriptor) - 1, - 0, - ACC_P|ACC_PL_K|ACC_LDT -}; - -struct fake_descriptor tss_desc_pattern = { - (unsigned int) 0, - sizeof(struct i386_tss) - 1, - 0, - ACC_P|ACC_PL_K|ACC_TSS -}; - -struct fake_descriptor cpudata_desc_pattern = { - (unsigned int) 0, - sizeof(cpu_data_t)-1, - SZ_32, - ACC_P|ACC_PL_K|ACC_DATA_W -}; - -struct fake_descriptor userwindow_desc_pattern = { - (unsigned int) 0, - ((NBPDE * NCOPY_WINDOWS) / PAGE_SIZE) - 1, - SZ_32 | SZ_G, - ACC_P|ACC_PL_U|ACC_DATA_W -}; - -struct fake_descriptor physwindow_desc_pattern = { - (unsigned int) 0, - PAGE_SIZE - 1, - SZ_32, - ACC_P|ACC_PL_K|ACC_DATA_W -}; - /* * This is the expanded, 64-bit variant of the kernel LDT descriptor. * When switching to 64-bit mode this replaces KERNEL_LDT entry @@ -407,179 +367,36 @@ fix_desc64(void *descp, int count) } } -#ifdef __i386__ +extern unsigned mldtsz; void cpu_desc_init(cpu_data_t *cdp) { cpu_desc_index_t *cdi = &cdp->cpu_desc_index; - if (cdp == &cpu_data_master) { - /* - * Fix up the entries in the GDT to point to - * this LDT and this TSS. - */ - struct fake_descriptor temp_fake_desc; - temp_fake_desc = ldt_desc_pattern; - temp_fake_desc.offset = (vm_offset_t) &master_ldt; - fix_desc(&temp_fake_desc, 1); - *(struct fake_descriptor *) &master_gdt[sel_idx(KERNEL_LDT)] = - temp_fake_desc; - *(struct fake_descriptor *) &master_gdt[sel_idx(USER_LDT)] = - temp_fake_desc; - - temp_fake_desc = tss_desc_pattern; - temp_fake_desc.offset = (vm_offset_t) &master_ktss; - fix_desc(&temp_fake_desc, 1); - *(struct fake_descriptor *) &master_gdt[sel_idx(KERNEL_TSS)] = - temp_fake_desc; - -#if MACH_KDB - temp_fake_desc = tss_desc_pattern; - temp_fake_desc.offset = (vm_offset_t) &master_dbtss; - fix_desc(&temp_fake_desc, 1); - *(struct fake_descriptor *) &master_gdt[sel_idx(DEBUG_TSS)] = - temp_fake_desc; -#endif - - temp_fake_desc = cpudata_desc_pattern; - temp_fake_desc.offset = (vm_offset_t) &cpu_data_master; - fix_desc(&temp_fake_desc, 1); - *(struct fake_descriptor *) &master_gdt[sel_idx(CPU_DATA_GS)] = - temp_fake_desc; - - fix_desc((void *)&master_idt, IDTSZ); - - cdi->cdi_idt.ptr = master_idt; - cdi->cdi_gdt.ptr = (void *)master_gdt; - - - /* - * Master CPU uses the tables built at boot time. - * Just set the index pointers to the high shared-mapping space. - * Note that the sysenter stack uses empty space above the ktss - * in the HIGH_FIXED_KTSS page. In this case we don't map the - * the real master_sstk in low memory. - */ - cdi->cdi_ktss = (struct i386_tss *) - pmap_index_to_virt(HIGH_FIXED_KTSS) ; - cdi->cdi_sstk = (vm_offset_t) (cdi->cdi_ktss + 1) + - (vm_offset_t) &master_sstk.top - - (vm_offset_t) &master_sstk; - } else { - cpu_desc_table_t *cdt = (cpu_desc_table_t *) cdp->cpu_desc_tablep; - - vm_offset_t cpu_hi_desc; - - cpu_hi_desc = pmap_cpu_high_shared_remap( - cdp->cpu_number, - HIGH_CPU_DESC, - (vm_offset_t) cdt, 1); - - /* - * Per-cpu GDT, IDT, LDT, KTSS descriptors are allocated in one - * block (cpu_desc_table) and double-mapped into high shared space - * in one page window. - * Also, a transient stack for the fast sysenter path. The top of - * which is set at context switch time to point to the PCB using - * the high address. - */ - cdi->cdi_gdt.ptr = (struct fake_descriptor *) (cpu_hi_desc + - offsetof(cpu_desc_table_t, gdt[0])); - cdi->cdi_idt.ptr = (struct fake_descriptor *) (cpu_hi_desc + - offsetof(cpu_desc_table_t, idt[0])); - cdi->cdi_ktss = (struct i386_tss *) (cpu_hi_desc + - offsetof(cpu_desc_table_t, ktss)); - cdi->cdi_sstk = cpu_hi_desc + offsetof(cpu_desc_table_t, sstk.top); - - /* - * LDT descriptors are mapped into a seperate area. - */ - cdi->cdi_ldt = (struct fake_descriptor *) - pmap_cpu_high_shared_remap( - cdp->cpu_number, - HIGH_CPU_LDT_BEGIN, - (vm_offset_t) cdp->cpu_ldtp, - HIGH_CPU_LDT_END - HIGH_CPU_LDT_BEGIN + 1); - - /* - * Copy the tables - */ - bcopy((char *)master_idt, (char *)cdt->idt, sizeof(master_idt)); - bcopy((char *)master_gdt, (char *)cdt->gdt, sizeof(master_gdt)); - bcopy((char *)master_ldt, (char *)cdp->cpu_ldtp, sizeof(master_ldt)); - bzero((char *)&cdt->ktss, sizeof(struct i386_tss)); -#if MACH_KDB - cdi->cdi_dbtss = (struct i386_tss *) (cpu_hi_desc + - offsetof(cpu_desc_table_t, dbtss)); - bcopy((char *)&master_dbtss, - (char *)&cdt->dbtss, - sizeof(struct i386_tss)); -#endif /* MACH_KDB */ - - /* - * Fix up the entries in the GDT to point to - * this LDT and this TSS. - */ - struct fake_descriptor temp_ldt = ldt_desc_pattern; - temp_ldt.offset = (vm_offset_t)cdi->cdi_ldt; - fix_desc(&temp_ldt, 1); - - cdt->gdt[sel_idx(KERNEL_LDT)] = temp_ldt; - cdt->gdt[sel_idx(USER_LDT)] = temp_ldt; - - cdt->gdt[sel_idx(KERNEL_TSS)] = tss_desc_pattern; - cdt->gdt[sel_idx(KERNEL_TSS)].offset = (vm_offset_t) cdi->cdi_ktss; - fix_desc(&cdt->gdt[sel_idx(KERNEL_TSS)], 1); - - cdt->gdt[sel_idx(CPU_DATA_GS)] = cpudata_desc_pattern; - cdt->gdt[sel_idx(CPU_DATA_GS)].offset = (vm_offset_t) cdp; - fix_desc(&cdt->gdt[sel_idx(CPU_DATA_GS)], 1); - -#if MACH_KDB /* this only works for legacy 32-bit machines */ - cdt->gdt[sel_idx(DEBUG_TSS)] = tss_desc_pattern; - cdt->gdt[sel_idx(DEBUG_TSS)].offset = (vm_offset_t) cdi->cdi_dbtss; - fix_desc(&cdt->gdt[sel_idx(DEBUG_TSS)], 1); - - cdt->dbtss.esp0 = (int)(db_task_stack_store + - (INTSTACK_SIZE * (cdp->cpu_number + 1)) - sizeof (natural_t)); - cdt->dbtss.esp = cdt->dbtss.esp0; - cdt->dbtss.eip = (int)&db_task_start; -#endif /* MACH_KDB */ - - cdt->ktss.ss0 = KERNEL_DS; - cdt->ktss.io_bit_map_offset = 0x0FFF; /* no IO bitmap */ - - cpu_userwindow_init(cdp->cpu_number); - cpu_physwindow_init(cdp->cpu_number); - - } -} -#endif /* __i386__ */ - -void -cpu_desc_init64(cpu_data_t *cdp) -{ - cpu_desc_index_t *cdi = &cdp->cpu_desc_index; - - if (cdp == &cpu_data_master) { + if (cdp == cpu_data_master) { /* - * Master CPU uses the tables built at boot time. - * Just set the index pointers to the low memory space. + * Populate the double-mapped 'u' and base 'b' fields in the + * KTSS with I/G/LDT and sysenter stack data. */ - cdi->cdi_ktss = (void *)&master_ktss64; - cdi->cdi_sstk = (vm_offset_t) &master_sstk.top; - cdi->cdi_gdt.ptr = (void *)master_gdt; - cdi->cdi_idt.ptr = (void *)master_idt64; - cdi->cdi_ldt = (struct fake_descriptor *) master_ldt; - + cdi->cdi_ktssu = (void *)DBLMAP(&master_ktss64); + cdi->cdi_ktssb = (void *)&master_ktss64; + cdi->cdi_sstku = (vm_offset_t) DBLMAP(&master_sstk.top); + cdi->cdi_sstkb = (vm_offset_t) &master_sstk.top; + + cdi->cdi_gdtu.ptr = (void *)DBLMAP((uintptr_t) &master_gdt); + cdi->cdi_gdtb.ptr = (void *)&master_gdt; + cdi->cdi_idtu.ptr = (void *)DBLMAP((uintptr_t) &master_idt64); + cdi->cdi_idtb.ptr = (void *)((uintptr_t) &master_idt64); + cdi->cdi_ldtu = (struct fake_descriptor *) (void *) DBLMAP((uintptr_t)&master_ldt[0]); + cdi->cdi_ldtb = (struct fake_descriptor *) (void *) &master_ldt[0]; /* Replace the expanded LDTs and TSS slots in the GDT */ - kernel_ldt_desc64.offset64 = UBER64(&master_ldt); + kernel_ldt_desc64.offset64 = (uintptr_t) cdi->cdi_ldtu; *(struct fake_descriptor64 *) &master_gdt[sel_idx(KERNEL_LDT)] = kernel_ldt_desc64; *(struct fake_descriptor64 *) &master_gdt[sel_idx(USER_LDT)] = kernel_ldt_desc64; - kernel_tss_desc64.offset64 = UBER64(&master_ktss64); + kernel_tss_desc64.offset64 = (uintptr_t) DBLMAP(&master_ktss64); *(struct fake_descriptor64 *) &master_gdt[sel_idx(KERNEL_TSS)] = kernel_tss_desc64; @@ -590,157 +407,118 @@ cpu_desc_init64(cpu_data_t *cdp) fix_desc64((void *) &master_gdt[sel_idx(KERNEL_TSS)], 1); /* - * Set the double-fault stack as IST1 in the 64-bit TSS + * Set the NMI/fault stacks as IST2/IST1 in the 64-bit TSS */ - master_ktss64.ist1 = UBER64((uintptr_t) df_task_stack_end); - - } else { + master_ktss64.ist2 = (uintptr_t) low_eintstack; + master_ktss64.ist1 = (uintptr_t) low_eintstack - sizeof(x86_64_intr_stack_frame_t); + } else if (cdi->cdi_ktssu == NULL) { /* Skipping re-init on wake */ cpu_desc_table64_t *cdt = (cpu_desc_table64_t *) cdp->cpu_desc_tablep; - /* - * Per-cpu GDT, IDT, KTSS descriptors are allocated in kernel - * heap (cpu_desc_table). - * LDT descriptors are mapped into a separate area. - */ - cdi->cdi_gdt.ptr = (struct fake_descriptor *)cdt->gdt; - cdi->cdi_idt.ptr = (void *)cdt->idt; - cdi->cdi_ktss = (void *)&cdt->ktss; - cdi->cdi_sstk = (vm_offset_t)&cdt->sstk.top; - cdi->cdi_ldt = cdp->cpu_ldtp; + + cdi->cdi_idtu.ptr = (void *)DBLMAP((uintptr_t) &master_idt64); + + cdi->cdi_ktssu = (void *)DBLMAP(&cdt->ktss); + cdi->cdi_ktssb = (void *)(&cdt->ktss); + cdi->cdi_sstku = (vm_offset_t)DBLMAP(&cdt->sstk.top); + cdi->cdi_sstkb = (vm_offset_t)(&cdt->sstk.top); + cdi->cdi_ldtu = (void *)LDTALIAS(cdp->cpu_ldtp); + cdi->cdi_ldtb = (void *)(cdp->cpu_ldtp); /* * Copy the tables */ - bcopy((char *)master_idt64, (char *)cdt->idt, sizeof(master_idt64)); bcopy((char *)master_gdt, (char *)cdt->gdt, sizeof(master_gdt)); - bcopy((char *)master_ldt, (char *)cdp->cpu_ldtp, sizeof(master_ldt)); + bcopy((char *)master_ldt, (char *)cdp->cpu_ldtp, mldtsz); bcopy((char *)&master_ktss64, (char *)&cdt->ktss, sizeof(struct x86_64_tss)); - + cdi->cdi_gdtu.ptr = (void *)DBLMAP(cdt->gdt); + cdi->cdi_gdtb.ptr = (void *)(cdt->gdt); /* * Fix up the entries in the GDT to point to * this LDT and this TSS. + * Note reuse of global 'kernel_ldt_desc64, which is not + * concurrency-safe. Higher level synchronization is expected */ - kernel_ldt_desc64.offset64 = UBER64(cdi->cdi_ldt); + kernel_ldt_desc64.offset64 = (uintptr_t) cdi->cdi_ldtu; *(struct fake_descriptor64 *) &cdt->gdt[sel_idx(KERNEL_LDT)] = kernel_ldt_desc64; fix_desc64(&cdt->gdt[sel_idx(KERNEL_LDT)], 1); - kernel_ldt_desc64.offset64 = UBER64(cdi->cdi_ldt); + kernel_ldt_desc64.offset64 = (uintptr_t) cdi->cdi_ldtu; *(struct fake_descriptor64 *) &cdt->gdt[sel_idx(USER_LDT)] = kernel_ldt_desc64; fix_desc64(&cdt->gdt[sel_idx(USER_LDT)], 1); - kernel_tss_desc64.offset64 = UBER64(cdi->cdi_ktss); + kernel_tss_desc64.offset64 = (uintptr_t) cdi->cdi_ktssu; *(struct fake_descriptor64 *) &cdt->gdt[sel_idx(KERNEL_TSS)] = kernel_tss_desc64; fix_desc64(&cdt->gdt[sel_idx(KERNEL_TSS)], 1); - /* Set (zeroed) double-fault stack as IST1 */ - bzero((void *) cdt->dfstk, sizeof(cdt->dfstk)); - cdt->ktss.ist1 = UBER64((unsigned long)cdt->dfstk + sizeof(cdt->dfstk)); -#ifdef __i386__ - cdt->gdt[sel_idx(CPU_DATA_GS)] = cpudata_desc_pattern; - cdt->gdt[sel_idx(CPU_DATA_GS)].offset = (vm_offset_t) cdp; - fix_desc(&cdt->gdt[sel_idx(CPU_DATA_GS)], 1); - - /* Allocate copyio windows */ - cpu_userwindow_init(cdp->cpu_number); - cpu_physwindow_init(cdp->cpu_number); -#endif + /* Set (zeroed) fault stack as IST1, NMI intr stack IST2 */ + uint8_t *cfstk = &scfstks[cdp->cpu_number].fstk[0]; + cdt->fstkp = cfstk; + bzero((void *) cfstk, FSTK_SZ); + cdt->ktss.ist2 = DBLMAP((uint64_t)cdt->fstkp + FSTK_SZ); + cdt->ktss.ist1 = cdt->ktss.ist2 - sizeof(x86_64_intr_stack_frame_t); } /* Require that the top of the sysenter stack is 16-byte aligned */ - if ((cdi->cdi_sstk % 16) != 0) - panic("cpu_desc_init64() sysenter stack not 16-byte aligned"); + if ((cdi->cdi_sstku % 16) != 0) + panic("cpu_desc_init() sysenter stack not 16-byte aligned"); } - -#ifdef __i386__ void cpu_desc_load(cpu_data_t *cdp) { cpu_desc_index_t *cdi = &cdp->cpu_desc_index; - cdi->cdi_idt.size = 0x1000 + cdp->cpu_number; - cdi->cdi_gdt.size = sizeof(struct real_descriptor)*GDTSZ - 1; - - lgdt((unsigned long *) &cdi->cdi_gdt); - lidt((unsigned long *) &cdi->cdi_idt); - lldt(KERNEL_LDT); - - set_tr(KERNEL_TSS); + postcode(CPU_DESC_LOAD_ENTRY); - __asm__ volatile("mov %0, %%gs" : : "rm" ((unsigned short)(CPU_DATA_GS))); -} -#endif /* __i386__ */ - -void -cpu_desc_load64(cpu_data_t *cdp) -{ - cpu_desc_index_t *cdi = &cdp->cpu_desc_index; + /* Stuff the kernel per-cpu data area address into the MSRs */ + postcode(CPU_DESC_LOAD_GS_BASE); + wrmsr64(MSR_IA32_GS_BASE, (uintptr_t) cdp); + postcode(CPU_DESC_LOAD_KERNEL_GS_BASE); + wrmsr64(MSR_IA32_KERNEL_GS_BASE, (uintptr_t) cdp); -#ifdef __i386__ /* - * Load up the new descriptors etc - * ml_load_desc64() expects these global pseudo-descriptors: - * gdtptr64 -> per-cpu gdt - * idtptr64 -> per-cpu idt - * These are 10-byte descriptors with 64-bit addresses into - * uber-space. - * - * Refer to commpage/cpu_number.s for the IDT limit trick. + * Ensure the TSS segment's busy bit is clear. This is required + * for the case of reloading descriptors at wake to avoid + * their complete re-initialization. */ - gdtptr64.length = GDTSZ * sizeof(struct real_descriptor) - 1; - gdtptr64.offset[0] = (uint32_t) cdi->cdi_gdt.ptr; - gdtptr64.offset[1] = KERNEL_UBER_BASE_HI32; - idtptr64.length = 0x1000 + cdp->cpu_number; - idtptr64.offset[0] = (uint32_t) cdi->cdi_idt.ptr; - idtptr64.offset[1] = KERNEL_UBER_BASE_HI32; - - /* Make sure busy bit is cleared in the TSS */ gdt_desc_p(KERNEL_TSS)->access &= ~ACC_TSS_BUSY; - ml_load_desc64(); -#else /* Load the GDT, LDT, IDT and TSS */ - cdi->cdi_gdt.size = sizeof(struct real_descriptor)*GDTSZ - 1; - cdi->cdi_idt.size = 0x1000 + cdp->cpu_number; - lgdt((unsigned long *) &cdi->cdi_gdt); - lidt((unsigned long *) &cdi->cdi_idt); + cdi->cdi_gdtb.size = sizeof(struct real_descriptor)*GDTSZ - 1; + cdi->cdi_gdtu.size = cdi->cdi_gdtb.size; + cdi->cdi_idtb.size = 0x1000 + cdp->cpu_number; + cdi->cdi_idtu.size = cdi->cdi_idtb.size; + + postcode(CPU_DESC_LOAD_GDT); + lgdt((uintptr_t *) &cdi->cdi_gdtu); + postcode(CPU_DESC_LOAD_IDT); + lidt((uintptr_t *) &cdi->cdi_idtu); + postcode(CPU_DESC_LOAD_LDT); lldt(KERNEL_LDT); + postcode(CPU_DESC_LOAD_TSS); set_tr(KERNEL_TSS); - /* Stuff the pre-cpu data area into the MSR and swapgs to activate */ - wrmsr64(MSR_IA32_KERNEL_GS_BASE, (unsigned long)cdp); #if GPROF // Hack to enable mcount to work on K64 __asm__ volatile("mov %0, %%gs" : : "rm" ((unsigned short)(KERNEL_DS))); #endif - swapgs(); - - cpu_mode_init(cdp); -#endif + postcode(CPU_DESC_LOAD_EXIT); } -#ifdef __i386__ -/* - * Set MSRs for sysenter/sysexit for 32-bit. - */ -static void -fast_syscall_init(__unused cpu_data_t *cdp) -{ - wrmsr(MSR_IA32_SYSENTER_CS, SYSENTER_CS, 0); - wrmsr(MSR_IA32_SYSENTER_EIP, HI_TEXT(hi_sysenter), 0); - wrmsr(MSR_IA32_SYSENTER_ESP, current_sstk(), 0); -} -#endif - /* * Set MSRs for sysenter/sysexit and syscall/sysret for 64-bit. */ -static void -fast_syscall_init64(__unused cpu_data_t *cdp) +void +cpu_syscall_init(cpu_data_t *cdp) { +#if MONOTONIC + mt_cpu_up(cdp); +#else /* MONOTONIC */ +#pragma unused(cdp) +#endif /* !MONOTONIC */ wrmsr64(MSR_IA32_SYSENTER_CS, SYSENTER_CS); - wrmsr64(MSR_IA32_SYSENTER_EIP, UBER64((uintptr_t) hi64_sysenter)); - wrmsr64(MSR_IA32_SYSENTER_ESP, UBER64(current_sstk())); + wrmsr64(MSR_IA32_SYSENTER_EIP, DBLMAP((uintptr_t) hi64_sysenter)); + wrmsr64(MSR_IA32_SYSENTER_ESP, current_cpu_datap()->cpu_desc_index.cdi_sstku); /* Enable syscall/sysret */ wrmsr64(MSR_IA32_EFER, rdmsr64(MSR_IA32_EFER) | MSR_IA32_EFER_SCE); @@ -749,9 +527,8 @@ fast_syscall_init64(__unused cpu_data_t *cdp) * Note USER_CS because sysret uses this + 16 when returning to * 64-bit code. */ - wrmsr64(MSR_IA32_LSTAR, UBER64((uintptr_t) hi64_syscall)); - wrmsr64(MSR_IA32_STAR, (((uint64_t)USER_CS) << 48) | - (((uint64_t)KERNEL64_CS) << 32)); + wrmsr64(MSR_IA32_LSTAR, DBLMAP((uintptr_t) hi64_syscall)); + wrmsr64(MSR_IA32_STAR, (((uint64_t)USER_CS) << 48) | (((uint64_t)KERNEL64_CS) << 32)); /* * Emulate eflags cleared by sysenter but note that * we also clear the trace trap to avoid the complications @@ -761,20 +538,9 @@ fast_syscall_init64(__unused cpu_data_t *cdp) */ wrmsr64(MSR_IA32_FMASK, EFL_DF|EFL_IF|EFL_TF|EFL_NT); -#ifdef __i386__ - /* - * Set the Kernel GS base MSR to point to per-cpu data in uber-space. - * The uber-space handler (hi64_syscall) uses the swapgs instruction. - */ - wrmsr64(MSR_IA32_KERNEL_GS_BASE, UBER64(cdp)); - -#if ONLY_SAFE_FOR_LINDA_SERIAL - kprintf("fast_syscall_init64() KERNEL_GS_BASE=0x%016llx\n", - rdmsr64(MSR_IA32_KERNEL_GS_BASE)); -#endif -#endif } - +extern vm_offset_t dyn_dblmap(vm_offset_t, vm_offset_t); +uint64_t ldt_alias_offset; cpu_data_t * cpu_data_alloc(boolean_t is_boot_cpu) @@ -784,7 +550,7 @@ cpu_data_alloc(boolean_t is_boot_cpu) if (is_boot_cpu) { assert(real_ncpus == 1); - cdp = &cpu_data_master; + cdp = cpu_datap(0); if (cdp->cpu_processor == NULL) { simple_lock_init(&ncpus_lock, 0); cdp->cpu_processor = cpu_processor_alloc(TRUE); @@ -795,68 +561,91 @@ cpu_data_alloc(boolean_t is_boot_cpu) return cdp; } + boolean_t do_ldt_alloc = FALSE; + simple_lock(&ncpus_lock); + int cnum = real_ncpus; + real_ncpus++; + if (dyn_ldts == NULL) { + do_ldt_alloc = TRUE; + } + simple_unlock(&ncpus_lock); + /* * Allocate per-cpu data: */ - ret = kmem_alloc(kernel_map, (vm_offset_t *) &cdp, sizeof(cpu_data_t)); - if (ret != KERN_SUCCESS) { - printf("cpu_data_alloc() failed, ret=%d\n", ret); - goto abort; - } + + cdp = &scdatas[cnum]; bzero((void*) cdp, sizeof(cpu_data_t)); cdp->cpu_this = cdp; - - /* Propagate mode */ - cdp->cpu_is64bit = cpu_mode_is64bit(); - + cdp->cpu_number = cnum; + cdp->cd_shadow = &cpshadows[cnum]; /* * Allocate interrupt stack: */ ret = kmem_alloc(kernel_map, (vm_offset_t *) &cdp->cpu_int_stack_top, - INTSTACK_SIZE); + INTSTACK_SIZE, VM_KERN_MEMORY_CPU); if (ret != KERN_SUCCESS) { - printf("cpu_data_alloc() int stack failed, ret=%d\n", ret); - goto abort; + panic("cpu_data_alloc() int stack failed, ret=%d\n", ret); } bzero((void*) cdp->cpu_int_stack_top, INTSTACK_SIZE); cdp->cpu_int_stack_top += INTSTACK_SIZE; /* * Allocate descriptor table: - * Size depends on cpu mode. */ - ret = kmem_alloc(kernel_map, - (vm_offset_t *) &cdp->cpu_desc_tablep, - cdp->cpu_is64bit ? sizeof(cpu_desc_table64_t) - : sizeof(cpu_desc_table_t)); - if (ret != KERN_SUCCESS) { - printf("cpu_data_alloc() desc_table failed, ret=%d\n", ret); - goto abort; - } + cdp->cpu_desc_tablep = (struct cpu_desc_table *) &scdtables[cnum]; /* * Allocate LDT */ - ret = kmem_alloc(kernel_map, - (vm_offset_t *) &cdp->cpu_ldtp, - sizeof(struct real_descriptor) * LDTSZ); - if (ret != KERN_SUCCESS) { - printf("cpu_data_alloc() ldt failed, ret=%d\n", ret); - goto abort; + if (do_ldt_alloc) { + boolean_t do_ldt_free = FALSE; + vm_offset_t sldtoffset = 0; + /* + * Allocate LDT + */ + vm_offset_t ldtalloc = 0, ldtallocsz = round_page_64(MAX_CPUS * sizeof(struct real_descriptor) * LDTSZ); + ret = kmem_alloc(kernel_map, (vm_offset_t *) &ldtalloc, ldtallocsz, VM_KERN_MEMORY_CPU); + if (ret != KERN_SUCCESS) { + panic("cpu_data_alloc() ldt failed, kmem_alloc=%d\n", ret); + } + + simple_lock(&ncpus_lock); + if (dyn_ldts == NULL) { + dyn_ldts = (cldt_t *)ldtalloc; + } else { + do_ldt_free = TRUE; + } + simple_unlock(&ncpus_lock); + + if (do_ldt_free) { + kmem_free(kernel_map, ldtalloc, ldtallocsz); + } else { + /* CPU registration and startup are expected to execute + * serially, as invoked by the platform driver. + * Create trampoline alias of LDT region. + */ + sldtoffset = dyn_dblmap(ldtalloc, ldtallocsz); + ldt_alias_offset = sldtoffset; + } } + cdp->cpu_ldtp = &dyn_ldts[cnum].pcldts[0]; #if CONFIG_MCA /* Machine-check shadow register allocation. */ mca_cpu_alloc(cdp); #endif - simple_lock(&ncpus_lock); - - cpu_data_ptr[real_ncpus] = cdp; - cdp->cpu_number = real_ncpus; - real_ncpus++; - simple_unlock(&ncpus_lock); + /* + * Before this cpu has been assigned a real thread context, + * we give it a fake, unique, non-zero thread id which the locking + * primitives use as their lock value. + * Note that this does not apply to the boot processor, cpu 0, which + * transitions to a thread context well before other processors are + * started. + */ + cdp->cpu_active_thread = (thread_t) (uintptr_t) cdp->cpu_number; cdp->cpu_nanotime = &pal_rtc_nanotime_info; @@ -865,20 +654,10 @@ cpu_data_alloc(boolean_t is_boot_cpu) "int_stack: 0x%lx-0x%lx\n", cdp->cpu_number, cdp, cdp->cpu_desc_tablep, cdp->cpu_ldtp, (long)(cdp->cpu_int_stack_top - INTSTACK_SIZE), (long)(cdp->cpu_int_stack_top)); + cpu_data_ptr[cnum] = cdp; return cdp; -abort: - if (cdp) { - if (cdp->cpu_desc_tablep) - kfree((void *) cdp->cpu_desc_tablep, - sizeof(*cdp->cpu_desc_tablep)); - if (cdp->cpu_int_stack_top) - kfree((void *) (cdp->cpu_int_stack_top - INTSTACK_SIZE), - INTSTACK_SIZE); - kfree((void *) cdp, sizeof(*cdp)); - } - return NULL; } boolean_t @@ -895,7 +674,6 @@ valid_user_data_selector(uint16_t selector) if ((gdt_desc_p(selector)->access & ACC_PL_U) == ACC_PL_U) return (TRUE); } - return (FALSE); } @@ -914,6 +692,13 @@ valid_user_code_selector(uint16_t selector) else if (sel.index < GDTSZ && sel.rpl == USER_PRIV) { if ((gdt_desc_p(selector)->access & ACC_PL_U) == ACC_PL_U) return (TRUE); + /* Explicitly validate the system code selectors + * even if not instantaneously privileged, + * since they are dynamically re-privileged + * at context switch + */ + if ((selector == USER_CS) || (selector == USER64_CS)) + return (TRUE); } return (FALSE); @@ -976,7 +761,7 @@ cpu_userwindow_init(int cpu) if (vm_allocate(kernel_map, &vaddr, (NBPDE * NCOPY_WINDOWS * num_cpus) + NBPDE, - VM_FLAGS_ANYWHERE) != KERN_SUCCESS) + VM_FLAGS_ANYWHERE | VM_MAKE_TAG(VM_KERN_MEMORY_CPU)) != KERN_SUCCESS) panic("cpu_userwindow_init: " "couldn't allocate user map window"); @@ -1013,13 +798,6 @@ cpu_userwindow_init(int cpu) */ cdp->cpu_copywindow_pdp = pmap_pde(kernel_pmap, user_window); -#ifdef __i386__ - cpu_desc_index_t *cdi = &cdp->cpu_desc_index; - cdi->cdi_gdt.ptr[sel_idx(USER_WINDOW_SEL)] = userwindow_desc_pattern; - cdi->cdi_gdt.ptr[sel_idx(USER_WINDOW_SEL)].offset = user_window; - - fix_desc(&cdi->cdi_gdt.ptr[sel_idx(USER_WINDOW_SEL)], 1); -#endif /* __i386__ */ } void @@ -1030,7 +808,7 @@ cpu_physwindow_init(int cpu) if (phys_window == 0) { if (vm_allocate(kernel_map, &phys_window, - PAGE_SIZE, VM_FLAGS_ANYWHERE) + PAGE_SIZE, VM_FLAGS_ANYWHERE | VM_MAKE_TAG(VM_KERN_MEMORY_CPU)) != KERN_SUCCESS) panic("cpu_physwindow_init: " "couldn't allocate phys map window"); @@ -1040,40 +818,61 @@ cpu_physwindow_init(int cpu) * pte pointer we're interested in actually * exists in the page table */ - pmap_expand(kernel_pmap, phys_window); + pmap_expand(kernel_pmap, phys_window, PMAP_EXPAND_OPTIONS_NONE); cdp->cpu_physwindow_base = phys_window; cdp->cpu_physwindow_ptep = vtopte(phys_window); } -#ifdef __i386__ - cpu_desc_index_t *cdi = &cdp->cpu_desc_index; - cdi->cdi_gdt.ptr[sel_idx(PHYS_WINDOW_SEL)] = physwindow_desc_pattern; - cdi->cdi_gdt.ptr[sel_idx(PHYS_WINDOW_SEL)].offset = phys_window; - - fix_desc(&cdi->cdi_gdt.ptr[sel_idx(PHYS_WINDOW_SEL)], 1); -#endif /* __i386__ */ } #endif /* NCOPY_WINDOWS > 0 */ /* - * Load the segment descriptor tables for the current processor. + * Allocate a new interrupt stack for the boot processor from the + * heap rather than continue to use the statically allocated space. + * Also switch to a dynamically allocated cpu data area. */ void -cpu_mode_init(cpu_data_t *cdp) +cpu_data_realloc(void) { -#ifdef __i386__ - if (cdp->cpu_is64bit) { - cpu_IA32e_enable(cdp); - cpu_desc_load64(cdp); - fast_syscall_init64(cdp); - } else { - fast_syscall_init(cdp); + int ret; + vm_offset_t istk; + cpu_data_t *cdp; + boolean_t istate; + + ret = kmem_alloc(kernel_map, &istk, INTSTACK_SIZE, VM_KERN_MEMORY_CPU); + if (ret != KERN_SUCCESS) { + panic("cpu_data_realloc() stack alloc, ret=%d\n", ret); } -#else - fast_syscall_init64(cdp); -#endif + bzero((void*) istk, INTSTACK_SIZE); + istk += INTSTACK_SIZE; - /* Call for per-cpu pmap mode initialization */ - pmap_cpu_init(); -} + cdp = &scdatas[0]; + + /* Copy old contents into new area and make fix-ups */ + assert(cpu_number() == 0); + bcopy((void *) cpu_data_ptr[0], (void*) cdp, sizeof(cpu_data_t)); + cdp->cpu_this = cdp; + cdp->cpu_int_stack_top = istk; + timer_call_queue_init(&cdp->rtclock_timer.queue); + cdp->cpu_desc_tablep = (struct cpu_desc_table *) &scdtables[0]; + cpu_desc_table64_t *cdt = (cpu_desc_table64_t *) cdp->cpu_desc_tablep; + uint8_t *cfstk = &scfstks[cdp->cpu_number].fstk[0]; + cdt->fstkp = cfstk; + cfstk += FSTK_SZ; + + /* + * With interrupts disabled commmit the new areas. + */ + istate = ml_set_interrupts_enabled(FALSE); + cpu_data_ptr[0] = cdp; + master_ktss64.ist2 = DBLMAP((uintptr_t) cfstk); + master_ktss64.ist1 = DBLMAP((uintptr_t) cfstk - sizeof(x86_64_intr_stack_frame_t)); + wrmsr64(MSR_IA32_GS_BASE, (uintptr_t) cdp); + wrmsr64(MSR_IA32_KERNEL_GS_BASE, (uintptr_t) cdp); + (void) ml_set_interrupts_enabled(istate); + + kprintf("Reallocated master cpu data: %p," + " interrupt stack: %p, fault stack: %p\n", + (void *) cdp, (void *) istk, (void *) cfstk); +}