X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/c6bf4f310a33a9262d455ea4d3f0630b1255e3fe..HEAD:/osfmk/arm64/machine_routines.c diff --git a/osfmk/arm64/machine_routines.c b/osfmk/arm64/machine_routines.c index 037f34c13..3b616122b 100644 --- a/osfmk/arm64/machine_routines.c +++ b/osfmk/arm64/machine_routines.c @@ -42,83 +42,133 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include #include +#include +#include +#include #include #include #include #include +#if HIBERNATION +#include +#endif /* HIBERNATION */ #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) -#include +#include #endif #include +/** + * On supported hardware, debuggable builds make the HID bits read-only + * without locking them. This lets people manually modify HID bits while + * debugging, since they can use a debugging tool to first reset the HID + * bits back to read/write. However it will still catch xnu changes that + * accidentally write to HID bits after they've been made read-only. + */ +#if HAS_TWO_STAGE_SPR_LOCK && !(DEVELOPMENT || DEBUG) +#define USE_TWO_STAGE_SPR_LOCK +#endif + #if KPC #include #endif +#define MPIDR_CPU_ID(mpidr_el1_val) (((mpidr_el1_val) & MPIDR_AFF0_MASK) >> MPIDR_AFF0_SHIFT) +#define MPIDR_CLUSTER_ID(mpidr_el1_val) (((mpidr_el1_val) & MPIDR_AFF1_MASK) >> MPIDR_AFF1_SHIFT) + #if HAS_CLUSTER static uint8_t cluster_initialized = 0; #endif - -static int max_cpus_initialized = 0; -#define MAX_CPUS_SET 0x1 -#define MAX_CPUS_WAIT 0x2 - uint32_t LockTimeOut; uint32_t LockTimeOutUsec; uint64_t TLockTimeOut; uint64_t MutexSpin; -boolean_t is_clock_configured = FALSE; +uint64_t low_MutexSpin; +int64_t high_MutexSpin; -uint32_t yield_delay_us = 0; /* Must be less than cpu_idle_latency to ensure ml_delay_should_spin is true */ +static uint64_t ml_wfe_hint_max_interval; +#define MAX_WFE_HINT_INTERVAL_US (500ULL) -#if CONFIG_NONFATAL_ASSERTS -extern int mach_assert; -#endif -extern volatile uint32_t debug_enabled; +/* Must be less than cpu_idle_latency to ensure ml_delay_should_spin is true */ +TUNABLE(uint32_t, yield_delay_us, "yield_delay_us", 0); extern vm_offset_t segLOWEST; extern vm_offset_t segLOWESTTEXT; extern vm_offset_t segLASTB; extern unsigned long segSizeLAST; +/* ARM64 specific bounds; used to test for presence in the kernelcache. */ +extern vm_offset_t vm_kernelcache_base; +extern vm_offset_t vm_kernelcache_top; + #if defined(HAS_IPI) unsigned int gFastIPI = 1; #define kDeferredIPITimerDefault (64 * NSEC_PER_USEC) /* in nanoseconds */ -static uint64_t deferred_ipi_timer_ns = kDeferredIPITimerDefault; +static TUNABLE_WRITEABLE(uint64_t, deferred_ipi_timer_ns, "fastipitimeout", + kDeferredIPITimerDefault); #endif /* defined(HAS_IPI) */ -void machine_conf(void); - thread_t Idle_context(void); -SECURITY_READ_ONLY_LATE(static uint32_t) cpu_phys_ids[MAX_CPUS] = {[0 ... 
MAX_CPUS - 1] = (uint32_t)-1}; -SECURITY_READ_ONLY_LATE(static unsigned int) avail_cpus = 0; -SECURITY_READ_ONLY_LATE(static int) boot_cpu = -1; -SECURITY_READ_ONLY_LATE(static int) max_cpu_number = 0; -SECURITY_READ_ONLY_LATE(cluster_type_t) boot_cluster = CLUSTER_TYPE_SMP; +SECURITY_READ_ONLY_LATE(static ml_topology_cpu_t) topology_cpu_array[MAX_CPUS]; +SECURITY_READ_ONLY_LATE(static ml_topology_cluster_t) topology_cluster_array[MAX_CPU_CLUSTERS]; +SECURITY_READ_ONLY_LATE(static ml_topology_info_t) topology_info = { + .version = CPU_TOPOLOGY_VERSION, + .cpus = topology_cpu_array, + .clusters = topology_cluster_array, +}; +/** + * Represents the offset of each cluster within a hypothetical array of MAX_CPUS + * entries of an arbitrary data type. This is intended for use by specialized consumers + * that must quickly access per-CPU data using only the physical CPU ID (MPIDR_EL1), + * as follows: + * hypothetical_array[cluster_offsets[AFF1] + AFF0] + * Most consumers should instead use general-purpose facilities such as PERCPU or + * ml_get_cpu_number(). + */ +SECURITY_READ_ONLY_LATE(int64_t) cluster_offsets[MAX_CPU_CLUSTER_PHY_ID + 1]; + +SECURITY_READ_ONLY_LATE(static uint32_t) arm64_eventi = UINT32_MAX; -SECURITY_READ_ONLY_LATE(static uint32_t) fiq_eventi = UINT32_MAX; +extern uint32_t lockdown_done; -lockdown_handler_t lockdown_handler; -void *lockdown_this; -lck_mtx_t lockdown_handler_lck; -lck_grp_t *lockdown_handler_grp; -int lockdown_done; +/** + * Represents regions of virtual address space that should be reserved + * (pre-mapped) in each user address space. + */ +SECURITY_READ_ONLY_LATE(static struct vm_reserved_region) vm_reserved_regions[] = { + { + .vmrr_name = "GPU Carveout", + .vmrr_addr = MACH_VM_MIN_GPU_CARVEOUT_ADDRESS, + .vmrr_size = (vm_map_size_t)(MACH_VM_MAX_GPU_CARVEOUT_ADDRESS - MACH_VM_MIN_GPU_CARVEOUT_ADDRESS) + }, + /* + * Reserve the virtual memory space representing the commpage nesting region + * to prevent user processes from allocating memory within it. The actual + * page table entries for the commpage are inserted by vm_commpage_enter(). + * This vm_map_enter() just prevents userspace from allocating/deallocating + * anything within the entire commpage nested region. + */ + { + .vmrr_name = "commpage nesting", + .vmrr_addr = _COMM_PAGE64_NESTING_START, + .vmrr_size = _COMM_PAGE64_NESTING_SIZE + } +}; -void ml_lockdown_init(void); -void ml_lockdown_run_handler(void); uint32_t get_arm_cpu_version(void); #if defined(HAS_IPI) @@ -132,17 +182,17 @@ ml_cpu_signal_type(unsigned int cpu_mpidr, uint32_t type) * to a single CPU. Otherwise we may migrate between choosing which * IPI mechanism to use and issuing the IPI. 
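(Illustrative aside, not part of the diff.) The new MPIDR_CPU_ID/MPIDR_CLUSTER_ID helpers and the local-versus-global IPI choice reduce to simple field extraction from MPIDR_EL1. The standalone sketch below assumes the architectural affinity layout (AFF0 in bits 7:0, AFF1 in bits 15:8) and uses made-up MPIDR values; it is not kernel code.

#include <stdint.h>
#include <stdio.h>

#define AFF0_MASK  0x00ffULL
#define AFF1_MASK  0xff00ULL
#define CPU_ID(m)      (((m) & AFF0_MASK))
#define CLUSTER_ID(m)  (((m) & AFF1_MASK) >> 8)

int
main(void)
{
    uint64_t local_mpidr  = 0x0102;  /* cluster 1, core 2 (hypothetical) */
    uint64_t target_mpidr = 0x0100;  /* cluster 1, core 0 (hypothetical) */

    if (CLUSTER_ID(local_mpidr) == CLUSTER_ID(target_mpidr)) {
        /* Same cluster: only the core number goes into the "local" IPI register. */
        printf("local IPI, payload core id = %llu\n",
            (unsigned long long)CPU_ID(target_mpidr));
    } else {
        /* Cross-cluster: the cluster id is shifted into bits 23:16 of the payload
         * (IPI_RR_TARGET_CLUSTER_SHIFT == 16 in the diff). */
        uint64_t payload = (CLUSTER_ID(target_mpidr) << 16) | CPU_ID(target_mpidr);
        printf("global IPI, payload = 0x%llx\n", (unsigned long long)payload);
    }
    return 0;
}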
*/ MRS(local_mpidr, "MPIDR_EL1"); - if ((local_mpidr & MPIDR_AFF1_MASK) == (cpu_mpidr & MPIDR_AFF1_MASK)) { - uint64_t x = type | (cpu_mpidr & MPIDR_AFF0_MASK); - MSR(ARM64_REG_IPI_RR_LOCAL, x); + if (MPIDR_CLUSTER_ID(local_mpidr) == MPIDR_CLUSTER_ID(cpu_mpidr)) { + uint64_t x = type | MPIDR_CPU_ID(cpu_mpidr); + MSR("S3_5_C15_C0_0", x); } else { #define IPI_RR_TARGET_CLUSTER_SHIFT 16 - uint64_t x = type | ((cpu_mpidr & MPIDR_AFF1_MASK) << (IPI_RR_TARGET_CLUSTER_SHIFT - MPIDR_AFF1_SHIFT)) | (cpu_mpidr & MPIDR_AFF0_MASK); - MSR(ARM64_REG_IPI_RR_GLOBAL, x); + uint64_t x = type | (MPIDR_CLUSTER_ID(cpu_mpidr) << IPI_RR_TARGET_CLUSTER_SHIFT) | MPIDR_CPU_ID(cpu_mpidr); + MSR("S3_5_C15_C0_1", x); } #else - uint64_t x = type | (cpu_mpidr & MPIDR_AFF0_MASK); - MSR(ARM64_REG_IPI_RR, x); + uint64_t x = type | MPIDR_CPU_ID(cpu_mpidr); + MSR("S3_5_C15_C0_1", x); #endif } #endif @@ -186,7 +236,7 @@ ml_cpu_signal_deferred_adjust_timer(uint64_t nanosecs) /* update deferred_ipi_timer_ns with the new clamped value */ absolutetime_to_nanoseconds(abstime, &deferred_ipi_timer_ns); - MSR(ARM64_REG_IPI_CR, abstime); + MSR("S3_5_C15_C3_1", abstime); #else (void)nanosecs; panic("Platform does not support ACC Fast IPI"); @@ -232,23 +282,14 @@ ml_cpu_signal_retract(unsigned int cpu_mpidr __unused) void machine_idle(void) { - __builtin_arm_wsr("DAIFSet", (DAIFSC_IRQF | DAIFSC_FIQF)); + /* Interrupts are expected to be masked on entry or re-entry via + * Idle_load_context() + */ + assert((__builtin_arm_rsr("DAIF") & DAIF_IRQF) == DAIF_IRQF); Idle_context(); __builtin_arm_wsr("DAIFClr", (DAIFSC_IRQF | DAIFSC_FIQF)); } -void -init_vfp(void) -{ - return; -} - -boolean_t -get_vfp_enabled(void) -{ - return TRUE; -} - void OSSynchronizeIO(void) { @@ -312,10 +353,21 @@ get_arm_cpu_version(void) return ((value & MIDR_EL1_REV_MASK) >> MIDR_EL1_REV_SHIFT) | ((value & MIDR_EL1_VAR_MASK) >> (MIDR_EL1_VAR_SHIFT - 4)); } +bool +ml_feature_supported(uint32_t feature_bit) +{ + uint64_t aidr_el1_value = 0; + + MRS(aidr_el1_value, "AIDR_EL1"); + + + return aidr_el1_value & feature_bit; +} + /* * user_cont_hwclock_allowed() * - * Indicates whether we allow EL0 to read the physical timebase (CNTPCT_EL0) + * Indicates whether we allow EL0 to read the virtual timebase (CNTVCT_EL0) * as a continuous time source (e.g. from mach_continuous_time) */ boolean_t @@ -335,331 +387,15 @@ user_timebase_type(void) return USER_TIMEBASE_SPEC; } -boolean_t -arm64_wfe_allowed(void) -{ - return TRUE; -} - -#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) - -uint64_t rorgn_begin __attribute__((section("__DATA, __const"))) = 0; -uint64_t rorgn_end __attribute__((section("__DATA, __const"))) = 0; -vm_offset_t amcc_base; - -static void assert_unlocked(void); -static void assert_amcc_cache_disabled(void); -static void lock_amcc(void); -static void lock_mmu(uint64_t begin, uint64_t end); - -void -rorgn_stash_range(void) -{ -#if DEVELOPMENT || DEBUG - boolean_t rorgn_disable = FALSE; - - PE_parse_boot_argn("-unsafe_kernel_text", &rorgn_disable, sizeof(rorgn_disable)); - - if (rorgn_disable) { - /* take early out if boot arg present, don't query any machine registers to avoid - * dependency on amcc DT entry - */ - return; - } -#endif - - /* Get the AMC values, and stash them into rorgn_begin, rorgn_end. - * gPhysBase is the base of DRAM managed by xnu. 
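(Illustrative aside, not part of the diff.) The removed rorgn_stash_range() converts the AMCC RO-region registers, which hold 16KB page numbers relative to DRAM_BASE, back into physical addresses. The standalone sketch below assumes AMCC_PGSHIFT is 14 (16KB pages) and uses a made-up register value.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint64_t gPhysBase = 0x801234000ULL;              /* hypothetical kernel DRAM base */
    uint64_t dram_base = gPhysBase & ~0x1FFFFFFFULL;  /* truncate to a 512MB granule */
    uint32_t rorgn_base_pages = 0x100;                /* hypothetical register value */
    unsigned amcc_pgshift = 14;                       /* 16KB AMCC page size (assumed) */

    /* PA = (page number from DRAM_BASE) << AMCC_PGSHIFT, plus DRAM_BASE itself. */
    uint64_t rorgn_begin_pa = ((uint64_t)rorgn_base_pages << amcc_pgshift) + dram_base;

    printf("dram_base = 0x%llx, rorgn_begin = 0x%llx\n",
        (unsigned long long)dram_base, (unsigned long long)rorgn_begin_pa);
    return 0;
}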
we need DRAM_BASE as - * the AMCC RO region begin/end registers are in units of 16KB page - * numbers from DRAM_BASE so we'll truncate gPhysBase at 512MB granule - * and assert the value is the canonical DRAM_BASE PA of 0x8_0000_0000 for arm64. - */ - - uint64_t dram_base = gPhysBase & ~0x1FFFFFFFULL; /* 512MB */ - assert(dram_base == 0x800000000ULL); - -#if defined(KERNEL_INTEGRITY_KTRR) - uint64_t soc_base = 0; - DTEntry entryP = NULL; - uintptr_t *reg_prop = NULL; - uint32_t prop_size = 0; - int rc; - - soc_base = pe_arm_get_soc_base_phys(); - rc = DTFindEntry("name", "mcc", &entryP); - assert(rc == kSuccess); - rc = DTGetProperty(entryP, "reg", (void **)®_prop, &prop_size); - assert(rc == kSuccess); - amcc_base = ml_io_map(soc_base + *reg_prop, *(reg_prop + 1)); -#elif defined(KERNEL_INTEGRITY_CTRR) - /* TODO: t8020 mcc entry not in device tree yet; we'll do it LIVE */ -#define TEMP_AMCC_BASE_PA 0x200000000ULL -#define TEMP_AMCC_SZ 0x100000 - amcc_base = ml_io_map(TEMP_AMCC_BASE_PA, TEMP_AMCC_SZ); -#else -#error "KERNEL_INTEGRITY config error" -#endif - -#if defined(KERNEL_INTEGRITY_KTRR) - assert(rRORGNENDADDR > rRORGNBASEADDR); - rorgn_begin = (rRORGNBASEADDR << AMCC_PGSHIFT) + dram_base; - rorgn_end = (rRORGNENDADDR << AMCC_PGSHIFT) + dram_base; -#elif defined(KERNEL_INTEGRITY_CTRR) - rorgn_begin = rCTRR_AMCC_PLANE_REG(0, CTRR_A_BASEADDR); - rorgn_end = rCTRR_AMCC_PLANE_REG(0, CTRR_A_ENDADDR); - assert(rorgn_end > rorgn_begin); - - for (int i = 0; i < CTRR_AMCC_MAX_PLANES; ++i) { - uint32_t begin = rCTRR_AMCC_PLANE_REG(i, CTRR_A_BASEADDR); - uint32_t end = rCTRR_AMCC_PLANE_REG(i, CTRR_A_ENDADDR); - if (!(begin == rorgn_begin && end == rorgn_end)) { -#if DEVELOPMENT || DEBUG - panic("iboot programmed CTRR bounds are inconsistent"); -#else - panic("Inconsistent memory configuration"); -#endif - } - } - - // convert from page number from DRAM base to PA - rorgn_begin = (rorgn_begin << AMCC_PGSHIFT) + dram_base; - rorgn_end = (rorgn_end << AMCC_PGSHIFT) + dram_base; - -#else -#error KERNEL_INTEGRITY config error -#endif /* defined (KERNEL_INTEGRITY_KTRR) */ -} - -static void -assert_unlocked() -{ - uint64_t ktrr_lock = 0; - uint32_t rorgn_lock = 0; - - assert(amcc_base); -#if defined(KERNEL_INTEGRITY_KTRR) - rorgn_lock = rRORGNLOCK; - ktrr_lock = __builtin_arm_rsr64(ARM64_REG_KTRR_LOCK_EL1); -#elif defined(KERNEL_INTEGRITY_CTRR) - for (int i = 0; i < CTRR_AMCC_MAX_PLANES; ++i) { - rorgn_lock |= rCTRR_AMCC_PLANE_REG(i, CTRR_A_LOCK); - } - ktrr_lock = __builtin_arm_rsr64(ARM64_REG_CTRR_LOCK_EL1); -#else -#error KERNEL_INTEGRITY config error -#endif /* defined(KERNEL_INTEGRITY_KTRR) */ - - assert(!ktrr_lock); - assert(!rorgn_lock); -} - -static void -lock_amcc() -{ -#if defined(KERNEL_INTEGRITY_KTRR) - rRORGNLOCK = 1; - __builtin_arm_isb(ISB_SY); -#elif defined(KERNEL_INTEGRITY_CTRR) - /* lockdown planes in reverse order as plane 0 should be locked last */ - for (int i = 0; i < CTRR_AMCC_MAX_PLANES; ++i) { - rCTRR_AMCC_PLANE_REG(CTRR_AMCC_MAX_PLANES - i - 1, CTRR_A_ENABLE) = 1; - rCTRR_AMCC_PLANE_REG(CTRR_AMCC_MAX_PLANES - i - 1, CTRR_A_LOCK) = 1; - __builtin_arm_isb(ISB_SY); - } -#else -#error KERNEL_INTEGRITY config error -#endif -} - -static void -lock_mmu(uint64_t begin, uint64_t end) -{ -#if defined(KERNEL_INTEGRITY_KTRR) - - __builtin_arm_wsr64(ARM64_REG_KTRR_LOWER_EL1, begin); - __builtin_arm_wsr64(ARM64_REG_KTRR_UPPER_EL1, end); - __builtin_arm_wsr64(ARM64_REG_KTRR_LOCK_EL1, 1ULL); - - /* flush TLB */ - - __builtin_arm_isb(ISB_SY); - flush_mmu_tlb(); - -#elif defined 
(KERNEL_INTEGRITY_CTRR) - /* this will lock the entire bootstrap cluster. non bootstrap clusters - * will be locked by respective cluster master in start.s */ - - __builtin_arm_wsr64(ARM64_REG_CTRR_A_LWR_EL1, begin); - __builtin_arm_wsr64(ARM64_REG_CTRR_A_UPR_EL1, end); - -#if !defined(APPLEVORTEX) - /* H12 changed sequence, must invalidate TLB immediately after setting CTRR bounds */ - __builtin_arm_isb(ISB_SY); /* ensure all prior MSRs are complete */ - flush_mmu_tlb(); -#endif /* !defined(APPLEVORTEX) */ - - __builtin_arm_wsr64(ARM64_REG_CTRR_CTL_EL1, CTRR_CTL_EL1_A_PXN | CTRR_CTL_EL1_A_MMUON_WRPROTECT); - __builtin_arm_wsr64(ARM64_REG_CTRR_LOCK_EL1, 1ULL); - - uint64_t current_el = __builtin_arm_rsr64("CurrentEL"); - if (current_el == PSR64_MODE_EL2) { - // CTRR v2 has explicit registers for cluster config. they can only be written in EL2 - - __builtin_arm_wsr64(ACC_CTRR_A_LWR_EL2, begin); - __builtin_arm_wsr64(ACC_CTRR_A_UPR_EL2, end); - __builtin_arm_wsr64(ACC_CTRR_CTL_EL2, CTRR_CTL_EL1_A_PXN | CTRR_CTL_EL1_A_MMUON_WRPROTECT); - __builtin_arm_wsr64(ACC_CTRR_LOCK_EL2, 1ULL); - } - - __builtin_arm_isb(ISB_SY); /* ensure all prior MSRs are complete */ -#if defined(APPLEVORTEX) - flush_mmu_tlb(); -#endif /* defined(APPLEVORTEX) */ - -#else /* defined(KERNEL_INTEGRITY_KTRR) */ -#error KERNEL_INTEGRITY config error -#endif /* defined(KERNEL_INTEGRITY_KTRR) */ -} - -static void -assert_amcc_cache_disabled() -{ -#if defined(KERNEL_INTEGRITY_KTRR) - assert((rMCCGEN & 1) == 0); /* assert M$ disabled or LLC clean will be unreliable */ -#elif defined(KERNEL_INTEGRITY_CTRR) && (defined(ARM64_BOARD_CONFIG_T8006)) - /* - * T8006 differentiates between data and tag ways being powered up, so - * make sure to check that both are zero on its single memory plane. 
- */ - assert((rCTRR_AMCC_PLANE_REG(0, CTRR_AMCC_PWRONWAYCNTSTATUS) & - (AMCC_CURTAGWAYCNT_MASK | AMCC_CURDATWAYCNT_MASK)) == 0); -#elif defined (KERNEL_INTEGRITY_CTRR) - for (int i = 0; i < CTRR_AMCC_MAX_PLANES; ++i) { - assert(rCTRR_AMCC_PLANE_REG(i, CTRR_AMCC_WAYONCNT) == 0); - } -#else -#error KERNEL_INTEGRITY config error -#endif -} - -/* - * void rorgn_lockdown(void) - * - * Lock the MMU and AMCC RORegion within lower and upper boundaries if not already locked - * - * [ ] - ensure this is being called ASAP on secondary CPUs: KTRR programming and lockdown handled in - * start.s:start_cpu() for subsequent wake/resume of all cores - */ -void -rorgn_lockdown(void) -{ - vm_offset_t ktrr_begin, ktrr_end; - unsigned long last_segsz; - -#if DEVELOPMENT || DEBUG - boolean_t ktrr_disable = FALSE; - - PE_parse_boot_argn("-unsafe_kernel_text", &ktrr_disable, sizeof(ktrr_disable)); - - if (ktrr_disable) { - /* - * take early out if boot arg present, since we may not have amcc DT entry present - * we can't assert that iboot hasn't programmed the RO region lockdown registers - */ - goto out; - } -#endif /* DEVELOPMENT || DEBUG */ - - assert_unlocked(); - - /* [x] - Use final method of determining all kernel text range or expect crashes */ - ktrr_begin = segLOWEST; - assert(ktrr_begin && gVirtBase && gPhysBase); - - ktrr_begin = kvtophys(ktrr_begin); - - ktrr_end = kvtophys(segLASTB); - last_segsz = segSizeLAST; -#if defined(KERNEL_INTEGRITY_KTRR) - /* __LAST is not part of the MMU KTRR region (it is however part of the AMCC KTRR region) */ - ktrr_end = (ktrr_end - 1) & ~AMCC_PGMASK; - /* ensure that iboot and xnu agree on the ktrr range */ - assert(rorgn_begin == ktrr_begin && rorgn_end == (ktrr_end + last_segsz)); - /* assert that __LAST segment containing privileged insns is only a single page */ - assert(last_segsz == PAGE_SIZE); -#elif defined(KERNEL_INTEGRITY_CTRR) - ktrr_end = (ktrr_end + last_segsz - 1) & ~AMCC_PGMASK; - /* __LAST is part of MMU CTRR region. Can't use the KTRR style method of making - * __pinst no execute because PXN applies with MMU off in CTRR. 
*/ - assert(rorgn_begin == ktrr_begin && rorgn_end == ktrr_end); -#endif - - -#if DEBUG || DEVELOPMENT - printf("KTRR Begin: %p End: %p, setting lockdown\n", (void *)ktrr_begin, (void *)ktrr_end); -#endif - - /* [x] - ensure all in flight writes are flushed to AMCC before enabling RO Region Lock */ - - assert_amcc_cache_disabled(); - - CleanPoC_DcacheRegion_Force(phystokv(ktrr_begin), - (unsigned)((ktrr_end + last_segsz) - ktrr_begin + AMCC_PGMASK)); - - lock_amcc(); - - lock_mmu(ktrr_begin, ktrr_end); - -#if DEVELOPMENT || DEBUG -out: -#endif - -#if defined(KERNEL_INTEGRITY_CTRR) - { - /* wake any threads blocked on cluster master lockdown */ - cpu_data_t *cdp; - uint64_t mpidr_el1_value; - - cdp = getCpuDatap(); - MRS(mpidr_el1_value, "MPIDR_EL1"); - cdp->cpu_cluster_id = (mpidr_el1_value & MPIDR_AFF1_MASK) >> MPIDR_AFF1_SHIFT; - assert(cdp->cpu_cluster_id < __ARM_CLUSTER_COUNT__); - ctrr_cluster_locked[cdp->cpu_cluster_id] = 1; - thread_wakeup(&ctrr_cluster_locked[cdp->cpu_cluster_id]); - } -#endif - /* now we can run lockdown handler */ - ml_lockdown_run_handler(); -} - -#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */ - void machine_startup(__unused boot_args * args) { - int boot_arg; - #if defined(HAS_IPI) && (DEVELOPMENT || DEBUG) if (!PE_parse_boot_argn("fastipi", &gFastIPI, sizeof(gFastIPI))) { gFastIPI = 1; } - - PE_parse_boot_argn("fastipitimeout", &deferred_ipi_timer_ns, sizeof(deferred_ipi_timer_ns)); #endif /* defined(HAS_IPI) && (DEVELOPMENT || DEBUG)*/ -#if CONFIG_NONFATAL_ASSERTS - PE_parse_boot_argn("assert", &mach_assert, sizeof(mach_assert)); -#endif - - if (PE_parse_boot_argn("preempt", &boot_arg, sizeof(boot_arg))) { - default_preemption_rate = boot_arg; - } - if (PE_parse_boot_argn("bg_preempt", &boot_arg, sizeof(boot_arg))) { - default_bg_preemption_rate = boot_arg; - } - - PE_parse_boot_argn("yield_delay_us", &yield_delay_us, sizeof(yield_delay_us)); - machine_conf(); /* @@ -669,21 +405,27 @@ machine_startup(__unused boot_args * args) /* NOTREACHED */ } +typedef void (*invalidate_fn_t)(void); + +static SECURITY_READ_ONLY_LATE(invalidate_fn_t) invalidate_hmac_function = NULL; + +void set_invalidate_hmac_function(invalidate_fn_t fn); + void -machine_lockdown_preflight(void) +set_invalidate_hmac_function(invalidate_fn_t fn) { -#if CONFIG_KERNEL_INTEGRITY - -#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) - rorgn_stash_range(); -#endif + if (NULL != invalidate_hmac_function) { + panic("Invalidate HMAC function already set"); + } -#endif + invalidate_hmac_function = fn; } void machine_lockdown(void) { + arm_vm_prot_finalize(PE_state.bootArgs); + #if CONFIG_KERNEL_INTEGRITY #if KERNEL_INTEGRITY_WT /* Watchtower @@ -714,8 +456,16 @@ machine_lockdown(void) #endif /* CONFIG_KERNEL_INTEGRITY */ + + + if (NULL != invalidate_hmac_function) { + invalidate_hmac_function(); + } + + lockdown_done = 1; } + char * machine_boot_info( __unused char *buf, @@ -724,26 +474,6 @@ machine_boot_info( return PE_boot_args(); } -void -machine_conf(void) -{ - /* - * This is known to be inaccurate. 
mem_size should always be capped at 2 GB - */ - machine_info.memory_size = (uint32_t)mem_size; -} - -void -machine_init(void) -{ - debug_log_init(); - clock_config(); - is_clock_configured = TRUE; - if (debug_enabled) { - pmap_map_globals(); - } -} - void slave_machine_init(__unused void *param) { @@ -764,46 +494,6 @@ machine_processor_shutdown( return Shutdown_context(doshutdown, processor); } -/* - * Routine: ml_init_max_cpus - * Function: - */ -void -ml_init_max_cpus(unsigned int max_cpus) -{ - boolean_t current_state; - - current_state = ml_set_interrupts_enabled(FALSE); - if (max_cpus_initialized != MAX_CPUS_SET) { - machine_info.max_cpus = max_cpus; - machine_info.physical_cpu_max = max_cpus; - machine_info.logical_cpu_max = max_cpus; - if (max_cpus_initialized == MAX_CPUS_WAIT) { - thread_wakeup((event_t) &max_cpus_initialized); - } - max_cpus_initialized = MAX_CPUS_SET; - } - (void) ml_set_interrupts_enabled(current_state); -} - -/* - * Routine: ml_get_max_cpus - * Function: - */ -unsigned int -ml_get_max_cpus(void) -{ - boolean_t current_state; - - current_state = ml_set_interrupts_enabled(FALSE); - if (max_cpus_initialized != MAX_CPUS_SET) { - max_cpus_initialized = MAX_CPUS_WAIT; - assert_wait((event_t) &max_cpus_initialized, THREAD_UNINT); - (void) thread_block(THREAD_CONTINUE_NULL); - } - (void) ml_set_interrupts_enabled(current_state); - return machine_info.max_cpus; -} /* * Routine: ml_init_lock_timeout @@ -841,6 +531,30 @@ ml_init_lock_timeout(void) nanoseconds_to_absolutetime(10 * NSEC_PER_USEC, &abstime); } MutexSpin = abstime; + low_MutexSpin = MutexSpin; + + + /* + * high_MutexSpin should be initialized as low_MutexSpin * real_ncpus, but + * real_ncpus is not set at this time + * + * NOTE: active spinning is disabled in arm. It can be activated + * by setting high_MutexSpin through the sysctl. + */ + high_MutexSpin = low_MutexSpin; + + nanoseconds_to_absolutetime(MAX_WFE_HINT_INTERVAL_US * NSEC_PER_USEC, &ml_wfe_hint_max_interval); +} + +/* + * This is called when all of the ml_processor_info_t structures have been + * initialized and all the processors have been started through processor_start(). + * + * Required by the scheduler subsystem. 
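(Illustrative aside, not part of the diff.) The nanoseconds_to_absolutetime() calls in ml_init_lock_timeout() reduce to abstime = ns * timebase_hz / NSEC_PER_SEC. The standalone sketch below assumes a 24 MHz timebase purely for the arithmetic; the real frequency comes from gPEClockFrequencyInfo.

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC  1000000000ULL
#define NSEC_PER_USEC 1000ULL

static uint64_t
ns_to_abstime(uint64_t ns, uint64_t timebase_hz)
{
    return ns * timebase_hz / NSEC_PER_SEC;
}

int
main(void)
{
    uint64_t timebase_hz = 24000000ULL;  /* assumed 24 MHz timebase */

    /* The 10us MutexSpin default and the 500us WFE hint cap, in timebase ticks. */
    printf("MutexSpin    = %llu ticks\n",
        (unsigned long long)ns_to_abstime(10 * NSEC_PER_USEC, timebase_hz));
    printf("wfe_hint_max = %llu ticks\n",
        (unsigned long long)ns_to_abstime(500 * NSEC_PER_USEC, timebase_hz));
    return 0;
}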
+ */ +void +ml_cpu_init_completed(void) +{ } /* @@ -999,10 +713,7 @@ ml_install_interrupt_handler( cpu_data_ptr->interrupt_handler = handler; cpu_data_ptr->interrupt_refCon = refCon; - cpu_data_ptr->interrupts_enabled = TRUE; (void) ml_set_interrupts_enabled(current_state); - - initialize_screen(NULL, kPEAcquireScreen); } /* @@ -1046,6 +757,85 @@ ml_init_timebase( } } +#define ML_READPROP_MANDATORY UINT64_MAX + +static uint64_t +ml_readprop(const DTEntry entry, const char *propertyName, uint64_t default_value) +{ + void const *prop; + unsigned int propSize; + + if (SecureDTGetProperty(entry, propertyName, &prop, &propSize) == kSuccess) { + if (propSize == sizeof(uint8_t)) { + return *((uint8_t const *)prop); + } else if (propSize == sizeof(uint16_t)) { + return *((uint16_t const *)prop); + } else if (propSize == sizeof(uint32_t)) { + return *((uint32_t const *)prop); + } else if (propSize == sizeof(uint64_t)) { + return *((uint64_t const *)prop); + } else { + panic("CPU property '%s' has bad size %u", propertyName, propSize); + } + } else { + if (default_value == ML_READPROP_MANDATORY) { + panic("Missing mandatory property '%s'", propertyName); + } + return default_value; + } +} + +static boolean_t +ml_read_reg_range(const DTEntry entry, const char *propertyName, uint64_t *pa_ptr, uint64_t *len_ptr) +{ + uint64_t const *prop; + unsigned int propSize; + + if (SecureDTGetProperty(entry, propertyName, (void const **)&prop, &propSize) != kSuccess) { + return FALSE; + } + + if (propSize != sizeof(uint64_t) * 2) { + panic("Wrong property size for %s", propertyName); + } + + *pa_ptr = prop[0]; + *len_ptr = prop[1]; + return TRUE; +} + +static boolean_t +ml_is_boot_cpu(const DTEntry entry) +{ + void const *prop; + unsigned int propSize; + + if (SecureDTGetProperty(entry, "state", &prop, &propSize) != kSuccess) { + panic("unable to retrieve state for cpu"); + } + + if (strncmp((char const *)prop, "running", propSize) == 0) { + return TRUE; + } else { + return FALSE; + } +} + +static void +ml_read_chip_revision(unsigned int *rev __unused) +{ + // The CPU_VERSION_* macros are only defined on APPLE_ARM64_ARCH_FAMILY builds +#ifdef APPLE_ARM64_ARCH_FAMILY + DTEntry entryP; + + if ((SecureDTFindEntry("name", "arm-io", &entryP) == kSuccess)) { + *rev = (unsigned int)ml_readprop(entryP, "chip-revision", CPU_VERSION_UNKNOWN); + } else { + *rev = CPU_VERSION_UNKNOWN; + } +#endif +} + void ml_parse_cpu_topology(void) { @@ -1054,59 +844,148 @@ ml_parse_cpu_topology(void) uint32_t cpu_boot_arg; int err; + int64_t cluster_phys_to_logical[MAX_CPU_CLUSTER_PHY_ID + 1]; + int64_t cluster_max_cpu_phys_id[MAX_CPU_CLUSTER_PHY_ID + 1]; cpu_boot_arg = MAX_CPUS; - PE_parse_boot_argn("cpus", &cpu_boot_arg, sizeof(cpu_boot_arg)); - err = DTLookupEntry(NULL, "/cpus", &entry); + err = SecureDTLookupEntry(NULL, "/cpus", &entry); assert(err == kSuccess); - err = DTInitEntryIterator(entry, &iter); + err = SecureDTInitEntryIterator(entry, &iter); assert(err == kSuccess); - while (kSuccess == DTIterateEntries(&iter, &child)) { - unsigned int propSize; - void *prop = NULL; - int cpu_id = avail_cpus++; + for (int i = 0; i <= MAX_CPU_CLUSTER_PHY_ID; i++) { + cluster_offsets[i] = -1; + cluster_phys_to_logical[i] = -1; + cluster_max_cpu_phys_id[i] = 0; + } + + while (kSuccess == SecureDTIterateEntries(&iter, &child)) { + boolean_t is_boot_cpu = ml_is_boot_cpu(child); - if (kSuccess == DTGetProperty(child, "cpu-id", &prop, &propSize)) { - cpu_id = *((int32_t*)prop); + // If the number of CPUs is constrained by the cpus= boot-arg, and 
the boot CPU hasn't + // been added to the topology struct yet, and we only have one slot left, then skip + // every other non-boot CPU in order to leave room for the boot CPU. + // + // e.g. if the boot-args say "cpus=3" and CPU4 is the boot CPU, then the cpus[] + // array will list CPU0, CPU1, and CPU4. CPU2-CPU3 and CPU5-CPUn will be omitted. + if (topology_info.num_cpus >= (cpu_boot_arg - 1) && topology_info.boot_cpu == NULL && !is_boot_cpu) { + continue; + } + if (topology_info.num_cpus >= cpu_boot_arg) { + break; } - assert(cpu_id < MAX_CPUS); - assert(cpu_phys_ids[cpu_id] == (uint32_t)-1); + ml_topology_cpu_t *cpu = &topology_info.cpus[topology_info.num_cpus]; - if (boot_cpu == -1) { - if (kSuccess != DTGetProperty(child, "state", &prop, &propSize)) { - panic("unable to retrieve state for cpu %d", cpu_id); - } + cpu->cpu_id = topology_info.num_cpus++; + assert(cpu->cpu_id < MAX_CPUS); + topology_info.max_cpu_id = MAX(topology_info.max_cpu_id, cpu->cpu_id); - if (strncmp((char*)prop, "running", propSize) == 0) { - boot_cpu = cpu_id; - } - } - if (kSuccess != DTGetProperty(child, "reg", &prop, &propSize)) { - panic("unable to retrieve physical ID for cpu %d", cpu_id); + cpu->die_id = (int)ml_readprop(child, "die-id", 0); + topology_info.max_die_id = MAX(topology_info.max_die_id, cpu->die_id); + + cpu->phys_id = (uint32_t)ml_readprop(child, "reg", ML_READPROP_MANDATORY); + + cpu->l2_access_penalty = (uint32_t)ml_readprop(child, "l2-access-penalty", 0); + cpu->l2_cache_size = (uint32_t)ml_readprop(child, "l2-cache-size", 0); + cpu->l2_cache_id = (uint32_t)ml_readprop(child, "l2-cache-id", 0); + cpu->l3_cache_size = (uint32_t)ml_readprop(child, "l3-cache-size", 0); + cpu->l3_cache_id = (uint32_t)ml_readprop(child, "l3-cache-id", 0); + + ml_read_reg_range(child, "cpu-uttdbg-reg", &cpu->cpu_UTTDBG_pa, &cpu->cpu_UTTDBG_len); + ml_read_reg_range(child, "cpu-impl-reg", &cpu->cpu_IMPL_pa, &cpu->cpu_IMPL_len); + ml_read_reg_range(child, "coresight-reg", &cpu->coresight_pa, &cpu->coresight_len); + cpu->cluster_type = CLUSTER_TYPE_SMP; + + int cluster_type = (int)ml_readprop(child, "cluster-type", 0); + if (cluster_type == 'E') { + cpu->cluster_type = CLUSTER_TYPE_E; + } else if (cluster_type == 'P') { + cpu->cluster_type = CLUSTER_TYPE_P; } - cpu_phys_ids[cpu_id] = *((uint32_t*)prop); + /* + * Since we want to keep a linear cluster ID space, we cannot just rely + * on the value provided by EDT. Instead, use the MPIDR value to see if we have + * seen this exact cluster before. If so, then reuse that cluster ID for this CPU. + */ +#if HAS_CLUSTER + uint32_t phys_cluster_id = MPIDR_CLUSTER_ID(cpu->phys_id); +#else + uint32_t phys_cluster_id = (cpu->cluster_type == CLUSTER_TYPE_P); +#endif + assert(phys_cluster_id <= MAX_CPU_CLUSTER_PHY_ID); + cpu->cluster_id = ((cluster_phys_to_logical[phys_cluster_id] == -1) ? 
+ topology_info.num_clusters : cluster_phys_to_logical[phys_cluster_id]); + + assert(cpu->cluster_id < MAX_CPU_CLUSTERS); + + ml_topology_cluster_t *cluster = &topology_info.clusters[cpu->cluster_id]; + if (cluster->num_cpus == 0) { + assert(topology_info.num_clusters < MAX_CPU_CLUSTERS); + + topology_info.num_clusters++; + topology_info.max_cluster_id = MAX(topology_info.max_cluster_id, cpu->cluster_id); + + cluster->cluster_id = cpu->cluster_id; + cluster->cluster_type = cpu->cluster_type; + cluster->first_cpu_id = cpu->cpu_id; + assert(cluster_phys_to_logical[phys_cluster_id] == -1); + cluster_phys_to_logical[phys_cluster_id] = cpu->cluster_id; + + // Since we don't have a per-cluster EDT node, this is repeated in each CPU node. + // If we wind up with a bunch of these, we might want to create separate per-cluster + // EDT nodes and have the CPU nodes reference them through a phandle. + ml_read_reg_range(child, "acc-impl-reg", &cluster->acc_IMPL_pa, &cluster->acc_IMPL_len); + ml_read_reg_range(child, "cpm-impl-reg", &cluster->cpm_IMPL_pa, &cluster->cpm_IMPL_len); + } - if ((cpu_id > max_cpu_number) && ((cpu_id == boot_cpu) || (avail_cpus <= cpu_boot_arg))) { - max_cpu_number = cpu_id; +#if HAS_CLUSTER + if (MPIDR_CPU_ID(cpu->phys_id) > cluster_max_cpu_phys_id[phys_cluster_id]) { + cluster_max_cpu_phys_id[phys_cluster_id] = MPIDR_CPU_ID(cpu->phys_id); } - } +#endif - if (avail_cpus > cpu_boot_arg) { - avail_cpus = cpu_boot_arg; - } + cpu->die_cluster_id = (int)ml_readprop(child, "die-cluster-id", MPIDR_CLUSTER_ID(cpu->phys_id)); + cpu->cluster_core_id = (int)ml_readprop(child, "cluster-core-id", MPIDR_CPU_ID(cpu->phys_id)); + + cluster->num_cpus++; + cluster->cpu_mask |= 1ULL << cpu->cpu_id; - if (avail_cpus == 0) { - panic("No cpus found!"); + if (is_boot_cpu) { + assert(topology_info.boot_cpu == NULL); + topology_info.boot_cpu = cpu; + topology_info.boot_cluster = cluster; + } } - if (boot_cpu == -1) { - panic("unable to determine boot cpu!"); +#if HAS_CLUSTER + /* + * Build the cluster offset array, ensuring that the region reserved + * for each physical cluster contains enough entries to be indexed + * by the maximum physical CPU ID (AFF0) within the cluster. + */ + unsigned int cur_cluster_offset = 0; + for (int i = 0; i <= MAX_CPU_CLUSTER_PHY_ID; i++) { + if (cluster_phys_to_logical[i] != -1) { + cluster_offsets[i] = cur_cluster_offset; + cur_cluster_offset += (cluster_max_cpu_phys_id[i] + 1); + } } + assert(cur_cluster_offset <= MAX_CPUS); +#else + /* + * For H10, there are really 2 physical clusters, but they are not separated + * into distinct ACCs. AFF1 therefore always reports 0, and AFF0 numbering + * is linear across both clusters. For the purpose of MPIDR_EL1-based indexing, + * treat H10 and earlier devices as though they contain a single cluster. + */ + cluster_offsets[0] = 0; +#endif + assert(topology_info.boot_cpu != NULL); + ml_read_chip_revision(&topology_info.chip_revision); /* * Set TPIDRRO_EL0 to indicate the correct cpu number, as we may @@ -1116,95 +995,162 @@ ml_parse_cpu_topology(void) * per-cpu data object. 
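(Illustrative aside, not part of the diff.) The loop above packs the physical clusters back to back so that hypothetical_array[cluster_offsets[AFF1] + AFF0] is a valid per-CPU slot. The cluster sizes in the standalone sketch below are made up; the kernel derives them from the device tree.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    /* Hypothetical topology: physical cluster 0 has cores 0..3,
     * physical cluster 1 has cores 0..1. */
    int64_t max_cpu_phys_id[2] = { 3, 1 };
    int64_t cluster_offsets[2];

    int64_t offset = 0;
    for (int i = 0; i < 2; i++) {
        cluster_offsets[i] = offset;
        offset += max_cpu_phys_id[i] + 1;   /* reserve one slot per possible AFF0 */
    }

    /* MPIDR with AFF1 = 1, AFF0 = 1 lands in slot cluster_offsets[1] + 1 == 5. */
    uint64_t mpidr = 0x0101;
    uint64_t aff0 = mpidr & 0xff;
    uint64_t aff1 = (mpidr >> 8) & 0xff;
    printf("per-CPU slot = %lld\n",
        (long long)(cluster_offsets[aff1] + (int64_t)aff0));
    return 0;
}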
*/ assert(__builtin_arm_rsr64("TPIDRRO_EL0") == 0); - __builtin_arm_wsr64("TPIDRRO_EL0", (uint64_t)boot_cpu); + __builtin_arm_wsr64("TPIDRRO_EL0", (uint64_t)topology_info.boot_cpu->cpu_id); +} + +const ml_topology_info_t * +ml_get_topology_info(void) +{ + return &topology_info; +} + +void +ml_map_cpu_pio(void) +{ + unsigned int i; + + for (i = 0; i < topology_info.num_cpus; i++) { + ml_topology_cpu_t *cpu = &topology_info.cpus[i]; + if (cpu->cpu_IMPL_pa) { + cpu->cpu_IMPL_regs = (vm_offset_t)ml_io_map(cpu->cpu_IMPL_pa, cpu->cpu_IMPL_len); + cpu->coresight_regs = (vm_offset_t)ml_io_map(cpu->coresight_pa, cpu->coresight_len); + } + if (cpu->cpu_UTTDBG_pa) { + cpu->cpu_UTTDBG_regs = (vm_offset_t)ml_io_map(cpu->cpu_UTTDBG_pa, cpu->cpu_UTTDBG_len); + } + } + + for (i = 0; i < topology_info.num_clusters; i++) { + ml_topology_cluster_t *cluster = &topology_info.clusters[i]; + if (cluster->acc_IMPL_pa) { + cluster->acc_IMPL_regs = (vm_offset_t)ml_io_map(cluster->acc_IMPL_pa, cluster->acc_IMPL_len); + } + if (cluster->cpm_IMPL_pa) { + cluster->cpm_IMPL_regs = (vm_offset_t)ml_io_map(cluster->cpm_IMPL_pa, cluster->cpm_IMPL_len); + } + } } unsigned int ml_get_cpu_count(void) { - return avail_cpus; + return topology_info.num_cpus; +} + +unsigned int +ml_get_cluster_count(void) +{ + return topology_info.num_clusters; } int ml_get_boot_cpu_number(void) { - return boot_cpu; + return topology_info.boot_cpu->cpu_id; } cluster_type_t ml_get_boot_cluster(void) { - return boot_cluster; + return topology_info.boot_cluster->cluster_type; } int ml_get_cpu_number(uint32_t phys_id) { - for (int log_id = 0; log_id <= ml_get_max_cpu_number(); ++log_id) { - if (cpu_phys_ids[log_id] == phys_id) { - return log_id; + phys_id &= MPIDR_AFF1_MASK | MPIDR_AFF0_MASK; + + for (unsigned i = 0; i < topology_info.num_cpus; i++) { + if (topology_info.cpus[i].phys_id == phys_id) { + return i; } } + return -1; } +int +ml_get_cluster_number(uint32_t phys_id) +{ + int cpu_id = ml_get_cpu_number(phys_id); + if (cpu_id < 0) { + return -1; + } + + ml_topology_cpu_t *cpu = &topology_info.cpus[cpu_id]; + + return cpu->cluster_id; +} + +unsigned int +ml_get_cpu_number_local(void) +{ + uint64_t mpidr_el1_value = 0; + unsigned cpu_id; + + /* We identify the CPU based on the constant bits of MPIDR_EL1. */ + MRS(mpidr_el1_value, "MPIDR_EL1"); + cpu_id = ml_get_cpu_number((uint32_t)mpidr_el1_value); + + assert(cpu_id <= (unsigned int)ml_get_max_cpu_number()); + + return cpu_id; +} + +int +ml_get_cluster_number_local() +{ + uint64_t mpidr_el1_value = 0; + unsigned cluster_id; + + /* We identify the cluster based on the constant bits of MPIDR_EL1. 
*/ + MRS(mpidr_el1_value, "MPIDR_EL1"); + cluster_id = ml_get_cluster_number((uint32_t)mpidr_el1_value); + + assert(cluster_id <= (unsigned int)ml_get_max_cluster_number()); + + return cluster_id; +} + int ml_get_max_cpu_number(void) { - return max_cpu_number; + return topology_info.max_cpu_id; } +int +ml_get_max_cluster_number(void) +{ + return topology_info.max_cluster_id; +} + +unsigned int +ml_get_first_cpu_id(unsigned int cluster_id) +{ + return topology_info.clusters[cluster_id].first_cpu_id; +} void ml_lockdown_init() { - lockdown_handler_grp = lck_grp_alloc_init("lockdown_handler", NULL); - assert(lockdown_handler_grp != NULL); - - lck_mtx_init(&lockdown_handler_lck, lockdown_handler_grp, NULL); - -#if defined(KERNEL_INTEGRITY_CTRR) - init_ctrr_cpu_start_lock(); +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) + rorgn_stash_range(); #endif } kern_return_t ml_lockdown_handler_register(lockdown_handler_t f, void *this) { - if (lockdown_handler || !f) { + if (!f) { return KERN_FAILURE; } - lck_mtx_lock(&lockdown_handler_lck); - lockdown_handler = f; - lockdown_this = this; - -#if !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)) - lockdown_done = 1; - lockdown_handler(this); -#else - if (lockdown_done) { - lockdown_handler(this); - } -#endif - lck_mtx_unlock(&lockdown_handler_lck); + assert(lockdown_done); + f(this); // XXX: f this whole function return KERN_SUCCESS; } -void -ml_lockdown_run_handler() -{ - lck_mtx_lock(&lockdown_handler_lck); - assert(!lockdown_done); - - lockdown_done = 1; - if (lockdown_handler) { - lockdown_handler(lockdown_this); - } - lck_mtx_unlock(&lockdown_handler_lck); -} - kern_return_t ml_processor_register(ml_processor_info_t *in_processor_info, processor_t *processor_out, ipi_handler_t *ipi_handler_out, @@ -1219,7 +1165,7 @@ ml_processor_register(ml_processor_info_t *in_processor_info, return KERN_FAILURE; } - if ((unsigned int)OSIncrementAtomic((SInt32*)®_cpu_count) >= avail_cpus) { + if ((unsigned)OSIncrementAtomic((SInt32*)®_cpu_count) >= topology_info.num_cpus) { return KERN_FAILURE; } @@ -1232,7 +1178,7 @@ ml_processor_register(ml_processor_info_t *in_processor_info, is_boot_cpu = TRUE; } - assert(in_processor_info->log_id < MAX_CPUS); + assert(in_processor_info->log_id <= (uint32_t)ml_get_max_cpu_number()); this_cpu_datap->cpu_id = in_processor_info->cpu_id; @@ -1242,22 +1188,22 @@ ml_processor_register(ml_processor_info_t *in_processor_info, } if (!is_boot_cpu) { - this_cpu_datap->cpu_number = in_processor_info->log_id; + this_cpu_datap->cpu_number = (unsigned short)(in_processor_info->log_id); if (cpu_data_register(this_cpu_datap) != KERN_SUCCESS) { goto processor_register_error; } } - this_cpu_datap->cpu_idle_notify = (void *) in_processor_info->processor_idle; - this_cpu_datap->cpu_cache_dispatch = in_processor_info->platform_cache_dispatch; + this_cpu_datap->cpu_idle_notify = in_processor_info->processor_idle; + this_cpu_datap->cpu_cache_dispatch = (cache_dispatch_t)in_processor_info->platform_cache_dispatch; nanoseconds_to_absolutetime((uint64_t) in_processor_info->powergate_latency, &this_cpu_datap->cpu_idle_latency); this_cpu_datap->cpu_reset_assist = kvtophys(in_processor_info->powergate_stub_addr); - this_cpu_datap->idle_timer_notify = (void *) in_processor_info->idle_timer; + this_cpu_datap->idle_timer_notify = in_processor_info->idle_timer; this_cpu_datap->idle_timer_refcon = in_processor_info->idle_timer_refcon; - this_cpu_datap->platform_error_handler = (void *) 
in_processor_info->platform_error_handler; + this_cpu_datap->platform_error_handler = in_processor_info->platform_error_handler; this_cpu_datap->cpu_regmap_paddr = in_processor_info->regmap_paddr; this_cpu_datap->cpu_phys_id = in_processor_info->phys_id; this_cpu_datap->cpu_l2_access_penalty = in_processor_info->l2_access_penalty; @@ -1275,13 +1221,50 @@ ml_processor_register(ml_processor_info_t *in_processor_info, this_cpu_datap->cluster_master = is_boot_cpu; #endif /* HAS_CLUSTER */ +#if !defined(RC_HIDE_XNU_FIRESTORM) && (MAX_CPU_CLUSTERS > 2) + { + /* Workaround for the existing scheduler + * code, which only supports a limited number of psets. + * + * To get around that limitation, we distribute all cores into + * two psets according to their cluster type, instead of + * having a dedicated pset per cluster ID. + */ + + pset_cluster_type_t pset_cluster_type; + + /* For this workaround, we don't expect seeing anything else + * than E or P clusters. */ + switch (in_processor_info->cluster_type) { + case CLUSTER_TYPE_E: + pset_cluster_type = PSET_AMP_E; + break; + case CLUSTER_TYPE_P: + pset_cluster_type = PSET_AMP_P; + break; + default: + panic("unknown/unsupported cluster type %d", in_processor_info->cluster_type); + } + + pset = pset_find_first_by_cluster_type(pset_cluster_type); + + if (pset == NULL) { + panic("no pset for cluster type %d/%d", in_processor_info->cluster_type, pset_cluster_type); + } + + kprintf("%s>chosen pset with cluster id %d cluster type %d for core:\n", + __FUNCTION__, pset->pset_cluster_id, pset->pset_cluster_type); + } +#else /* !defined(RC_HIDE_XNU_FIRESTORM) && (MAX_CPU_CLUSTERS > 2) */ pset = pset_find(in_processor_info->cluster_id, processor_pset(master_processor)); +#endif /* !defined(RC_HIDE_XNU_FIRESTORM) && (MAX_CPU_CLUSTERS > 2) */ + assert(pset != NULL); kprintf("%s>cpu_id %p cluster_id %d cpu_number %d is type %d\n", __FUNCTION__, in_processor_info->cpu_id, in_processor_info->cluster_id, this_cpu_datap->cpu_number, in_processor_info->cluster_type); + processor_t processor = PERCPU_GET_RELATIVE(processor, cpu_data, this_cpu_datap); if (!is_boot_cpu) { - processor_init((struct processor *)this_cpu_datap->cpu_processor, - this_cpu_datap->cpu_number, pset); + processor_init(processor, this_cpu_datap->cpu_number, pset); if (this_cpu_datap->cpu_l2_access_penalty) { /* @@ -1290,12 +1273,11 @@ ml_processor_register(ml_processor_info_t *in_processor_info, * scheduler, so that threads use the cores with better L2 * preferentially. */ - processor_set_primary(this_cpu_datap->cpu_processor, - master_processor); + processor_set_primary(processor, master_processor); } } - *processor_out = this_cpu_datap->cpu_processor; + *processor_out = processor; *ipi_handler_out = cpu_signal_handler; #if CPMU_AIC_PMI && MONOTONIC *pmi_handler_out = mt_cpmu_aic_pmi; @@ -1410,6 +1392,13 @@ ml_io_map_wcomb( return io_map(phys_addr, size, VM_WIMG_WCOMB); } +void +ml_io_unmap(vm_offset_t addr, vm_size_t sz) +{ + pmap_remove(kernel_pmap, addr, addr + sz); + kmem_free(kernel_map, addr, sz); +} + /* boot memory allocation */ vm_offset_t ml_static_malloc( @@ -1437,14 +1426,30 @@ vm_offset_t ml_static_slide( vm_offset_t vaddr) { - return phystokv(vaddr + vm_kernel_slide - gVirtBase + gPhysBase); + vm_offset_t slid_vaddr = vaddr + vm_kernel_slide; + + if ((slid_vaddr < vm_kernelcache_base) || (slid_vaddr >= vm_kernelcache_top)) { + /* This is only intended for use on kernelcache addresses. 
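(Illustrative aside, not part of the diff.) ml_static_slide()/ml_static_unslide() now reject anything outside the kernelcache window and otherwise just add or subtract vm_kernel_slide. All constants in the standalone sketch below are hypothetical.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint64_t vm_kernel_slide     = 0x0000000010000000ULL;  /* hypothetical slide */
    uint64_t vm_kernelcache_base = 0xfffffff010000000ULL;  /* hypothetical bounds */
    uint64_t vm_kernelcache_top  = 0xfffffff030000000ULL;

    uint64_t unslid = 0xfffffff000004000ULL;     /* pre-slide address */
    uint64_t slid   = unslid + vm_kernel_slide;  /* what ml_static_slide computes */

    if (slid < vm_kernelcache_base || slid >= vm_kernelcache_top) {
        printf("not a kernelcache address -> 0\n");
    } else {
        /* Round trip: unsliding returns the original address. */
        printf("slid = 0x%llx, unslid again = 0x%llx\n",
            (unsigned long long)slid,
            (unsigned long long)(slid - vm_kernel_slide));
    }
    return 0;
}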
*/ + return 0; + } + + /* + * Because the address is in the kernelcache, we can do a simple + * slide calculation. + */ + return slid_vaddr; } vm_offset_t ml_static_unslide( vm_offset_t vaddr) { - return ml_static_vtop(vaddr) - gPhysBase + gVirtBase - vm_kernel_slide; + if ((vaddr < vm_kernelcache_base) || (vaddr >= vm_kernelcache_top)) { + /* This is only intended for use on kernelcache addresses. */ + return 0; + } + + return vaddr - vm_kernel_slide; } extern tt_entry_t *arm_kva_to_tte(vm_offset_t va); @@ -1471,6 +1476,9 @@ ml_static_protect( if ((new_prot & VM_PROT_WRITE) && (new_prot & VM_PROT_EXECUTE)) { panic("ml_static_protect(): WX request on %p", (void *) vaddr); } + if (lockdown_done && (new_prot & VM_PROT_EXECUTE)) { + panic("ml_static_protect(): attempt to inject executable mapping on %p", (void *) vaddr); + } /* Set up the protection bits, and block bits so we can validate block mappings. */ if (new_prot & VM_PROT_WRITE) { @@ -1499,8 +1507,8 @@ ml_static_protect( pt_entry_t ptmp; #if XNU_MONITOR - assert(!TEST_PAGE_RATIO_4); assert(!pmap_is_monitor(ppn)); + assert(!TEST_PAGE_RATIO_4); #endif tte2 = arm_kva_to_tte(vaddr_cur); @@ -1552,7 +1560,6 @@ ml_static_protect( } } else { ptmp = *pte_p; - /* We only need to update the page tables if the protections do not match. */ if ((ptmp & (ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) != arm_prot) { ptmp = (ptmp & ~(ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) | arm_prot; @@ -1579,11 +1586,25 @@ ml_static_protect( void ml_static_mfree( vm_offset_t vaddr, - vm_size_t size) + vm_size_t size) { - vm_offset_t vaddr_cur; - ppnum_t ppn; - uint32_t freed_pages = 0; + vm_offset_t vaddr_cur; + ppnum_t ppn; + uint32_t freed_pages = 0; + uint32_t bad_page_cnt = 0; + uint32_t freed_kernelcache_pages = 0; + +#if defined(__arm64__) && (DEVELOPMENT || DEBUG) + /* For testing hitting a bad ram page */ + static int count = 0; + static int bad_at_cnt = -1; + static bool first = true; + + if (first) { + (void)PE_parse_boot_argn("bad_static_mfree", &bad_at_cnt, sizeof(bad_at_cnt)); + first = false; + } +#endif /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */ /* It is acceptable (if bad) to fail to free. */ if (vaddr < VM_MIN_KERNEL_ADDRESS) { @@ -1607,24 +1628,33 @@ ml_static_mfree( panic("Failed ml_static_mfree on %p", (void *) vaddr_cur); } -#if 0 - /* - * Must NOT tear down the "V==P" mapping for vaddr_cur as the zone alias scheme - * relies on the persistence of these mappings for all time. 
- */ - // pmap_remove(kernel_pmap, (addr64_t) vaddr_cur, (addr64_t) (vaddr_cur + PAGE_SIZE)); -#endif +#if defined(__arm64__) + bool is_bad = pmap_is_bad_ram(ppn); +#if DEVELOPMENT || DEBUG + is_bad |= (count++ == bad_at_cnt); +#endif /* DEVELOPMENT || DEBUG */ + + if (is_bad) { + ++bad_page_cnt; + vm_page_create_retired(ppn); + continue; + } +#endif /* defined(__arm64__) */ vm_page_create(ppn, (ppn + 1)); freed_pages++; + if (vaddr_cur >= segLOWEST && vaddr_cur < end_kern) { + freed_kernelcache_pages++; + } } } vm_page_lockspin_queues(); vm_page_wire_count -= freed_pages; vm_page_wire_count_initial -= freed_pages; + vm_page_kernelcache_count -= freed_kernelcache_pages; vm_page_unlock_queues(); #if DEBUG - kprintf("ml_static_mfree: Released 0x%x pages at VA %p, size:0x%llx, last ppn: 0x%x\n", freed_pages, (void *)vaddr, (uint64_t)size, ppn); + kprintf("ml_static_mfree: Released 0x%x pages at VA %p, size:0x%llx, last ppn: 0x%x, +%d bad\n", freed_pages, (void *)vaddr, (uint64_t)size, ppn, bad_page_cnt); #endif } @@ -1822,9 +1852,9 @@ ml_set_decrementer(uint32_t dec_value) cdp->cpu_decrementer = dec_value; if (cdp->cpu_set_decrementer_func) { - ((void (*)(uint32_t))cdp->cpu_set_decrementer_func)(dec_value); + cdp->cpu_set_decrementer_func(dec_value); } else { - __asm__ volatile ("msr CNTP_TVAL_EL0, %0" : : "r"((uint64_t)dec_value)); + __builtin_arm_wsr64("CNTV_TVAL_EL0", (uint64_t)dec_value); } } @@ -1834,10 +1864,10 @@ ml_get_hwclock() uint64_t timebase; // ISB required by ARMV7C.b section B8.1.2 & ARMv8 section D6.1.2 - // "Reads of CNTPCT[_EL0] can occur speculatively and out of order relative + // "Reads of CNT[PV]CT[_EL0] can occur speculatively and out of order relative // to other instructions executed on the same processor." __builtin_arm_isb(ISB_SY); - timebase = __builtin_arm_rsr64("CNTPCT_EL0"); + timebase = __builtin_arm_rsr64("CNTVCT_EL0"); return timebase; } @@ -1848,6 +1878,25 @@ ml_get_timebase() return ml_get_hwclock() + getCpuDatap()->cpu_base_timebase; } +/* + * Get the speculative timebase without an ISB. + */ +uint64_t +ml_get_speculative_timebase() +{ + uint64_t timebase; + + timebase = __builtin_arm_rsr64("CNTVCT_EL0"); + + return timebase + getCpuDatap()->cpu_base_timebase; +} + +uint64_t +ml_get_timebase_entropy(void) +{ + return ml_get_speculative_timebase(); +} + uint32_t ml_get_decrementer() { @@ -1857,11 +1906,11 @@ ml_get_decrementer() assert(ml_get_interrupts_enabled() == FALSE); if (cdp->cpu_get_decrementer_func) { - dec = ((uint32_t (*)(void))cdp->cpu_get_decrementer_func)(); + dec = cdp->cpu_get_decrementer_func(); } else { uint64_t wide_val; - __asm__ volatile ("mrs %0, CNTP_TVAL_EL0" : "=r"(wide_val)); + wide_val = __builtin_arm_rsr64("CNTV_TVAL_EL0"); dec = (uint32_t)wide_val; assert(wide_val == (uint64_t)dec); } @@ -1872,24 +1921,8 @@ ml_get_decrementer() boolean_t ml_get_timer_pending() { - uint64_t cntp_ctl; - - __asm__ volatile ("mrs %0, CNTP_CTL_EL0" : "=r"(cntp_ctl)); - return ((cntp_ctl & CNTP_CTL_EL0_ISTATUS) != 0) ? TRUE : FALSE; -} - -boolean_t -ml_wants_panic_trap_to_debugger(void) -{ - boolean_t result = FALSE; -#if XNU_MONITOR - /* - * This looks racey, but if we are in the PPL, preemption will be - * disabled. - */ - result = ((pmap_get_cpu_data()->ppl_state == PPL_STATE_DISPATCH) && pmap_ppl_locked_down); -#endif - return result; + uint64_t cntv_ctl = __builtin_arm_rsr64("CNTV_CTL_EL0"); + return ((cntv_ctl & CNTV_CTL_EL0_ISTATUS) != 0) ? 
TRUE : FALSE; } static void @@ -1907,7 +1940,7 @@ cache_trap_error(thread_t thread, vm_map_address_t fault_addr) } static void -cache_trap_recover() +cache_trap_recover(void) { vm_map_address_t fault_addr; @@ -1920,7 +1953,8 @@ static void set_cache_trap_recover(thread_t thread) { #if defined(HAS_APPLE_PAC) - thread->recover = (vm_address_t)ptrauth_auth_and_resign(&cache_trap_recover, + void *fun = &cache_trap_recover; + thread->recover = (vm_address_t)ptrauth_auth_and_resign(fun, ptrauth_key_function_pointer, 0, ptrauth_key_function_pointer, ptrauth_blend_discriminator(&thread->recover, PAC_DISCRIMINATOR_RECOVER)); #else /* defined(HAS_APPLE_PAC) */ @@ -2053,13 +2087,13 @@ _enable_timebase_event_stream(uint32_t bit_index) /* * If the SOC supports it (and it isn't broken), enable - * EL0 access to the physical timebase register. + * EL0 access to the timebase registers. */ if (user_timebase_type() != USER_TIMEBASE_NONE) { - cntkctl |= CNTKCTL_EL1_PL0PCTEN; + cntkctl |= (CNTKCTL_EL1_PL0PCTEN | CNTKCTL_EL1_PL0VCTEN); } - __asm__ volatile ("msr CNTKCTL_EL1, %0" : : "r"(cntkctl)); + __builtin_arm_wsr64("CNTKCTL_EL1", cntkctl); } /* @@ -2068,31 +2102,48 @@ _enable_timebase_event_stream(uint32_t bit_index) static void _enable_virtual_timer(void) { - uint64_t cntvctl = CNTP_CTL_EL0_ENABLE; /* One wants to use 32 bits, but "mrs" prefers it this way */ + uint64_t cntvctl = CNTV_CTL_EL0_ENABLE; /* One wants to use 32 bits, but "mrs" prefers it this way */ - __asm__ volatile ("msr CNTP_CTL_EL0, %0" : : "r"(cntvctl)); + __builtin_arm_wsr64("CNTV_CTL_EL0", cntvctl); + /* disable the physical timer as a precaution, as its registers reset to architecturally unknown values */ + __builtin_arm_wsr64("CNTP_CTL_EL0", CNTP_CTL_EL0_IMASKED); } -uint64_t events_per_sec = 0; - void fiq_context_init(boolean_t enable_fiq __unused) { - _enable_timebase_event_stream(fiq_eventi); - /* Interrupts still disabled. */ assert(ml_get_interrupts_enabled() == FALSE); _enable_virtual_timer(); } void -fiq_context_bootstrap(boolean_t enable_fiq) +wfe_timeout_init(void) +{ + _enable_timebase_event_stream(arm64_eventi); +} + +void +wfe_timeout_configure(void) { -#if defined(APPLE_ARM64_ARCH_FAMILY) || defined(BCM2837) /* Could fill in our own ops here, if we needed them */ - uint64_t ticks_per_sec, ticks_per_event; + uint64_t ticks_per_sec, ticks_per_event, events_per_sec = 0; uint32_t bit_index; + if (PE_parse_boot_argn("wfe_events_sec", &events_per_sec, sizeof(events_per_sec))) { + if (events_per_sec <= 0) { + events_per_sec = 1; + } else if (events_per_sec > USEC_PER_SEC) { + events_per_sec = USEC_PER_SEC; + } + } else { +#if defined(ARM_BOARD_WFE_TIMEOUT_NS) + events_per_sec = NSEC_PER_SEC / ARM_BOARD_WFE_TIMEOUT_NS; +#else /* !defined(ARM_BOARD_WFE_TIMEOUT_NS) */ + /* Default to 1usec (or as close as we can get) */ + events_per_sec = USEC_PER_SEC; +#endif /* !defined(ARM_BOARD_WFE_TIMEOUT_NS) */ + } ticks_per_sec = gPEClockFrequencyInfo.timebase_frequency_hz; ticks_per_event = ticks_per_sec / events_per_sec; bit_index = flsll(ticks_per_event) - 1; /* Highest bit set */ @@ -2112,11 +2163,8 @@ fiq_context_bootstrap(boolean_t enable_fiq) bit_index--; } - fiq_eventi = bit_index; -#else -#error Need a board configuration. 
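(Illustrative aside, not part of the diff.) wfe_timeout_configure() picks the timebase bit whose toggling drives the WFE event stream: ticks_per_event = ticks_per_sec / events_per_sec, then the highest set bit via flsll() - 1. The standalone sketch below assumes a 24 MHz timebase and the default of roughly one event per microsecond.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint64_t ticks_per_sec   = 24000000ULL;  /* assumed timebase frequency */
    uint64_t events_per_sec  = 1000000ULL;   /* default: ~1 event per usec */
    uint64_t ticks_per_event = ticks_per_sec / events_per_sec;  /* 24 ticks */

    /* flsll(ticks_per_event) - 1 == index of the highest set bit (bit 4 for 24). */
    uint32_t bit_index = 63 - (uint32_t)__builtin_clzll(ticks_per_event);

    /* The kernel then adjusts this index (partly elided in the diff context)
     * before programming it as the event-stream divider. */
    printf("ticks_per_event = %llu, highest set bit = %u\n",
        (unsigned long long)ticks_per_event, bit_index);
    return 0;
}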
-#endif - fiq_context_init(enable_fiq); + arm64_eventi = bit_index; + wfe_timeout_init(); } boolean_t @@ -2172,13 +2220,11 @@ ml_energy_stat(thread_t t) void ml_gpu_stat_update(__unused uint64_t gpu_ns_delta) { -#if CONFIG_EMBEDDED /* * For now: update the resource coalition stats of the * current thread's coalition */ task_coalition_update_gpu_stats(current_task(), gpu_ns_delta); -#endif } uint64_t @@ -2187,7 +2233,8 @@ ml_gpu_stat(__unused thread_t t) return 0; } -#if !CONFIG_SKIP_PRECISE_USER_KERNEL_TIME +#if !CONFIG_SKIP_PRECISE_USER_KERNEL_TIME || HAS_FAST_CNTVCT + static void timer_state_event(boolean_t switch_to_kernel) { @@ -2196,8 +2243,8 @@ timer_state_event(boolean_t switch_to_kernel) return; } - processor_data_t *pd = &getCpuDatap()->cpu_processor->processor_data; - uint64_t now = ml_get_timebase(); + processor_t pd = current_processor(); + uint64_t now = ml_get_speculative_timebase(); timer_stop(pd->current_state, now); pd->current_state = (switch_to_kernel) ? &pd->system_state : &pd->user_state; @@ -2219,7 +2266,7 @@ timer_state_event_kernel_to_user(void) { timer_state_event(FALSE); } -#endif /* !CONFIG_SKIP_PRECISE_USER_KERNEL_TIME */ +#endif /* !CONFIG_SKIP_PRECISE_USER_KERNEL_TIME || HAS_FAST_CNTVCT */ /* * The following are required for parts of the kernel @@ -2296,14 +2343,14 @@ ex_cb_invoke( #if defined(HAS_APPLE_PAC) void -ml_task_set_disable_user_jop(task_t task, boolean_t disable_user_jop) +ml_task_set_disable_user_jop(task_t task, uint8_t disable_user_jop) { assert(task); task->disable_user_jop = disable_user_jop; } void -ml_thread_set_disable_user_jop(thread_t thread, boolean_t disable_user_jop) +ml_thread_set_disable_user_jop(thread_t thread, uint8_t disable_user_jop) { assert(thread); thread->machine.disable_user_jop = disable_user_jop; @@ -2318,35 +2365,180 @@ ml_task_set_rop_pid(task_t task, task_t parent_task, boolean_t inherit) task->rop_pid = early_random(); } } -#endif /* defined(HAS_APPLE_PAC) */ +/** + * jop_pid may be inherited from the parent task or generated inside the shared + * region. Unfortunately these two parameters are available at very different + * times during task creation, so we need to split this into two steps. + */ +void +ml_task_set_jop_pid(task_t task, task_t parent_task, boolean_t inherit) +{ + if (inherit) { + task->jop_pid = parent_task->jop_pid; + } else { + task->jop_pid = ml_default_jop_pid(); + } +} + +void +ml_task_set_jop_pid_from_shared_region(task_t task) +{ + vm_shared_region_t sr = vm_shared_region_get(task); + /* + * If there's no shared region, we can assign the key arbitrarily. This + * typically happens when Mach-O image activation failed part of the way + * through, and this task is in the middle of dying with SIGKILL anyway. + */ + if (__improbable(!sr)) { + task->jop_pid = early_random(); + return; + } + vm_shared_region_deallocate(sr); + + /* + * Similarly we have to worry about jetsam having killed the task and + * already cleared the shared_region_id. 
+ */ + task_lock(task); + if (task->shared_region_id != NULL) { + task->jop_pid = shared_region_find_key(task->shared_region_id); + } else { + task->jop_pid = early_random(); + } + task_unlock(task); +} + +void +ml_thread_set_jop_pid(thread_t thread, task_t task) +{ + thread->machine.jop_pid = task->jop_pid; +} +#endif /* defined(HAS_APPLE_PAC) */ #if defined(HAS_APPLE_PAC) +#define _ml_auth_ptr_unchecked(_ptr, _suffix, _modifier) \ + asm volatile ("aut" #_suffix " %[ptr], %[modifier]" : [ptr] "+r"(_ptr) : [modifier] "r"(_modifier)); /* * ml_auth_ptr_unchecked: call this instead of ptrauth_auth_data * instrinsic when you don't want to trap on auth fail. * */ - void * ml_auth_ptr_unchecked(void *ptr, ptrauth_key key, uint64_t modifier) { switch (key & 0x3) { case ptrauth_key_asia: - asm volatile ("autia %[ptr], %[modifier]" : [ptr] "+r"(ptr) : [modifier] "r"(modifier)); + _ml_auth_ptr_unchecked(ptr, ia, modifier); break; case ptrauth_key_asib: - asm volatile ("autib %[ptr], %[modifier]" : [ptr] "+r"(ptr) : [modifier] "r"(modifier)); + _ml_auth_ptr_unchecked(ptr, ib, modifier); break; case ptrauth_key_asda: - asm volatile ("autda %[ptr], %[modifier]" : [ptr] "+r"(ptr) : [modifier] "r"(modifier)); + _ml_auth_ptr_unchecked(ptr, da, modifier); break; case ptrauth_key_asdb: - asm volatile ("autdb %[ptr], %[modifier]" : [ptr] "+r"(ptr) : [modifier] "r"(modifier)); + _ml_auth_ptr_unchecked(ptr, db, modifier); break; } return ptr; } #endif /* defined(HAS_APPLE_PAC) */ + +#ifdef CONFIG_XNUPOST +void +ml_expect_fault_begin(expected_fault_handler_t expected_fault_handler, uintptr_t expected_fault_addr) +{ + thread_t thread = current_thread(); + thread->machine.expected_fault_handler = expected_fault_handler; + thread->machine.expected_fault_addr = expected_fault_addr; +} + +void +ml_expect_fault_end(void) +{ + thread_t thread = current_thread(); + thread->machine.expected_fault_handler = NULL; + thread->machine.expected_fault_addr = 0; +} +#endif /* CONFIG_XNUPOST */ + +void +ml_hibernate_active_pre(void) +{ +#if HIBERNATION + if (kIOHibernateStateWakingFromHibernate == gIOHibernateState) { + + hibernate_rebuild_vm_structs(); + } +#endif /* HIBERNATION */ +} + +void +ml_hibernate_active_post(void) +{ +#if HIBERNATION + if (kIOHibernateStateWakingFromHibernate == gIOHibernateState) { + hibernate_machine_init(); + hibernate_vm_lock_end(); + current_cpu_datap()->cpu_hibernate = 0; + } +#endif /* HIBERNATION */ +} + +/** + * Return back a machine-dependent array of address space regions that should be + * reserved by the VM (pre-mapped in the address space). This will prevent user + * processes from allocating or deallocating from within these regions. + * + * @param vm_is64bit True if the process has a 64-bit address space. + * @param regions An out parameter representing an array of regions to reserve. + * + * @return The number of reserved regions returned through `regions`. + */ +size_t +ml_get_vm_reserved_regions(bool vm_is64bit, struct vm_reserved_region **regions) +{ + assert(regions != NULL); + + /** + * Reserved regions only apply to 64-bit address spaces. This is because + * we only expect to grow the maximum user VA address on 64-bit address spaces + * (we've essentially already reached the max for 32-bit spaces). The reserved + * regions should safely fall outside of the max user VA for 32-bit processes. + */ + if (vm_is64bit) { + *regions = vm_reserved_regions; + return ARRAY_COUNT(vm_reserved_regions); + } else { + /* Don't reserve any VA regions on arm64_32 processes. 
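(Illustrative aside, not part of the diff.) A hypothetical caller of ml_get_vm_reserved_regions(), defined here, might walk the returned array as below; in xnu the VM layer consumes it when constructing a 64-bit user map, so this loop is only a sketch in kernel context and is not buildable standalone. The function name is made up.

static void
dump_reserved_regions_sketch(bool vm_is64bit)
{
    struct vm_reserved_region *regions = NULL;
    size_t count = ml_get_vm_reserved_regions(vm_is64bit, &regions);

    /* For arm64_32 processes, count is 0 and regions stays NULL. */
    for (size_t i = 0; i < count; i++) {
        printf("reserved region '%s': addr 0x%llx size 0x%llx\n",
            regions[i].vmrr_name,
            (unsigned long long)regions[i].vmrr_addr,
            (unsigned long long)regions[i].vmrr_size);
    }
}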
*/ + *regions = NULL; + return 0; + } +} +/* These WFE recommendations are expected to be updated on a relatively + * infrequent cadence, possibly from a different cluster, hence + * false cacheline sharing isn't expected to be material + */ +static uint64_t arm64_cluster_wfe_recs[MAX_CPU_CLUSTERS]; + +uint32_t +ml_update_cluster_wfe_recommendation(uint32_t wfe_cluster_id, uint64_t wfe_timeout_abstime_interval, __unused uint64_t wfe_hint_flags) +{ + assert(wfe_cluster_id < MAX_CPU_CLUSTERS); + assert(wfe_timeout_abstime_interval <= ml_wfe_hint_max_interval); + os_atomic_store(&arm64_cluster_wfe_recs[wfe_cluster_id], wfe_timeout_abstime_interval, relaxed); + return 0; /* Success */ +} + +uint64_t +ml_cluster_wfe_timeout(uint32_t wfe_cluster_id) +{ + /* This and its consumer does not synchronize vis-a-vis updates + * of the recommendation; races are acceptable. + */ + uint64_t wfet = os_atomic_load(&arm64_cluster_wfe_recs[wfe_cluster_id], relaxed); + return wfet; +}
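(Illustrative aside, not part of the diff.) A hypothetical client of the per-cluster WFE recommendation interface defined above; the function name and interval are made up, and the interval must stay at or below ml_wfe_hint_max_interval or the assert in ml_update_cluster_wfe_recommendation() fires. This is a kernel-context sketch, not standalone code.

static void
wfe_recommendation_sketch(void)
{
    uint32_t cluster_id = 0;            /* hypothetical cluster */
    uint64_t recommended_ticks = 1200;  /* hypothetical abstime interval */

    (void)ml_update_cluster_wfe_recommendation(cluster_id, recommended_ticks, 0);

    /* Readers sample the recommendation with relaxed ordering; a slightly
     * stale value is acceptable by design, per the comment above. */
    uint64_t wfet = ml_cluster_wfe_timeout(cluster_id);
    kprintf("cluster %u WFE timeout: %llu ticks\n",
        cluster_id, (unsigned long long)wfet);
}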