diff --git a/osfmk/arm64/machine_routines.c b/osfmk/arm64/machine_routines.c
index 037f34c135fad0fccb42360b36a737a20397e199..3b616122b516a2bd9788da73046ccc1c04393c99 100644
--- a/osfmk/arm64/machine_routines.c
+++ b/osfmk/arm64/machine_routines.c
 #include <arm/cpu_capabilities.h>
 #include <console/serial_protos.h>
 #include <kern/machine.h>
+#include <kern/misc_protos.h>
 #include <prng/random.h>
 #include <kern/startup.h>
 #include <kern/thread.h>
 #include <kern/timer_queue.h>
 #include <mach/machine.h>
 #include <machine/atomic.h>
+#include <machine/config.h>
 #include <vm/pmap.h>
 #include <vm/vm_page.h>
+#include <vm/vm_shared_region.h>
+#include <vm/vm_map.h>
+#include <sys/codesign.h>
 #include <sys/kdebug.h>
 #include <kern/coalition.h>
 #include <pexpert/device_tree.h>
 
 #include <IOKit/IOPlatformExpert.h>
+#if HIBERNATION
+#include <IOKit/IOHibernatePrivate.h>
+#endif /* HIBERNATION */
 
 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
-#include <libkern/kernel_mach_header.h>
+#include <arm64/amcc_rorgn.h>
 #endif
 
 #include <libkern/section_keywords.h>
 
+/**
+ * On supported hardware, debuggable builds make the HID bits read-only
+ * without locking them.  This lets people manually modify HID bits while
+ * debugging, since they can use a debugging tool to first reset the HID
+ * bits back to read/write.  However it will still catch xnu changes that
+ * accidentally write to HID bits after they've been made read-only.
+ */
+#if HAS_TWO_STAGE_SPR_LOCK && !(DEVELOPMENT || DEBUG)
+#define USE_TWO_STAGE_SPR_LOCK
+#endif
+
 #if KPC
 #include <kern/kpc.h>
 #endif
 
+#define MPIDR_CPU_ID(mpidr_el1_val)             (((mpidr_el1_val) & MPIDR_AFF0_MASK) >> MPIDR_AFF0_SHIFT)
+#define MPIDR_CLUSTER_ID(mpidr_el1_val)         (((mpidr_el1_val) & MPIDR_AFF1_MASK) >> MPIDR_AFF1_SHIFT)
+
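
As a standalone sketch of what these accessors extract (the field positions are the architectural AFF0/AFF1 bits [7:0] and [15:8]; the real mask/shift constants live in arm64 headers not shown in this diff, and the sample MPIDR value is invented):

    #include <stdint.h>
    #include <stdio.h>

    /* Architectural MPIDR_EL1 affinity fields: AFF0 = bits [7:0], AFF1 = bits [15:8]. */
    #define MPIDR_AFF0_SHIFT 0
    #define MPIDR_AFF0_MASK  (0xffULL << MPIDR_AFF0_SHIFT)
    #define MPIDR_AFF1_SHIFT 8
    #define MPIDR_AFF1_MASK  (0xffULL << MPIDR_AFF1_SHIFT)

    #define MPIDR_CPU_ID(m)      (((m) & MPIDR_AFF0_MASK) >> MPIDR_AFF0_SHIFT)
    #define MPIDR_CLUSTER_ID(m)  (((m) & MPIDR_AFF1_MASK) >> MPIDR_AFF1_SHIFT)

    int main(void)
    {
        uint64_t mpidr = 0x103ULL; /* hypothetical MPIDR_EL1 value */
        /* Prints "cluster 1, cpu 3". */
        printf("cluster %llu, cpu %llu\n",
            (unsigned long long)MPIDR_CLUSTER_ID(mpidr),
            (unsigned long long)MPIDR_CPU_ID(mpidr));
        return 0;
    }
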
 #if HAS_CLUSTER
 static uint8_t cluster_initialized = 0;
 #endif
 
-
-static int max_cpus_initialized = 0;
-#define MAX_CPUS_SET    0x1
-#define MAX_CPUS_WAIT   0x2
-
 uint32_t LockTimeOut;
 uint32_t LockTimeOutUsec;
 uint64_t TLockTimeOut;
 uint64_t MutexSpin;
-boolean_t is_clock_configured = FALSE;
+uint64_t low_MutexSpin;
+int64_t high_MutexSpin;
 
-uint32_t yield_delay_us = 0; /* Must be less than cpu_idle_latency to ensure ml_delay_should_spin is true */
+static uint64_t ml_wfe_hint_max_interval;
+#define MAX_WFE_HINT_INTERVAL_US (500ULL)
 
-#if CONFIG_NONFATAL_ASSERTS
-extern int mach_assert;
-#endif
-extern volatile uint32_t debug_enabled;
+/* Must be less than cpu_idle_latency to ensure ml_delay_should_spin is true */
+TUNABLE(uint32_t, yield_delay_us, "yield_delay_us", 0);
 
 extern vm_offset_t   segLOWEST;
 extern vm_offset_t   segLOWESTTEXT;
 extern vm_offset_t   segLASTB;
 extern unsigned long segSizeLAST;
 
+/* ARM64 specific bounds; used to test for presence in the kernelcache. */
+extern vm_offset_t   vm_kernelcache_base;
+extern vm_offset_t   vm_kernelcache_top;
+
 #if defined(HAS_IPI)
 unsigned int gFastIPI = 1;
 #define kDeferredIPITimerDefault (64 * NSEC_PER_USEC) /* in nanoseconds */
-static uint64_t deferred_ipi_timer_ns = kDeferredIPITimerDefault;
+static TUNABLE_WRITEABLE(uint64_t, deferred_ipi_timer_ns, "fastipitimeout",
+    kDeferredIPITimerDefault);
 #endif /* defined(HAS_IPI) */
 
-void machine_conf(void);
-
 thread_t Idle_context(void);
 
-SECURITY_READ_ONLY_LATE(static uint32_t) cpu_phys_ids[MAX_CPUS] = {[0 ... MAX_CPUS - 1] = (uint32_t)-1};
-SECURITY_READ_ONLY_LATE(static unsigned int) avail_cpus = 0;
-SECURITY_READ_ONLY_LATE(static int) boot_cpu = -1;
-SECURITY_READ_ONLY_LATE(static int) max_cpu_number = 0;
-SECURITY_READ_ONLY_LATE(cluster_type_t) boot_cluster = CLUSTER_TYPE_SMP;
+SECURITY_READ_ONLY_LATE(static ml_topology_cpu_t) topology_cpu_array[MAX_CPUS];
+SECURITY_READ_ONLY_LATE(static ml_topology_cluster_t) topology_cluster_array[MAX_CPU_CLUSTERS];
+SECURITY_READ_ONLY_LATE(static ml_topology_info_t) topology_info = {
+       .version = CPU_TOPOLOGY_VERSION,
+       .cpus = topology_cpu_array,
+       .clusters = topology_cluster_array,
+};
+/**
+ * Represents the offset of each cluster within a hypothetical array of MAX_CPUS
+ * entries of an arbitrary data type.  This is intended for use by specialized consumers
+ * that must quickly access per-CPU data using only the physical CPU ID (MPIDR_EL1),
+ * as follows:
+ *     hypothetical_array[cluster_offsets[AFF1] + AFF0]
+ * Most consumers should instead use general-purpose facilities such as PERCPU or
+ * ml_get_cpu_number().
+ */
+SECURITY_READ_ONLY_LATE(int64_t) cluster_offsets[MAX_CPU_CLUSTER_PHY_ID + 1];
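
A hedged sketch of the access pattern the comment describes; the array and helper below are hypothetical, not xnu code, and simply spell out hypothetical_array[cluster_offsets[AFF1] + AFF0]:

    /* Hypothetical flat per-CPU array sized for the worst case. */
    static uint64_t hypothetical_percpu_counter[MAX_CPUS];

    static inline uint64_t *
    counter_for_mpidr(uint64_t mpidr_el1_value)
    {
        uint64_t aff1 = MPIDR_CLUSTER_ID(mpidr_el1_value); /* physical cluster ID */
        uint64_t aff0 = MPIDR_CPU_ID(mpidr_el1_value);     /* CPU number within the cluster */

        /* cluster_offsets[AFF1] is the base index reserved for that cluster. */
        return &hypothetical_percpu_counter[cluster_offsets[aff1] + aff0];
    }
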
+
+SECURITY_READ_ONLY_LATE(static uint32_t) arm64_eventi = UINT32_MAX;
 
-SECURITY_READ_ONLY_LATE(static uint32_t) fiq_eventi = UINT32_MAX;
+extern uint32_t lockdown_done;
 
-lockdown_handler_t lockdown_handler;
-void *lockdown_this;
-lck_mtx_t lockdown_handler_lck;
-lck_grp_t *lockdown_handler_grp;
-int lockdown_done;
+/**
+ * Represents regions of virtual address space that should be reserved
+ * (pre-mapped) in each user address space.
+ */
+SECURITY_READ_ONLY_LATE(static struct vm_reserved_region) vm_reserved_regions[] = {
+       {
+               .vmrr_name = "GPU Carveout",
+               .vmrr_addr = MACH_VM_MIN_GPU_CARVEOUT_ADDRESS,
+               .vmrr_size = (vm_map_size_t)(MACH_VM_MAX_GPU_CARVEOUT_ADDRESS - MACH_VM_MIN_GPU_CARVEOUT_ADDRESS)
+       },
+       /*
+        * Reserve the virtual memory space representing the commpage nesting region
+        * to prevent user processes from allocating memory within it. The actual
+        * page table entries for the commpage are inserted by vm_commpage_enter().
+        * This vm_map_enter() just prevents userspace from allocating/deallocating
+        * anything within the entire commpage nested region.
+        */
+       {
+               .vmrr_name = "commpage nesting",
+               .vmrr_addr = _COMM_PAGE64_NESTING_START,
+               .vmrr_size = _COMM_PAGE64_NESTING_SIZE
+       }
+};
 
-void ml_lockdown_init(void);
-void ml_lockdown_run_handler(void);
 uint32_t get_arm_cpu_version(void);
 
 #if defined(HAS_IPI)
@@ -132,17 +182,17 @@ ml_cpu_signal_type(unsigned int cpu_mpidr, uint32_t type)
         * to a single CPU.  Otherwise we may migrate between choosing which
         * IPI mechanism to use and issuing the IPI. */
        MRS(local_mpidr, "MPIDR_EL1");
-       if ((local_mpidr & MPIDR_AFF1_MASK) == (cpu_mpidr & MPIDR_AFF1_MASK)) {
-               uint64_t x = type | (cpu_mpidr & MPIDR_AFF0_MASK);
-               MSR(ARM64_REG_IPI_RR_LOCAL, x);
+       if (MPIDR_CLUSTER_ID(local_mpidr) == MPIDR_CLUSTER_ID(cpu_mpidr)) {
+               uint64_t x = type | MPIDR_CPU_ID(cpu_mpidr);
+               MSR("S3_5_C15_C0_0", x);
        } else {
                #define IPI_RR_TARGET_CLUSTER_SHIFT 16
-               uint64_t x = type | ((cpu_mpidr & MPIDR_AFF1_MASK) << (IPI_RR_TARGET_CLUSTER_SHIFT - MPIDR_AFF1_SHIFT)) | (cpu_mpidr & MPIDR_AFF0_MASK);
-               MSR(ARM64_REG_IPI_RR_GLOBAL, x);
+               uint64_t x = type | (MPIDR_CLUSTER_ID(cpu_mpidr) << IPI_RR_TARGET_CLUSTER_SHIFT) | MPIDR_CPU_ID(cpu_mpidr);
+               MSR("S3_5_C15_C0_1", x);
        }
 #else
-       uint64_t x = type | (cpu_mpidr & MPIDR_AFF0_MASK);
-       MSR(ARM64_REG_IPI_RR, x);
+       uint64_t x = type | MPIDR_CPU_ID(cpu_mpidr);
+       MSR("S3_5_C15_C0_1", x);
 #endif
 }
 #endif
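
To make the fast-IPI register layout concrete, here is a standalone sketch using the same field positions as the code above; the MPIDR value and the zero "type" field are invented for illustration:

    #include <stdint.h>
    #include <stdio.h>

    #define IPI_RR_TARGET_CLUSTER_SHIFT 16

    int main(void)
    {
        uint64_t type      = 0;       /* IPI type bits, left as zero for the example */
        uint64_t cpu_mpidr = 0x102;   /* hypothetical target: cluster 1 (AFF1), CPU 2 (AFF0) */
        uint64_t aff0      = cpu_mpidr & 0xff;
        uint64_t aff1      = (cpu_mpidr >> 8) & 0xff;

        /* Same-cluster (local) form: only the in-cluster CPU number is encoded. */
        uint64_t local_val  = type | aff0;
        /* Cross-cluster (global) form: the cluster number moves up to bits [23:16]. */
        uint64_t global_val = type | (aff1 << IPI_RR_TARGET_CLUSTER_SHIFT) | aff0;

        printf("local  IPI_RR value: 0x%llx\n", (unsigned long long)local_val);  /* 0x2     */
        printf("global IPI_RR value: 0x%llx\n", (unsigned long long)global_val); /* 0x10002 */
        return 0;
    }
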
@@ -186,7 +236,7 @@ ml_cpu_signal_deferred_adjust_timer(uint64_t nanosecs)
        /* update deferred_ipi_timer_ns with the new clamped value */
        absolutetime_to_nanoseconds(abstime, &deferred_ipi_timer_ns);
 
-       MSR(ARM64_REG_IPI_CR, abstime);
+       MSR("S3_5_C15_C3_1", abstime);
 #else
        (void)nanosecs;
        panic("Platform does not support ACC Fast IPI");
@@ -232,23 +282,14 @@ ml_cpu_signal_retract(unsigned int cpu_mpidr __unused)
 void
 machine_idle(void)
 {
-       __builtin_arm_wsr("DAIFSet", (DAIFSC_IRQF | DAIFSC_FIQF));
+       /* Interrupts are expected to be masked on entry or re-entry via
+        * Idle_load_context()
+        */
+       assert((__builtin_arm_rsr("DAIF") & DAIF_IRQF) == DAIF_IRQF);
        Idle_context();
        __builtin_arm_wsr("DAIFClr", (DAIFSC_IRQF | DAIFSC_FIQF));
 }
 
-void
-init_vfp(void)
-{
-       return;
-}
-
-boolean_t
-get_vfp_enabled(void)
-{
-       return TRUE;
-}
-
 void
 OSSynchronizeIO(void)
 {
@@ -312,10 +353,21 @@ get_arm_cpu_version(void)
        return ((value & MIDR_EL1_REV_MASK) >> MIDR_EL1_REV_SHIFT) | ((value & MIDR_EL1_VAR_MASK) >> (MIDR_EL1_VAR_SHIFT - 4));
 }
 
+bool
+ml_feature_supported(uint32_t feature_bit)
+{
+       uint64_t aidr_el1_value = 0;
+
+       MRS(aidr_el1_value, "AIDR_EL1");
+
+
+       return aidr_el1_value & feature_bit;
+}
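
A minimal, hypothetical caller of ml_feature_supported(); the feature-bit constant is invented for illustration, since real callers pass implementation-defined AIDR_EL1 bits for specific Apple cores:

    #define HYPOTHETICAL_AIDR_FEATURE_BIT (1u << 4) /* not a real AIDR_EL1 bit assignment */

    static void
    example_enable_optional_feature(void)
    {
        if (ml_feature_supported(HYPOTHETICAL_AIDR_FEATURE_BIT)) {
            /* take the implementation-defined fast path */
        }
    }
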
+
 /*
  * user_cont_hwclock_allowed()
  *
- * Indicates whether we allow EL0 to read the physical timebase (CNTPCT_EL0)
+ * Indicates whether we allow EL0 to read the virtual timebase (CNTVCT_EL0)
  * as a continuous time source (e.g. from mach_continuous_time)
  */
 boolean_t
@@ -335,331 +387,15 @@ user_timebase_type(void)
        return USER_TIMEBASE_SPEC;
 }
 
-boolean_t
-arm64_wfe_allowed(void)
-{
-       return TRUE;
-}
-
-#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
-
-uint64_t rorgn_begin __attribute__((section("__DATA, __const"))) = 0;
-uint64_t rorgn_end   __attribute__((section("__DATA, __const"))) = 0;
-vm_offset_t amcc_base;
-
-static void assert_unlocked(void);
-static void assert_amcc_cache_disabled(void);
-static void lock_amcc(void);
-static void lock_mmu(uint64_t begin, uint64_t end);
-
-void
-rorgn_stash_range(void)
-{
-#if DEVELOPMENT || DEBUG
-       boolean_t rorgn_disable = FALSE;
-
-       PE_parse_boot_argn("-unsafe_kernel_text", &rorgn_disable, sizeof(rorgn_disable));
-
-       if (rorgn_disable) {
-               /* take early out if boot arg present, don't query any machine registers to avoid
-                * dependency on amcc DT entry
-                */
-               return;
-       }
-#endif
-
-       /* Get the AMC values, and stash them into rorgn_begin, rorgn_end.
-        * gPhysBase is the base of DRAM managed by xnu. we need DRAM_BASE as
-        * the AMCC RO region begin/end registers are in units of 16KB page
-        * numbers from DRAM_BASE so we'll truncate gPhysBase at 512MB granule
-        * and assert the value is the canonical DRAM_BASE PA of 0x8_0000_0000 for arm64.
-        */
-
-       uint64_t dram_base = gPhysBase & ~0x1FFFFFFFULL;  /* 512MB */
-       assert(dram_base == 0x800000000ULL);
-
-#if defined(KERNEL_INTEGRITY_KTRR)
-       uint64_t soc_base = 0;
-       DTEntry entryP = NULL;
-       uintptr_t *reg_prop = NULL;
-       uint32_t prop_size = 0;
-       int rc;
-
-       soc_base = pe_arm_get_soc_base_phys();
-       rc = DTFindEntry("name", "mcc", &entryP);
-       assert(rc == kSuccess);
-       rc = DTGetProperty(entryP, "reg", (void **)&reg_prop, &prop_size);
-       assert(rc == kSuccess);
-       amcc_base = ml_io_map(soc_base + *reg_prop, *(reg_prop + 1));
-#elif defined(KERNEL_INTEGRITY_CTRR)
-       /* TODO: t8020 mcc entry not in device tree yet; we'll do it LIVE */
-#define TEMP_AMCC_BASE_PA 0x200000000ULL
-#define TEMP_AMCC_SZ      0x100000
-       amcc_base = ml_io_map(TEMP_AMCC_BASE_PA, TEMP_AMCC_SZ);
-#else
-#error "KERNEL_INTEGRITY config error"
-#endif
-
-#if defined(KERNEL_INTEGRITY_KTRR)
-       assert(rRORGNENDADDR > rRORGNBASEADDR);
-       rorgn_begin = (rRORGNBASEADDR << AMCC_PGSHIFT) + dram_base;
-       rorgn_end   = (rRORGNENDADDR << AMCC_PGSHIFT) + dram_base;
-#elif defined(KERNEL_INTEGRITY_CTRR)
-       rorgn_begin = rCTRR_AMCC_PLANE_REG(0, CTRR_A_BASEADDR);
-       rorgn_end   = rCTRR_AMCC_PLANE_REG(0, CTRR_A_ENDADDR);
-       assert(rorgn_end > rorgn_begin);
-
-       for (int i = 0; i < CTRR_AMCC_MAX_PLANES; ++i) {
-               uint32_t begin = rCTRR_AMCC_PLANE_REG(i, CTRR_A_BASEADDR);
-               uint32_t end = rCTRR_AMCC_PLANE_REG(i, CTRR_A_ENDADDR);
-               if (!(begin == rorgn_begin && end == rorgn_end)) {
-#if DEVELOPMENT || DEBUG
-                       panic("iboot programmed CTRR bounds are inconsistent");
-#else
-                       panic("Inconsistent memory configuration");
-#endif
-               }
-       }
-
-       // convert from page number from DRAM base to PA
-       rorgn_begin = (rorgn_begin << AMCC_PGSHIFT) + dram_base;
-       rorgn_end   = (rorgn_end << AMCC_PGSHIFT) + dram_base;
-
-#else
-#error KERNEL_INTEGRITY config error
-#endif /* defined (KERNEL_INTEGRITY_KTRR) */
-}
-
-static void
-assert_unlocked()
-{
-       uint64_t ktrr_lock = 0;
-       uint32_t rorgn_lock = 0;
-
-       assert(amcc_base);
-#if defined(KERNEL_INTEGRITY_KTRR)
-       rorgn_lock = rRORGNLOCK;
-       ktrr_lock = __builtin_arm_rsr64(ARM64_REG_KTRR_LOCK_EL1);
-#elif defined(KERNEL_INTEGRITY_CTRR)
-       for (int i = 0; i < CTRR_AMCC_MAX_PLANES; ++i) {
-               rorgn_lock |= rCTRR_AMCC_PLANE_REG(i, CTRR_A_LOCK);
-       }
-       ktrr_lock = __builtin_arm_rsr64(ARM64_REG_CTRR_LOCK_EL1);
-#else
-#error KERNEL_INTEGRITY config error
-#endif /* defined(KERNEL_INTEGRITY_KTRR) */
-
-       assert(!ktrr_lock);
-       assert(!rorgn_lock);
-}
-
-static void
-lock_amcc()
-{
-#if defined(KERNEL_INTEGRITY_KTRR)
-       rRORGNLOCK = 1;
-       __builtin_arm_isb(ISB_SY);
-#elif defined(KERNEL_INTEGRITY_CTRR)
-       /* lockdown planes in reverse order as plane 0 should be locked last */
-       for (int i = 0; i < CTRR_AMCC_MAX_PLANES; ++i) {
-               rCTRR_AMCC_PLANE_REG(CTRR_AMCC_MAX_PLANES - i - 1, CTRR_A_ENABLE) = 1;
-               rCTRR_AMCC_PLANE_REG(CTRR_AMCC_MAX_PLANES - i - 1, CTRR_A_LOCK) = 1;
-               __builtin_arm_isb(ISB_SY);
-       }
-#else
-#error KERNEL_INTEGRITY config error
-#endif
-}
-
-static void
-lock_mmu(uint64_t begin, uint64_t end)
-{
-#if defined(KERNEL_INTEGRITY_KTRR)
-
-       __builtin_arm_wsr64(ARM64_REG_KTRR_LOWER_EL1, begin);
-       __builtin_arm_wsr64(ARM64_REG_KTRR_UPPER_EL1, end);
-       __builtin_arm_wsr64(ARM64_REG_KTRR_LOCK_EL1, 1ULL);
-
-       /* flush TLB */
-
-       __builtin_arm_isb(ISB_SY);
-       flush_mmu_tlb();
-
-#elif defined (KERNEL_INTEGRITY_CTRR)
-       /* this will lock the entire bootstrap cluster. non bootstrap clusters
-        * will be locked by respective cluster master in start.s */
-
-       __builtin_arm_wsr64(ARM64_REG_CTRR_A_LWR_EL1, begin);
-       __builtin_arm_wsr64(ARM64_REG_CTRR_A_UPR_EL1, end);
-
-#if !defined(APPLEVORTEX)
-       /* H12 changed sequence, must invalidate TLB immediately after setting CTRR bounds */
-       __builtin_arm_isb(ISB_SY); /* ensure all prior MSRs are complete */
-       flush_mmu_tlb();
-#endif /* !defined(APPLEVORTEX) */
-
-       __builtin_arm_wsr64(ARM64_REG_CTRR_CTL_EL1, CTRR_CTL_EL1_A_PXN | CTRR_CTL_EL1_A_MMUON_WRPROTECT);
-       __builtin_arm_wsr64(ARM64_REG_CTRR_LOCK_EL1, 1ULL);
-
-       uint64_t current_el = __builtin_arm_rsr64("CurrentEL");
-       if (current_el == PSR64_MODE_EL2) {
-               // CTRR v2 has explicit registers for cluster config. they can only be written in EL2
-
-               __builtin_arm_wsr64(ACC_CTRR_A_LWR_EL2, begin);
-               __builtin_arm_wsr64(ACC_CTRR_A_UPR_EL2, end);
-               __builtin_arm_wsr64(ACC_CTRR_CTL_EL2, CTRR_CTL_EL1_A_PXN | CTRR_CTL_EL1_A_MMUON_WRPROTECT);
-               __builtin_arm_wsr64(ACC_CTRR_LOCK_EL2, 1ULL);
-       }
-
-       __builtin_arm_isb(ISB_SY); /* ensure all prior MSRs are complete */
-#if defined(APPLEVORTEX)
-       flush_mmu_tlb();
-#endif /* defined(APPLEVORTEX) */
-
-#else /* defined(KERNEL_INTEGRITY_KTRR) */
-#error KERNEL_INTEGRITY config error
-#endif /* defined(KERNEL_INTEGRITY_KTRR) */
-}
-
-static void
-assert_amcc_cache_disabled()
-{
-#if defined(KERNEL_INTEGRITY_KTRR)
-       assert((rMCCGEN & 1) == 0); /* assert M$ disabled or LLC clean will be unreliable */
-#elif defined(KERNEL_INTEGRITY_CTRR) && (defined(ARM64_BOARD_CONFIG_T8006))
-       /*
-        * T8006 differentiates between data and tag ways being powered up, so
-        * make sure to check that both are zero on its single memory plane.
-        */
-       assert((rCTRR_AMCC_PLANE_REG(0, CTRR_AMCC_PWRONWAYCNTSTATUS) &
-           (AMCC_CURTAGWAYCNT_MASK | AMCC_CURDATWAYCNT_MASK)) == 0);
-#elif defined (KERNEL_INTEGRITY_CTRR)
-       for (int i = 0; i < CTRR_AMCC_MAX_PLANES; ++i) {
-               assert(rCTRR_AMCC_PLANE_REG(i, CTRR_AMCC_WAYONCNT) == 0);
-       }
-#else
-#error KERNEL_INTEGRITY config error
-#endif
-}
-
-/*
- * void rorgn_lockdown(void)
- *
- * Lock the MMU and AMCC RORegion within lower and upper boundaries if not already locked
- *
- * [ ] - ensure this is being called ASAP on secondary CPUs: KTRR programming and lockdown handled in
- *       start.s:start_cpu() for subsequent wake/resume of all cores
- */
-void
-rorgn_lockdown(void)
-{
-       vm_offset_t ktrr_begin, ktrr_end;
-       unsigned long last_segsz;
-
-#if DEVELOPMENT || DEBUG
-       boolean_t ktrr_disable = FALSE;
-
-       PE_parse_boot_argn("-unsafe_kernel_text", &ktrr_disable, sizeof(ktrr_disable));
-
-       if (ktrr_disable) {
-               /*
-                * take early out if boot arg present, since we may not have amcc DT entry present
-                * we can't assert that iboot hasn't programmed the RO region lockdown registers
-                */
-               goto out;
-       }
-#endif /* DEVELOPMENT || DEBUG */
-
-       assert_unlocked();
-
-       /* [x] - Use final method of determining all kernel text range or expect crashes */
-       ktrr_begin = segLOWEST;
-       assert(ktrr_begin && gVirtBase && gPhysBase);
-
-       ktrr_begin = kvtophys(ktrr_begin);
-
-       ktrr_end   = kvtophys(segLASTB);
-       last_segsz = segSizeLAST;
-#if defined(KERNEL_INTEGRITY_KTRR)
-       /* __LAST is not part of the MMU KTRR region (it is however part of the AMCC KTRR region) */
-       ktrr_end = (ktrr_end - 1) & ~AMCC_PGMASK;
-       /* ensure that iboot and xnu agree on the ktrr range */
-       assert(rorgn_begin == ktrr_begin && rorgn_end == (ktrr_end + last_segsz));
-       /* assert that __LAST segment containing privileged insns is only a single page */
-       assert(last_segsz == PAGE_SIZE);
-#elif defined(KERNEL_INTEGRITY_CTRR)
-       ktrr_end = (ktrr_end + last_segsz - 1) & ~AMCC_PGMASK;
-       /* __LAST is part of MMU CTRR region. Can't use the KTRR style method of making
-        * __pinst no execute because PXN applies with MMU off in CTRR. */
-       assert(rorgn_begin == ktrr_begin && rorgn_end == ktrr_end);
-#endif
-
-
-#if DEBUG || DEVELOPMENT
-       printf("KTRR Begin: %p End: %p, setting lockdown\n", (void *)ktrr_begin, (void *)ktrr_end);
-#endif
-
-       /* [x] - ensure all in flight writes are flushed to AMCC before enabling RO Region Lock */
-
-       assert_amcc_cache_disabled();
-
-       CleanPoC_DcacheRegion_Force(phystokv(ktrr_begin),
-           (unsigned)((ktrr_end + last_segsz) - ktrr_begin + AMCC_PGMASK));
-
-       lock_amcc();
-
-       lock_mmu(ktrr_begin, ktrr_end);
-
-#if DEVELOPMENT || DEBUG
-out:
-#endif
-
-#if defined(KERNEL_INTEGRITY_CTRR)
-       {
-               /* wake any threads blocked on cluster master lockdown */
-               cpu_data_t *cdp;
-               uint64_t mpidr_el1_value;
-
-               cdp = getCpuDatap();
-               MRS(mpidr_el1_value, "MPIDR_EL1");
-               cdp->cpu_cluster_id = (mpidr_el1_value & MPIDR_AFF1_MASK) >> MPIDR_AFF1_SHIFT;
-               assert(cdp->cpu_cluster_id < __ARM_CLUSTER_COUNT__);
-               ctrr_cluster_locked[cdp->cpu_cluster_id] = 1;
-               thread_wakeup(&ctrr_cluster_locked[cdp->cpu_cluster_id]);
-       }
-#endif
-       /* now we can run lockdown handler */
-       ml_lockdown_run_handler();
-}
-
-#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
-
 void
 machine_startup(__unused boot_args * args)
 {
-       int boot_arg;
-
 #if defined(HAS_IPI) && (DEVELOPMENT || DEBUG)
        if (!PE_parse_boot_argn("fastipi", &gFastIPI, sizeof(gFastIPI))) {
                gFastIPI = 1;
        }
-
-       PE_parse_boot_argn("fastipitimeout", &deferred_ipi_timer_ns, sizeof(deferred_ipi_timer_ns));
 #endif /* defined(HAS_IPI) && (DEVELOPMENT || DEBUG)*/
 
-#if CONFIG_NONFATAL_ASSERTS
-       PE_parse_boot_argn("assert", &mach_assert, sizeof(mach_assert));
-#endif
-
-       if (PE_parse_boot_argn("preempt", &boot_arg, sizeof(boot_arg))) {
-               default_preemption_rate = boot_arg;
-       }
-       if (PE_parse_boot_argn("bg_preempt", &boot_arg, sizeof(boot_arg))) {
-               default_bg_preemption_rate = boot_arg;
-       }
-
-       PE_parse_boot_argn("yield_delay_us", &yield_delay_us, sizeof(yield_delay_us));
-
        machine_conf();
 
        /*
@@ -669,21 +405,27 @@ machine_startup(__unused boot_args * args)
        /* NOTREACHED */
 }
 
+typedef void (*invalidate_fn_t)(void);
+
+static SECURITY_READ_ONLY_LATE(invalidate_fn_t) invalidate_hmac_function = NULL;
+
+void set_invalidate_hmac_function(invalidate_fn_t fn);
+
 void
-machine_lockdown_preflight(void)
+set_invalidate_hmac_function(invalidate_fn_t fn)
 {
-#if CONFIG_KERNEL_INTEGRITY
-
-#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
-       rorgn_stash_range();
-#endif
+       if (NULL != invalidate_hmac_function) {
+               panic("Invalidate HMAC function already set");
+       }
 
-#endif
+       invalidate_hmac_function = fn;
 }
 
 void
 machine_lockdown(void)
 {
+       arm_vm_prot_finalize(PE_state.bootArgs);
+
 #if CONFIG_KERNEL_INTEGRITY
 #if KERNEL_INTEGRITY_WT
        /* Watchtower
@@ -714,8 +456,16 @@ machine_lockdown(void)
 
 
 #endif /* CONFIG_KERNEL_INTEGRITY */
+
+
+       if (NULL != invalidate_hmac_function) {
+               invalidate_hmac_function();
+       }
+
+       lockdown_done = 1;
 }
 
+
 char           *
 machine_boot_info(
        __unused char *buf,
@@ -724,26 +474,6 @@ machine_boot_info(
        return PE_boot_args();
 }
 
-void
-machine_conf(void)
-{
-       /*
-        * This is known to be inaccurate. mem_size should always be capped at 2 GB
-        */
-       machine_info.memory_size = (uint32_t)mem_size;
-}
-
-void
-machine_init(void)
-{
-       debug_log_init();
-       clock_config();
-       is_clock_configured = TRUE;
-       if (debug_enabled) {
-               pmap_map_globals();
-       }
-}
-
 void
 slave_machine_init(__unused void *param)
 {
@@ -764,46 +494,6 @@ machine_processor_shutdown(
        return Shutdown_context(doshutdown, processor);
 }
 
-/*
- *     Routine:        ml_init_max_cpus
- *     Function:
- */
-void
-ml_init_max_cpus(unsigned int max_cpus)
-{
-       boolean_t       current_state;
-
-       current_state = ml_set_interrupts_enabled(FALSE);
-       if (max_cpus_initialized != MAX_CPUS_SET) {
-               machine_info.max_cpus = max_cpus;
-               machine_info.physical_cpu_max = max_cpus;
-               machine_info.logical_cpu_max = max_cpus;
-               if (max_cpus_initialized == MAX_CPUS_WAIT) {
-                       thread_wakeup((event_t) &max_cpus_initialized);
-               }
-               max_cpus_initialized = MAX_CPUS_SET;
-       }
-       (void) ml_set_interrupts_enabled(current_state);
-}
-
-/*
- *     Routine:        ml_get_max_cpus
- *     Function:
- */
-unsigned int
-ml_get_max_cpus(void)
-{
-       boolean_t       current_state;
-
-       current_state = ml_set_interrupts_enabled(FALSE);
-       if (max_cpus_initialized != MAX_CPUS_SET) {
-               max_cpus_initialized = MAX_CPUS_WAIT;
-               assert_wait((event_t) &max_cpus_initialized, THREAD_UNINT);
-               (void) thread_block(THREAD_CONTINUE_NULL);
-       }
-       (void) ml_set_interrupts_enabled(current_state);
-       return machine_info.max_cpus;
-}
 
 /*
  *      Routine:        ml_init_lock_timeout
@@ -841,6 +531,30 @@ ml_init_lock_timeout(void)
                nanoseconds_to_absolutetime(10 * NSEC_PER_USEC, &abstime);
        }
        MutexSpin = abstime;
+       low_MutexSpin = MutexSpin;
+
+
+       /*
+        * high_MutexSpin should be initialized as low_MutexSpin * real_ncpus, but
+        * real_ncpus is not set at this time
+        *
+        * NOTE: active spinning is disabled in arm. It can be activated
+        * by setting high_MutexSpin through the sysctl.
+        */
+       high_MutexSpin = low_MutexSpin;
+
+       nanoseconds_to_absolutetime(MAX_WFE_HINT_INTERVAL_US * NSEC_PER_USEC, &ml_wfe_hint_max_interval);
+}
+
+/*
+ * This is called when all of the ml_processor_info_t structures have been
+ * initialized and all the processors have been started through processor_start().
+ *
+ * Required by the scheduler subsystem.
+ */
+void
+ml_cpu_init_completed(void)
+{
 }
 
 /*
@@ -999,10 +713,7 @@ ml_install_interrupt_handler(
        cpu_data_ptr->interrupt_handler = handler;
        cpu_data_ptr->interrupt_refCon = refCon;
 
-       cpu_data_ptr->interrupts_enabled = TRUE;
        (void) ml_set_interrupts_enabled(current_state);
-
-       initialize_screen(NULL, kPEAcquireScreen);
 }
 
 /*
@@ -1046,6 +757,85 @@ ml_init_timebase(
        }
 }
 
+#define ML_READPROP_MANDATORY UINT64_MAX
+
+static uint64_t
+ml_readprop(const DTEntry entry, const char *propertyName, uint64_t default_value)
+{
+       void const *prop;
+       unsigned int propSize;
+
+       if (SecureDTGetProperty(entry, propertyName, &prop, &propSize) == kSuccess) {
+               if (propSize == sizeof(uint8_t)) {
+                       return *((uint8_t const *)prop);
+               } else if (propSize == sizeof(uint16_t)) {
+                       return *((uint16_t const *)prop);
+               } else if (propSize == sizeof(uint32_t)) {
+                       return *((uint32_t const *)prop);
+               } else if (propSize == sizeof(uint64_t)) {
+                       return *((uint64_t const *)prop);
+               } else {
+                       panic("CPU property '%s' has bad size %u", propertyName, propSize);
+               }
+       } else {
+               if (default_value == ML_READPROP_MANDATORY) {
+                       panic("Missing mandatory property '%s'", propertyName);
+               }
+               return default_value;
+       }
+}
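
A short sketch of how this helper is used by the topology parser below; the wrapper function is illustrative only, but the property names and the ML_READPROP_MANDATORY behaviour match ml_parse_cpu_topology():

    /* Illustrative only: read a couple of per-CPU properties from a device-tree node. */
    static void
    example_read_cpu_props(const DTEntry cpu_node)
    {
        /* A missing "reg" property is fatal: ML_READPROP_MANDATORY turns absence into a panic. */
        uint32_t phys_id = (uint32_t)ml_readprop(cpu_node, "reg", ML_READPROP_MANDATORY);

        /* Optional properties quietly fall back to the supplied default (0 here). */
        uint32_t l2_size = (uint32_t)ml_readprop(cpu_node, "l2-cache-size", 0);

        (void)phys_id;
        (void)l2_size;
    }
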
+
+static boolean_t
+ml_read_reg_range(const DTEntry entry, const char *propertyName, uint64_t *pa_ptr, uint64_t *len_ptr)
+{
+       uint64_t const *prop;
+       unsigned int propSize;
+
+       if (SecureDTGetProperty(entry, propertyName, (void const **)&prop, &propSize) != kSuccess) {
+               return FALSE;
+       }
+
+       if (propSize != sizeof(uint64_t) * 2) {
+               panic("Wrong property size for %s", propertyName);
+       }
+
+       *pa_ptr = prop[0];
+       *len_ptr = prop[1];
+       return TRUE;
+}
+
+static boolean_t
+ml_is_boot_cpu(const DTEntry entry)
+{
+       void const *prop;
+       unsigned int propSize;
+
+       if (SecureDTGetProperty(entry, "state", &prop, &propSize) != kSuccess) {
+               panic("unable to retrieve state for cpu");
+       }
+
+       if (strncmp((char const *)prop, "running", propSize) == 0) {
+               return TRUE;
+       } else {
+               return FALSE;
+       }
+}
+
+static void
+ml_read_chip_revision(unsigned int *rev __unused)
+{
+       // The CPU_VERSION_* macros are only defined on APPLE_ARM64_ARCH_FAMILY builds
+#ifdef APPLE_ARM64_ARCH_FAMILY
+       DTEntry         entryP;
+
+       if ((SecureDTFindEntry("name", "arm-io", &entryP) == kSuccess)) {
+               *rev = (unsigned int)ml_readprop(entryP, "chip-revision", CPU_VERSION_UNKNOWN);
+       } else {
+               *rev = CPU_VERSION_UNKNOWN;
+       }
+#endif
+}
+
 void
 ml_parse_cpu_topology(void)
 {
@@ -1054,59 +844,148 @@ ml_parse_cpu_topology(void)
        uint32_t cpu_boot_arg;
        int err;
 
+       int64_t cluster_phys_to_logical[MAX_CPU_CLUSTER_PHY_ID + 1];
+       int64_t cluster_max_cpu_phys_id[MAX_CPU_CLUSTER_PHY_ID + 1];
        cpu_boot_arg = MAX_CPUS;
-
        PE_parse_boot_argn("cpus", &cpu_boot_arg, sizeof(cpu_boot_arg));
 
-       err = DTLookupEntry(NULL, "/cpus", &entry);
+       err = SecureDTLookupEntry(NULL, "/cpus", &entry);
        assert(err == kSuccess);
 
-       err = DTInitEntryIterator(entry, &iter);
+       err = SecureDTInitEntryIterator(entry, &iter);
        assert(err == kSuccess);
 
-       while (kSuccess == DTIterateEntries(&iter, &child)) {
-               unsigned int propSize;
-               void *prop = NULL;
-               int cpu_id = avail_cpus++;
+       for (int i = 0; i <= MAX_CPU_CLUSTER_PHY_ID; i++) {
+               cluster_offsets[i] = -1;
+               cluster_phys_to_logical[i] = -1;
+               cluster_max_cpu_phys_id[i] = 0;
+       }
+
+       while (kSuccess == SecureDTIterateEntries(&iter, &child)) {
+               boolean_t is_boot_cpu = ml_is_boot_cpu(child);
 
-               if (kSuccess == DTGetProperty(child, "cpu-id", &prop, &propSize)) {
-                       cpu_id = *((int32_t*)prop);
+               // If the number of CPUs is constrained by the cpus= boot-arg, and the boot CPU hasn't
+               // been added to the topology struct yet, and we only have one slot left, then skip
+               // every other non-boot CPU in order to leave room for the boot CPU.
+               //
+               // e.g. if the boot-args say "cpus=3" and CPU4 is the boot CPU, then the cpus[]
+               // array will list CPU0, CPU1, and CPU4.  CPU2-CPU3 and CPU5-CPUn will be omitted.
+               if (topology_info.num_cpus >= (cpu_boot_arg - 1) && topology_info.boot_cpu == NULL && !is_boot_cpu) {
+                       continue;
+               }
+               if (topology_info.num_cpus >= cpu_boot_arg) {
+                       break;
                }
 
-               assert(cpu_id < MAX_CPUS);
-               assert(cpu_phys_ids[cpu_id] == (uint32_t)-1);
+               ml_topology_cpu_t *cpu = &topology_info.cpus[topology_info.num_cpus];
 
-               if (boot_cpu == -1) {
-                       if (kSuccess != DTGetProperty(child, "state", &prop, &propSize)) {
-                               panic("unable to retrieve state for cpu %d", cpu_id);
-                       }
+               cpu->cpu_id = topology_info.num_cpus++;
+               assert(cpu->cpu_id < MAX_CPUS);
+               topology_info.max_cpu_id = MAX(topology_info.max_cpu_id, cpu->cpu_id);
 
-                       if (strncmp((char*)prop, "running", propSize) == 0) {
-                               boot_cpu = cpu_id;
-                       }
-               }
-               if (kSuccess != DTGetProperty(child, "reg", &prop, &propSize)) {
-                       panic("unable to retrieve physical ID for cpu %d", cpu_id);
+               cpu->die_id = (int)ml_readprop(child, "die-id", 0);
+               topology_info.max_die_id = MAX(topology_info.max_die_id, cpu->die_id);
+
+               cpu->phys_id = (uint32_t)ml_readprop(child, "reg", ML_READPROP_MANDATORY);
+
+               cpu->l2_access_penalty = (uint32_t)ml_readprop(child, "l2-access-penalty", 0);
+               cpu->l2_cache_size = (uint32_t)ml_readprop(child, "l2-cache-size", 0);
+               cpu->l2_cache_id = (uint32_t)ml_readprop(child, "l2-cache-id", 0);
+               cpu->l3_cache_size = (uint32_t)ml_readprop(child, "l3-cache-size", 0);
+               cpu->l3_cache_id = (uint32_t)ml_readprop(child, "l3-cache-id", 0);
+
+               ml_read_reg_range(child, "cpu-uttdbg-reg", &cpu->cpu_UTTDBG_pa, &cpu->cpu_UTTDBG_len);
+               ml_read_reg_range(child, "cpu-impl-reg", &cpu->cpu_IMPL_pa, &cpu->cpu_IMPL_len);
+               ml_read_reg_range(child, "coresight-reg", &cpu->coresight_pa, &cpu->coresight_len);
+               cpu->cluster_type = CLUSTER_TYPE_SMP;
+
+               int cluster_type = (int)ml_readprop(child, "cluster-type", 0);
+               if (cluster_type == 'E') {
+                       cpu->cluster_type = CLUSTER_TYPE_E;
+               } else if (cluster_type == 'P') {
+                       cpu->cluster_type = CLUSTER_TYPE_P;
                }
 
-               cpu_phys_ids[cpu_id] = *((uint32_t*)prop);
+               /*
+                * Since we want to keep a linear cluster ID space, we cannot just rely
+                * on the value provided by EDT. Instead, use the MPIDR value to see if we have
+                * seen this exact cluster before. If so, then reuse that cluster ID for this CPU.
+                */
+#if HAS_CLUSTER
+               uint32_t phys_cluster_id = MPIDR_CLUSTER_ID(cpu->phys_id);
+#else
+               uint32_t phys_cluster_id = (cpu->cluster_type == CLUSTER_TYPE_P);
+#endif
+               assert(phys_cluster_id <= MAX_CPU_CLUSTER_PHY_ID);
+               cpu->cluster_id = ((cluster_phys_to_logical[phys_cluster_id] == -1) ?
+                   topology_info.num_clusters : cluster_phys_to_logical[phys_cluster_id]);
+
+               assert(cpu->cluster_id < MAX_CPU_CLUSTERS);
+
+               ml_topology_cluster_t *cluster = &topology_info.clusters[cpu->cluster_id];
+               if (cluster->num_cpus == 0) {
+                       assert(topology_info.num_clusters < MAX_CPU_CLUSTERS);
+
+                       topology_info.num_clusters++;
+                       topology_info.max_cluster_id = MAX(topology_info.max_cluster_id, cpu->cluster_id);
+
+                       cluster->cluster_id = cpu->cluster_id;
+                       cluster->cluster_type = cpu->cluster_type;
+                       cluster->first_cpu_id = cpu->cpu_id;
+                       assert(cluster_phys_to_logical[phys_cluster_id] == -1);
+                       cluster_phys_to_logical[phys_cluster_id] = cpu->cluster_id;
+
+                       // Since we don't have a per-cluster EDT node, this is repeated in each CPU node.
+                       // If we wind up with a bunch of these, we might want to create separate per-cluster
+                       // EDT nodes and have the CPU nodes reference them through a phandle.
+                       ml_read_reg_range(child, "acc-impl-reg", &cluster->acc_IMPL_pa, &cluster->acc_IMPL_len);
+                       ml_read_reg_range(child, "cpm-impl-reg", &cluster->cpm_IMPL_pa, &cluster->cpm_IMPL_len);
+               }
 
-               if ((cpu_id > max_cpu_number) && ((cpu_id == boot_cpu) || (avail_cpus <= cpu_boot_arg))) {
-                       max_cpu_number = cpu_id;
+#if HAS_CLUSTER
+               if (MPIDR_CPU_ID(cpu->phys_id) > cluster_max_cpu_phys_id[phys_cluster_id]) {
+                       cluster_max_cpu_phys_id[phys_cluster_id] = MPIDR_CPU_ID(cpu->phys_id);
                }
-       }
+#endif
 
-       if (avail_cpus > cpu_boot_arg) {
-               avail_cpus = cpu_boot_arg;
-       }
+               cpu->die_cluster_id = (int)ml_readprop(child, "die-cluster-id", MPIDR_CLUSTER_ID(cpu->phys_id));
+               cpu->cluster_core_id = (int)ml_readprop(child, "cluster-core-id", MPIDR_CPU_ID(cpu->phys_id));
+
+               cluster->num_cpus++;
+               cluster->cpu_mask |= 1ULL << cpu->cpu_id;
 
-       if (avail_cpus == 0) {
-               panic("No cpus found!");
+               if (is_boot_cpu) {
+                       assert(topology_info.boot_cpu == NULL);
+                       topology_info.boot_cpu = cpu;
+                       topology_info.boot_cluster = cluster;
+               }
        }
 
-       if (boot_cpu == -1) {
-               panic("unable to determine boot cpu!");
+#if HAS_CLUSTER
+       /*
+        * Build the cluster offset array, ensuring that the region reserved
+        * for each physical cluster contains enough entries to be indexed
+        * by the maximum physical CPU ID (AFF0) within the cluster.
+        */
+       unsigned int cur_cluster_offset = 0;
+       for (int i = 0; i <= MAX_CPU_CLUSTER_PHY_ID; i++) {
+               if (cluster_phys_to_logical[i] != -1) {
+                       cluster_offsets[i] = cur_cluster_offset;
+                       cur_cluster_offset += (cluster_max_cpu_phys_id[i] + 1);
+               }
        }
+       assert(cur_cluster_offset <= MAX_CPUS);
+#else
+       /*
+        * For H10, there are really 2 physical clusters, but they are not separated
+        * into distinct ACCs.  AFF1 therefore always reports 0, and AFF0 numbering
+        * is linear across both clusters.   For the purpose of MPIDR_EL1-based indexing,
+        * treat H10 and earlier devices as though they contain a single cluster.
+        */
+       cluster_offsets[0] = 0;
+#endif
+       assert(topology_info.boot_cpu != NULL);
+       ml_read_chip_revision(&topology_info.chip_revision);
 
        /*
         * Set TPIDRRO_EL0 to indicate the correct cpu number, as we may
@@ -1116,95 +995,162 @@ ml_parse_cpu_topology(void)
         * per-cpu data object.
         */
        assert(__builtin_arm_rsr64("TPIDRRO_EL0") == 0);
-       __builtin_arm_wsr64("TPIDRRO_EL0", (uint64_t)boot_cpu);
+       __builtin_arm_wsr64("TPIDRRO_EL0", (uint64_t)topology_info.boot_cpu->cpu_id);
+}
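
A worked example of the HAS_CLUSTER offset construction above, with invented inputs: if physical clusters 0 and 1 are both present, the largest AFF0 seen in cluster 0 is 3 and in cluster 1 is 1, then the loop reserves four slots for cluster 0 followed by two for cluster 1:

    #include <assert.h>

    int main(void)
    {
        /* Invented discovery results for two physical clusters. */
        long long cluster_phys_to_logical[2] = { 0, 1 };  /* both clusters were seen */
        long long cluster_max_cpu_phys_id[2] = { 3, 1 };  /* largest AFF0 per cluster */
        long long cluster_offsets[2];
        unsigned int cur_cluster_offset = 0;

        for (int i = 0; i < 2; i++) {
            if (cluster_phys_to_logical[i] != -1) {
                cluster_offsets[i] = cur_cluster_offset;
                cur_cluster_offset += cluster_max_cpu_phys_id[i] + 1;
            }
        }

        /* Cluster 0 occupies indices 0..3, cluster 1 occupies indices 4..5. */
        assert(cluster_offsets[0] == 0);
        assert(cluster_offsets[1] == 4);
        assert(cur_cluster_offset == 6);
        return 0;
    }
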
+
+const ml_topology_info_t *
+ml_get_topology_info(void)
+{
+       return &topology_info;
+}
+
+void
+ml_map_cpu_pio(void)
+{
+       unsigned int i;
+
+       for (i = 0; i < topology_info.num_cpus; i++) {
+               ml_topology_cpu_t *cpu = &topology_info.cpus[i];
+               if (cpu->cpu_IMPL_pa) {
+                       cpu->cpu_IMPL_regs = (vm_offset_t)ml_io_map(cpu->cpu_IMPL_pa, cpu->cpu_IMPL_len);
+                       cpu->coresight_regs = (vm_offset_t)ml_io_map(cpu->coresight_pa, cpu->coresight_len);
+               }
+               if (cpu->cpu_UTTDBG_pa) {
+                       cpu->cpu_UTTDBG_regs = (vm_offset_t)ml_io_map(cpu->cpu_UTTDBG_pa, cpu->cpu_UTTDBG_len);
+               }
+       }
+
+       for (i = 0; i < topology_info.num_clusters; i++) {
+               ml_topology_cluster_t *cluster = &topology_info.clusters[i];
+               if (cluster->acc_IMPL_pa) {
+                       cluster->acc_IMPL_regs = (vm_offset_t)ml_io_map(cluster->acc_IMPL_pa, cluster->acc_IMPL_len);
+               }
+               if (cluster->cpm_IMPL_pa) {
+                       cluster->cpm_IMPL_regs = (vm_offset_t)ml_io_map(cluster->cpm_IMPL_pa, cluster->cpm_IMPL_len);
+               }
+       }
 }
 
 unsigned int
 ml_get_cpu_count(void)
 {
-       return avail_cpus;
+       return topology_info.num_cpus;
+}
+
+unsigned int
+ml_get_cluster_count(void)
+{
+       return topology_info.num_clusters;
 }
 
 int
 ml_get_boot_cpu_number(void)
 {
-       return boot_cpu;
+       return topology_info.boot_cpu->cpu_id;
 }
 
 cluster_type_t
 ml_get_boot_cluster(void)
 {
-       return boot_cluster;
+       return topology_info.boot_cluster->cluster_type;
 }
 
 int
 ml_get_cpu_number(uint32_t phys_id)
 {
-       for (int log_id = 0; log_id <= ml_get_max_cpu_number(); ++log_id) {
-               if (cpu_phys_ids[log_id] == phys_id) {
-                       return log_id;
+       phys_id &= MPIDR_AFF1_MASK | MPIDR_AFF0_MASK;
+
+       for (unsigned i = 0; i < topology_info.num_cpus; i++) {
+               if (topology_info.cpus[i].phys_id == phys_id) {
+                       return i;
                }
        }
+
        return -1;
 }
 
+int
+ml_get_cluster_number(uint32_t phys_id)
+{
+       int cpu_id = ml_get_cpu_number(phys_id);
+       if (cpu_id < 0) {
+               return -1;
+       }
+
+       ml_topology_cpu_t *cpu = &topology_info.cpus[cpu_id];
+
+       return cpu->cluster_id;
+}
+
+unsigned int
+ml_get_cpu_number_local(void)
+{
+       uint64_t mpidr_el1_value = 0;
+       unsigned cpu_id;
+
+       /* We identify the CPU based on the constant bits of MPIDR_EL1. */
+       MRS(mpidr_el1_value, "MPIDR_EL1");
+       cpu_id = ml_get_cpu_number((uint32_t)mpidr_el1_value);
+
+       assert(cpu_id <= (unsigned int)ml_get_max_cpu_number());
+
+       return cpu_id;
+}
+
+int
+ml_get_cluster_number_local()
+{
+       uint64_t mpidr_el1_value = 0;
+       unsigned cluster_id;
+
+       /* We identify the cluster based on the constant bits of MPIDR_EL1. */
+       MRS(mpidr_el1_value, "MPIDR_EL1");
+       cluster_id = ml_get_cluster_number((uint32_t)mpidr_el1_value);
+
+       assert(cluster_id <= (unsigned int)ml_get_max_cluster_number());
+
+       return cluster_id;
+}
+
 int
 ml_get_max_cpu_number(void)
 {
-       return max_cpu_number;
+       return topology_info.max_cpu_id;
 }
 
+int
+ml_get_max_cluster_number(void)
+{
+       return topology_info.max_cluster_id;
+}
+
+unsigned int
+ml_get_first_cpu_id(unsigned int cluster_id)
+{
+       return topology_info.clusters[cluster_id].first_cpu_id;
+}
 
 void
 ml_lockdown_init()
 {
-       lockdown_handler_grp = lck_grp_alloc_init("lockdown_handler", NULL);
-       assert(lockdown_handler_grp != NULL);
-
-       lck_mtx_init(&lockdown_handler_lck, lockdown_handler_grp, NULL);
-
-#if defined(KERNEL_INTEGRITY_CTRR)
-       init_ctrr_cpu_start_lock();
+#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
+       rorgn_stash_range();
 #endif
 }
 
 kern_return_t
 ml_lockdown_handler_register(lockdown_handler_t f, void *this)
 {
-       if (lockdown_handler || !f) {
+       if (!f) {
                return KERN_FAILURE;
        }
 
-       lck_mtx_lock(&lockdown_handler_lck);
-       lockdown_handler = f;
-       lockdown_this = this;
-
-#if !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR))
-       lockdown_done = 1;
-       lockdown_handler(this);
-#else
-       if (lockdown_done) {
-               lockdown_handler(this);
-       }
-#endif
-       lck_mtx_unlock(&lockdown_handler_lck);
+       assert(lockdown_done);
+       f(this); // XXX: f this whole function
 
        return KERN_SUCCESS;
 }
 
-void
-ml_lockdown_run_handler()
-{
-       lck_mtx_lock(&lockdown_handler_lck);
-       assert(!lockdown_done);
-
-       lockdown_done = 1;
-       if (lockdown_handler) {
-               lockdown_handler(lockdown_this);
-       }
-       lck_mtx_unlock(&lockdown_handler_lck);
-}
-
 kern_return_t
 ml_processor_register(ml_processor_info_t *in_processor_info,
     processor_t *processor_out, ipi_handler_t *ipi_handler_out,
@@ -1219,7 +1165,7 @@ ml_processor_register(ml_processor_info_t *in_processor_info,
                return KERN_FAILURE;
        }
 
-       if ((unsigned int)OSIncrementAtomic((SInt32*)&reg_cpu_count) >= avail_cpus) {
+       if ((unsigned)OSIncrementAtomic((SInt32*)&reg_cpu_count) >= topology_info.num_cpus) {
                return KERN_FAILURE;
        }
 
@@ -1232,7 +1178,7 @@ ml_processor_register(ml_processor_info_t *in_processor_info,
                is_boot_cpu = TRUE;
        }
 
-       assert(in_processor_info->log_id < MAX_CPUS);
+       assert(in_processor_info->log_id <= (uint32_t)ml_get_max_cpu_number());
 
        this_cpu_datap->cpu_id = in_processor_info->cpu_id;
 
@@ -1242,22 +1188,22 @@ ml_processor_register(ml_processor_info_t *in_processor_info,
        }
 
        if (!is_boot_cpu) {
-               this_cpu_datap->cpu_number = in_processor_info->log_id;
+               this_cpu_datap->cpu_number = (unsigned short)(in_processor_info->log_id);
 
                if (cpu_data_register(this_cpu_datap) != KERN_SUCCESS) {
                        goto processor_register_error;
                }
        }
 
-       this_cpu_datap->cpu_idle_notify = (void *) in_processor_info->processor_idle;
-       this_cpu_datap->cpu_cache_dispatch = in_processor_info->platform_cache_dispatch;
+       this_cpu_datap->cpu_idle_notify = in_processor_info->processor_idle;
+       this_cpu_datap->cpu_cache_dispatch = (cache_dispatch_t)in_processor_info->platform_cache_dispatch;
        nanoseconds_to_absolutetime((uint64_t) in_processor_info->powergate_latency, &this_cpu_datap->cpu_idle_latency);
        this_cpu_datap->cpu_reset_assist = kvtophys(in_processor_info->powergate_stub_addr);
 
-       this_cpu_datap->idle_timer_notify = (void *) in_processor_info->idle_timer;
+       this_cpu_datap->idle_timer_notify = in_processor_info->idle_timer;
        this_cpu_datap->idle_timer_refcon = in_processor_info->idle_timer_refcon;
 
-       this_cpu_datap->platform_error_handler = (void *) in_processor_info->platform_error_handler;
+       this_cpu_datap->platform_error_handler = in_processor_info->platform_error_handler;
        this_cpu_datap->cpu_regmap_paddr = in_processor_info->regmap_paddr;
        this_cpu_datap->cpu_phys_id = in_processor_info->phys_id;
        this_cpu_datap->cpu_l2_access_penalty = in_processor_info->l2_access_penalty;
@@ -1275,13 +1221,50 @@ ml_processor_register(ml_processor_info_t *in_processor_info,
        this_cpu_datap->cluster_master = is_boot_cpu;
 #endif /* HAS_CLUSTER */
 
+#if !defined(RC_HIDE_XNU_FIRESTORM) && (MAX_CPU_CLUSTERS > 2)
+       {
+               /* Workaround for the existing scheduler
+                * code, which only supports a limited number of psets.
+                *
+                * To get around that limitation, we distribute all cores into
+                * two psets according to their cluster type, instead of
+                * having a dedicated pset per cluster ID.
+                */
+
+               pset_cluster_type_t pset_cluster_type;
+
+               /* For this workaround, we don't expect to see anything other
+                * than E or P clusters. */

+               switch (in_processor_info->cluster_type) {
+               case CLUSTER_TYPE_E:
+                       pset_cluster_type = PSET_AMP_E;
+                       break;
+               case CLUSTER_TYPE_P:
+                       pset_cluster_type = PSET_AMP_P;
+                       break;
+               default:
+                       panic("unknown/unsupported cluster type %d", in_processor_info->cluster_type);
+               }
+
+               pset = pset_find_first_by_cluster_type(pset_cluster_type);
+
+               if (pset == NULL) {
+                       panic("no pset for cluster type %d/%d", in_processor_info->cluster_type, pset_cluster_type);
+               }
+
+               kprintf("%s>chosen pset with cluster id %d cluster type %d for core:\n",
+                   __FUNCTION__, pset->pset_cluster_id, pset->pset_cluster_type);
+       }
+#else /* !defined(RC_HIDE_XNU_FIRESTORM) && (MAX_CPU_CLUSTERS > 2) */
        pset = pset_find(in_processor_info->cluster_id, processor_pset(master_processor));
+#endif /* !defined(RC_HIDE_XNU_FIRESTORM) && (MAX_CPU_CLUSTERS > 2) */
+
        assert(pset != NULL);
        kprintf("%s>cpu_id %p cluster_id %d cpu_number %d is type %d\n", __FUNCTION__, in_processor_info->cpu_id, in_processor_info->cluster_id, this_cpu_datap->cpu_number, in_processor_info->cluster_type);
 
+       processor_t processor = PERCPU_GET_RELATIVE(processor, cpu_data, this_cpu_datap);
        if (!is_boot_cpu) {
-               processor_init((struct processor *)this_cpu_datap->cpu_processor,
-                   this_cpu_datap->cpu_number, pset);
+               processor_init(processor, this_cpu_datap->cpu_number, pset);
 
                if (this_cpu_datap->cpu_l2_access_penalty) {
                        /*
@@ -1290,12 +1273,11 @@ ml_processor_register(ml_processor_info_t *in_processor_info,
                         * scheduler, so that threads use the cores with better L2
                         * preferentially.
                         */
-                       processor_set_primary(this_cpu_datap->cpu_processor,
-                           master_processor);
+                       processor_set_primary(processor, master_processor);
                }
        }
 
-       *processor_out = this_cpu_datap->cpu_processor;
+       *processor_out = processor;
        *ipi_handler_out = cpu_signal_handler;
 #if CPMU_AIC_PMI && MONOTONIC
        *pmi_handler_out = mt_cpmu_aic_pmi;
@@ -1410,6 +1392,13 @@ ml_io_map_wcomb(
        return io_map(phys_addr, size, VM_WIMG_WCOMB);
 }
 
+void
+ml_io_unmap(vm_offset_t addr, vm_size_t sz)
+{
+       pmap_remove(kernel_pmap, addr, addr + sz);
+       kmem_free(kernel_map, addr, sz);
+}
+
 /* boot memory allocation */
 vm_offset_t
 ml_static_malloc(
@@ -1437,14 +1426,30 @@ vm_offset_t
 ml_static_slide(
        vm_offset_t vaddr)
 {
-       return phystokv(vaddr + vm_kernel_slide - gVirtBase + gPhysBase);
+       vm_offset_t slid_vaddr = vaddr + vm_kernel_slide;
+
+       if ((slid_vaddr < vm_kernelcache_base) || (slid_vaddr >= vm_kernelcache_top)) {
+               /* This is only intended for use on kernelcache addresses. */
+               return 0;
+       }
+
+       /*
+        * Because the address is in the kernelcache, we can do a simple
+        * slide calculation.
+        */
+       return slid_vaddr;
 }
 
 vm_offset_t
 ml_static_unslide(
        vm_offset_t vaddr)
 {
-       return ml_static_vtop(vaddr) - gPhysBase + gVirtBase - vm_kernel_slide;
+       if ((vaddr < vm_kernelcache_base) || (vaddr >= vm_kernelcache_top)) {
+               /* This is only intended for use on kernelcache addresses. */
+               return 0;
+       }
+
+       return vaddr - vm_kernel_slide;
 }
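
A hedged sketch of the new contract for ml_static_slide()/ml_static_unslide(): both now return 0 for addresses outside [vm_kernelcache_base, vm_kernelcache_top), so a caller (this one is hypothetical, not xnu code) should treat 0 as "not a kernelcache address":

    /* Illustrative caller only; not an xnu function. */
    static vm_offset_t
    example_slide_checked(vm_offset_t static_addr)
    {
        vm_offset_t slid = ml_static_slide(static_addr);
        if (slid == 0) {
            panic("%p is not a kernelcache address", (void *)static_addr);
        }
        return slid;
    }
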
 
 extern tt_entry_t *arm_kva_to_tte(vm_offset_t va);
@@ -1471,6 +1476,9 @@ ml_static_protect(
        if ((new_prot & VM_PROT_WRITE) && (new_prot & VM_PROT_EXECUTE)) {
                panic("ml_static_protect(): WX request on %p", (void *) vaddr);
        }
+       if (lockdown_done && (new_prot & VM_PROT_EXECUTE)) {
+               panic("ml_static_protect(): attempt to inject executable mapping on %p", (void *) vaddr);
+       }
 
        /* Set up the protection bits, and block bits so we can validate block mappings. */
        if (new_prot & VM_PROT_WRITE) {
@@ -1499,8 +1507,8 @@ ml_static_protect(
                        pt_entry_t      ptmp;
 
 #if XNU_MONITOR
-                       assert(!TEST_PAGE_RATIO_4);
                        assert(!pmap_is_monitor(ppn));
+                       assert(!TEST_PAGE_RATIO_4);
 #endif
 
                        tte2 = arm_kva_to_tte(vaddr_cur);
@@ -1552,7 +1560,6 @@ ml_static_protect(
                                }
                        } else {
                                ptmp = *pte_p;
-
                                /* We only need to update the page tables if the protections do not match. */
                                if ((ptmp & (ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) != arm_prot) {
                                        ptmp = (ptmp & ~(ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) | arm_prot;
@@ -1579,11 +1586,25 @@ ml_static_protect(
 void
 ml_static_mfree(
        vm_offset_t vaddr,
-       vm_size_t size)
+       vm_size_t   size)
 {
-       vm_offset_t     vaddr_cur;
-       ppnum_t         ppn;
-       uint32_t freed_pages = 0;
+       vm_offset_t vaddr_cur;
+       ppnum_t     ppn;
+       uint32_t    freed_pages = 0;
+       uint32_t    bad_page_cnt = 0;
+       uint32_t    freed_kernelcache_pages = 0;
+
+#if defined(__arm64__) && (DEVELOPMENT || DEBUG)
+       /* For testing hitting a bad ram page */
+       static int count = 0;
+       static int bad_at_cnt = -1;
+       static bool first = true;
+
+       if (first) {
+               (void)PE_parse_boot_argn("bad_static_mfree", &bad_at_cnt, sizeof(bad_at_cnt));
+               first = false;
+       }
+#endif /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
 
        /* It is acceptable (if bad) to fail to free. */
        if (vaddr < VM_MIN_KERNEL_ADDRESS) {
@@ -1607,24 +1628,33 @@ ml_static_mfree(
                                panic("Failed ml_static_mfree on %p", (void *) vaddr_cur);
                        }
 
-#if 0
-                       /*
-                        * Must NOT tear down the "V==P" mapping for vaddr_cur as the zone alias scheme
-                        * relies on the persistence of these mappings for all time.
-                        */
-                       // pmap_remove(kernel_pmap, (addr64_t) vaddr_cur, (addr64_t) (vaddr_cur + PAGE_SIZE));
-#endif
+#if defined(__arm64__)
+                       bool is_bad = pmap_is_bad_ram(ppn);
+#if DEVELOPMENT || DEBUG
+                       is_bad |= (count++ == bad_at_cnt);
+#endif /* DEVELOPMENT || DEBUG */
+
+                       if (is_bad) {
+                               ++bad_page_cnt;
+                               vm_page_create_retired(ppn);
+                               continue;
+                       }
+#endif /* defined(__arm64__) */
 
                        vm_page_create(ppn, (ppn + 1));
                        freed_pages++;
+                       if (vaddr_cur >= segLOWEST && vaddr_cur < end_kern) {
+                               freed_kernelcache_pages++;
+                       }
                }
        }
        vm_page_lockspin_queues();
        vm_page_wire_count -= freed_pages;
        vm_page_wire_count_initial -= freed_pages;
+       vm_page_kernelcache_count -= freed_kernelcache_pages;
        vm_page_unlock_queues();
 #if     DEBUG
-       kprintf("ml_static_mfree: Released 0x%x pages at VA %p, size:0x%llx, last ppn: 0x%x\n", freed_pages, (void *)vaddr, (uint64_t)size, ppn);
+       kprintf("ml_static_mfree: Released 0x%x pages at VA %p, size:0x%llx, last ppn: 0x%x, +%d bad\n", freed_pages, (void *)vaddr, (uint64_t)size, ppn, bad_page_cnt);
 #endif
 }
 
@@ -1822,9 +1852,9 @@ ml_set_decrementer(uint32_t dec_value)
        cdp->cpu_decrementer = dec_value;
 
        if (cdp->cpu_set_decrementer_func) {
-               ((void (*)(uint32_t))cdp->cpu_set_decrementer_func)(dec_value);
+               cdp->cpu_set_decrementer_func(dec_value);
        } else {
-               __asm__ volatile ("msr CNTP_TVAL_EL0, %0" : : "r"((uint64_t)dec_value));
+               __builtin_arm_wsr64("CNTV_TVAL_EL0", (uint64_t)dec_value);
        }
 }
 
@@ -1834,10 +1864,10 @@ ml_get_hwclock()
        uint64_t timebase;
 
        // ISB required by ARMV7C.b section B8.1.2 & ARMv8 section D6.1.2
-       // "Reads of CNTPCT[_EL0] can occur speculatively and out of order relative
+       // "Reads of CNT[PV]CT[_EL0] can occur speculatively and out of order relative
        // to other instructions executed on the same processor."
        __builtin_arm_isb(ISB_SY);
-       timebase = __builtin_arm_rsr64("CNTPCT_EL0");
+       timebase = __builtin_arm_rsr64("CNTVCT_EL0");
 
        return timebase;
 }
@@ -1848,6 +1878,25 @@ ml_get_timebase()
        return ml_get_hwclock() + getCpuDatap()->cpu_base_timebase;
 }
 
+/*
+ * Get the speculative timebase without an ISB.
+ */
+uint64_t
+ml_get_speculative_timebase()
+{
+       uint64_t timebase;
+
+       timebase = __builtin_arm_rsr64("CNTVCT_EL0");
+
+       return timebase + getCpuDatap()->cpu_base_timebase;
+}
+
+uint64_t
+ml_get_timebase_entropy(void)
+{
+       return ml_get_speculative_timebase();
+}
+
 uint32_t
 ml_get_decrementer()
 {
@@ -1857,11 +1906,11 @@ ml_get_decrementer()
        assert(ml_get_interrupts_enabled() == FALSE);
 
        if (cdp->cpu_get_decrementer_func) {
-               dec = ((uint32_t (*)(void))cdp->cpu_get_decrementer_func)();
+               dec = cdp->cpu_get_decrementer_func();
        } else {
                uint64_t wide_val;
 
-               __asm__ volatile ("mrs %0, CNTP_TVAL_EL0" : "=r"(wide_val));
+               wide_val = __builtin_arm_rsr64("CNTV_TVAL_EL0");
                dec = (uint32_t)wide_val;
                assert(wide_val == (uint64_t)dec);
        }
@@ -1872,24 +1921,8 @@ ml_get_decrementer()
 boolean_t
 ml_get_timer_pending()
 {
-       uint64_t cntp_ctl;
-
-       __asm__ volatile ("mrs %0, CNTP_CTL_EL0" : "=r"(cntp_ctl));
-       return ((cntp_ctl & CNTP_CTL_EL0_ISTATUS) != 0) ? TRUE : FALSE;
-}
-
-boolean_t
-ml_wants_panic_trap_to_debugger(void)
-{
-       boolean_t result = FALSE;
-#if XNU_MONITOR
-       /*
-        * This looks racey, but if we are in the PPL, preemption will be
-        * disabled.
-        */
-       result = ((pmap_get_cpu_data()->ppl_state == PPL_STATE_DISPATCH) && pmap_ppl_locked_down);
-#endif
-       return result;
+       uint64_t cntv_ctl = __builtin_arm_rsr64("CNTV_CTL_EL0");
+       return ((cntv_ctl & CNTV_CTL_EL0_ISTATUS) != 0) ? TRUE : FALSE;
 }
 
 static void
@@ -1907,7 +1940,7 @@ cache_trap_error(thread_t thread, vm_map_address_t fault_addr)
 }
 
 static void
-cache_trap_recover()
+cache_trap_recover(void)
 {
        vm_map_address_t fault_addr;
 
@@ -1920,7 +1953,8 @@ static void
 set_cache_trap_recover(thread_t thread)
 {
 #if defined(HAS_APPLE_PAC)
-       thread->recover = (vm_address_t)ptrauth_auth_and_resign(&cache_trap_recover,
+       void *fun = &cache_trap_recover;
+       thread->recover = (vm_address_t)ptrauth_auth_and_resign(fun,
            ptrauth_key_function_pointer, 0,
            ptrauth_key_function_pointer, ptrauth_blend_discriminator(&thread->recover, PAC_DISCRIMINATOR_RECOVER));
 #else /* defined(HAS_APPLE_PAC) */
@@ -2053,13 +2087,13 @@ _enable_timebase_event_stream(uint32_t bit_index)
 
        /*
         * If the SOC supports it (and it isn't broken), enable
-        * EL0 access to the physical timebase register.
+        * EL0 access to the timebase registers.
         */
        if (user_timebase_type() != USER_TIMEBASE_NONE) {
-               cntkctl |= CNTKCTL_EL1_PL0PCTEN;
+               cntkctl |= (CNTKCTL_EL1_PL0PCTEN | CNTKCTL_EL1_PL0VCTEN);
        }
 
-       __asm__ volatile ("msr  CNTKCTL_EL1, %0" : : "r"(cntkctl));
+       __builtin_arm_wsr64("CNTKCTL_EL1", cntkctl);
 }
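Setting PL0PCTEN and PL0VCTEN in CNTKCTL_EL1 is what lets unprivileged code read the counters directly, which fast user-space timebase reads rely on when user_timebase_type() != USER_TIMEBASE_NONE. A hedged user-space sketch of such a read, mirroring the kernel's ISB discipline; it assumes a Clang toolchain targeting arm64 and is not part of this change.

#include <stdint.h>

/* Illustrative EL0 read of the virtual counter, possible once PL0VCTEN is set. */
static inline uint64_t
user_read_virtual_counter(void)
{
	uint64_t cval;

	__asm__ volatile ("isb sy" ::: "memory");       /* same ordering concern as ml_get_hwclock() */
	__asm__ volatile ("mrs %0, CNTVCT_EL0" : "=r"(cval));
	return cval;
}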
 
 /*
@@ -2068,31 +2102,48 @@ _enable_timebase_event_stream(uint32_t bit_index)
 static void
 _enable_virtual_timer(void)
 {
-       uint64_t cntvctl = CNTP_CTL_EL0_ENABLE; /* One wants to use 32 bits, but "mrs" prefers it this way */
+	uint64_t cntvctl = CNTV_CTL_EL0_ENABLE; /* Logically a 32-bit value, but the MRS/MSR accessors operate on 64 bits */
 
-       __asm__ volatile ("msr CNTP_CTL_EL0, %0" : : "r"(cntvctl));
+       __builtin_arm_wsr64("CNTV_CTL_EL0", cntvctl);
+       /* disable the physical timer as a precaution, as its registers reset to architecturally unknown values */
+       __builtin_arm_wsr64("CNTP_CTL_EL0", CNTP_CTL_EL0_IMASKED);
 }
 
-uint64_t events_per_sec = 0;
-
 void
 fiq_context_init(boolean_t enable_fiq __unused)
 {
-       _enable_timebase_event_stream(fiq_eventi);
-
        /* Interrupts still disabled. */
        assert(ml_get_interrupts_enabled() == FALSE);
        _enable_virtual_timer();
 }
 
 void
-fiq_context_bootstrap(boolean_t enable_fiq)
+wfe_timeout_init(void)
+{
+       _enable_timebase_event_stream(arm64_eventi);
+}
+
+void
+wfe_timeout_configure(void)
 {
-#if defined(APPLE_ARM64_ARCH_FAMILY) || defined(BCM2837)
        /* Could fill in our own ops here, if we needed them */
-       uint64_t        ticks_per_sec, ticks_per_event;
+       uint64_t        ticks_per_sec, ticks_per_event, events_per_sec = 0;
        uint32_t        bit_index;
 
+       if (PE_parse_boot_argn("wfe_events_sec", &events_per_sec, sizeof(events_per_sec))) {
+               if (events_per_sec <= 0) {
+                       events_per_sec = 1;
+               } else if (events_per_sec > USEC_PER_SEC) {
+                       events_per_sec = USEC_PER_SEC;
+               }
+       } else {
+#if defined(ARM_BOARD_WFE_TIMEOUT_NS)
+               events_per_sec = NSEC_PER_SEC / ARM_BOARD_WFE_TIMEOUT_NS;
+#else /* !defined(ARM_BOARD_WFE_TIMEOUT_NS) */
+               /* Default to 1usec (or as close as we can get) */
+               events_per_sec = USEC_PER_SEC;
+#endif /* !defined(ARM_BOARD_WFE_TIMEOUT_NS) */
+       }
        ticks_per_sec = gPEClockFrequencyInfo.timebase_frequency_hz;
        ticks_per_event = ticks_per_sec / events_per_sec;
        bit_index = flsll(ticks_per_event) - 1; /* Highest bit set */
@@ -2112,11 +2163,8 @@ fiq_context_bootstrap(boolean_t enable_fiq)
                bit_index--;
        }
 
-       fiq_eventi = bit_index;
-#else
-#error Need a board configuration.
-#endif
-       fiq_context_init(enable_fiq);
+       arm64_eventi = bit_index;
+       wfe_timeout_init();
 }
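The bit-index computation above selects which CNTVCT_EL0 bit drives the WFE event stream: the highest bit whose toggle period does not exceed the requested interval, with the partially visible adjustments after flsll() accounting for edge selection before the result is stored in arm64_eventi. A worked example under assumed numbers (24 MHz timebase, the 1 usec default); the helper is hypothetical and uses flsll() from <strings.h> as provided on Darwin/BSD.

#include <strings.h>    /* flsll() */

/* Hypothetical user-space check of the event-stream bit selection. */
static unsigned int
wfe_event_bit_index(uint64_t ticks_per_sec, uint64_t events_per_sec)
{
	uint64_t ticks_per_event = ticks_per_sec / events_per_sec;
	return (unsigned int)(flsll((long long)ticks_per_event) - 1);   /* highest bit set */
}

/*
 * wfe_event_bit_index(24000000, 1000000) == 4: ticks_per_event = 24,
 * flsll(24) = 5, so bit 4 is chosen; that bit toggles every 16 ticks,
 * i.e. roughly every 0.67 usec at 24 MHz.
 */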
 
 boolean_t
@@ -2172,13 +2220,11 @@ ml_energy_stat(thread_t t)
 void
 ml_gpu_stat_update(__unused uint64_t gpu_ns_delta)
 {
-#if CONFIG_EMBEDDED
        /*
         * For now: update the resource coalition stats of the
         * current thread's coalition
         */
        task_coalition_update_gpu_stats(current_task(), gpu_ns_delta);
-#endif
 }
 
 uint64_t
@@ -2187,7 +2233,8 @@ ml_gpu_stat(__unused thread_t t)
        return 0;
 }
 
-#if !CONFIG_SKIP_PRECISE_USER_KERNEL_TIME
+#if !CONFIG_SKIP_PRECISE_USER_KERNEL_TIME || HAS_FAST_CNTVCT
+
 static void
 timer_state_event(boolean_t switch_to_kernel)
 {
@@ -2196,8 +2243,8 @@ timer_state_event(boolean_t switch_to_kernel)
                return;
        }
 
-       processor_data_t *pd = &getCpuDatap()->cpu_processor->processor_data;
-       uint64_t now = ml_get_timebase();
+       processor_t pd = current_processor();
+       uint64_t now = ml_get_speculative_timebase();
 
        timer_stop(pd->current_state, now);
        pd->current_state = (switch_to_kernel) ? &pd->system_state : &pd->user_state;
@@ -2219,7 +2266,7 @@ timer_state_event_kernel_to_user(void)
 {
        timer_state_event(FALSE);
 }
-#endif /* !CONFIG_SKIP_PRECISE_USER_KERNEL_TIME */
+#endif /* !CONFIG_SKIP_PRECISE_USER_KERNEL_TIME || HAS_FAST_CNTVCT */
 
 /*
  * The following are required for parts of the kernel
@@ -2296,14 +2343,14 @@ ex_cb_invoke(
 
 #if defined(HAS_APPLE_PAC)
 void
-ml_task_set_disable_user_jop(task_t task, boolean_t disable_user_jop)
+ml_task_set_disable_user_jop(task_t task, uint8_t disable_user_jop)
 {
        assert(task);
        task->disable_user_jop = disable_user_jop;
 }
 
 void
-ml_thread_set_disable_user_jop(thread_t thread, boolean_t disable_user_jop)
+ml_thread_set_disable_user_jop(thread_t thread, uint8_t disable_user_jop)
 {
        assert(thread);
        thread->machine.disable_user_jop = disable_user_jop;
@@ -2318,35 +2365,180 @@ ml_task_set_rop_pid(task_t task, task_t parent_task, boolean_t inherit)
                task->rop_pid = early_random();
        }
 }
-#endif /* defined(HAS_APPLE_PAC) */
 
+/**
+ * jop_pid may be inherited from the parent task or generated inside the shared
+ * region.  Unfortunately these two inputs become available at very different
+ * points during task creation, so the assignment is split into two steps.
+ */
+void
+ml_task_set_jop_pid(task_t task, task_t parent_task, boolean_t inherit)
+{
+       if (inherit) {
+               task->jop_pid = parent_task->jop_pid;
+       } else {
+               task->jop_pid = ml_default_jop_pid();
+       }
+}
+
+void
+ml_task_set_jop_pid_from_shared_region(task_t task)
+{
+       vm_shared_region_t sr = vm_shared_region_get(task);
+       /*
+        * If there's no shared region, we can assign the key arbitrarily.  This
+        * typically happens when Mach-O image activation failed part of the way
+        * through, and this task is in the middle of dying with SIGKILL anyway.
+        */
+       if (__improbable(!sr)) {
+               task->jop_pid = early_random();
+               return;
+       }
+       vm_shared_region_deallocate(sr);
+
+       /*
+        * Similarly we have to worry about jetsam having killed the task and
+        * already cleared the shared_region_id.
+        */
+       task_lock(task);
+       if (task->shared_region_id != NULL) {
+               task->jop_pid = shared_region_find_key(task->shared_region_id);
+       } else {
+               task->jop_pid = early_random();
+       }
+       task_unlock(task);
+}
+
+void
+ml_thread_set_jop_pid(thread_t thread, task_t task)
+{
+       thread->machine.jop_pid = task->jop_pid;
+}
+#endif /* defined(HAS_APPLE_PAC) */
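The split described in the comment above implies an ordering during task creation. A hedged sketch of that ordering; assign_jop_pid_sketch() is hypothetical, and the real call sites live in xnu's task-creation and image-activation paths, not here.

#if defined(HAS_APPLE_PAC)
/* Hypothetical wrapper showing the two-step assignment order. */
static void
assign_jop_pid_sketch(task_t task, task_t parent_task, boolean_t inherit_from_parent)
{
	/* Step 1: at task creation only the parent relationship is known. */
	ml_task_set_jop_pid(task, parent_task, inherit_from_parent);

	/* Step 2: later, once the image has been activated and a shared region
	 * attached, the key can be re-derived from that region's ID. */
	ml_task_set_jop_pid_from_shared_region(task);

	/* New threads then pick up the task-wide key. */
	ml_thread_set_jop_pid(current_thread(), task);
}
#endif /* defined(HAS_APPLE_PAC) */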
 
 #if defined(HAS_APPLE_PAC)
+#define _ml_auth_ptr_unchecked(_ptr, _suffix, _modifier) \
+       asm volatile ("aut" #_suffix " %[ptr], %[modifier]" : [ptr] "+r"(_ptr) : [modifier] "r"(_modifier));
 
 /*
  * ml_auth_ptr_unchecked: call this instead of the ptrauth_auth_data
  * intrinsic when you don't want to trap on auth failure.
  */
-
 void *
 ml_auth_ptr_unchecked(void *ptr, ptrauth_key key, uint64_t modifier)
 {
        switch (key & 0x3) {
        case ptrauth_key_asia:
-               asm volatile ("autia %[ptr], %[modifier]" : [ptr] "+r"(ptr) : [modifier] "r"(modifier));
+               _ml_auth_ptr_unchecked(ptr, ia, modifier);
                break;
        case ptrauth_key_asib:
-               asm volatile ("autib %[ptr], %[modifier]" : [ptr] "+r"(ptr) : [modifier] "r"(modifier));
+               _ml_auth_ptr_unchecked(ptr, ib, modifier);
                break;
        case ptrauth_key_asda:
-               asm volatile ("autda %[ptr], %[modifier]" : [ptr] "+r"(ptr) : [modifier] "r"(modifier));
+               _ml_auth_ptr_unchecked(ptr, da, modifier);
                break;
        case ptrauth_key_asdb:
-               asm volatile ("autdb %[ptr], %[modifier]" : [ptr] "+r"(ptr) : [modifier] "r"(modifier));
+               _ml_auth_ptr_unchecked(ptr, db, modifier);
                break;
        }
 
        return ptr;
 }
 #endif /* defined(HAS_APPLE_PAC) */
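A hedged sketch of why a non-trapping authenticate is useful: code that probes a possibly-forged signed pointer (for instance while inspecting a suspect data structure) can detect failure by comparison instead of taking an exception. This only holds on cores where a failed AUT* poisons the result rather than faulting; the helper below is hypothetical.

#include <ptrauth.h>

#if defined(HAS_APPLE_PAC)
/* Hypothetical helper: authenticate a data pointer without trapping. */
static void *
try_auth_data_ptr(void *signed_ptr, uint64_t modifier)
{
	void *p = ml_auth_ptr_unchecked(signed_ptr, ptrauth_key_asda, modifier);

	/* On success the result matches the stripped (canonical) pointer; on
	 * failure the poison bits make it differ. */
	if (p != ptrauth_strip(signed_ptr, ptrauth_key_asda)) {
		return NULL;
	}
	return p;
}
#endif /* defined(HAS_APPLE_PAC) */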
+
+#ifdef CONFIG_XNUPOST
+void
+ml_expect_fault_begin(expected_fault_handler_t expected_fault_handler, uintptr_t expected_fault_addr)
+{
+       thread_t thread = current_thread();
+       thread->machine.expected_fault_handler = expected_fault_handler;
+       thread->machine.expected_fault_addr = expected_fault_addr;
+}
+
+void
+ml_expect_fault_end(void)
+{
+       thread_t thread = current_thread();
+       thread->machine.expected_fault_handler = NULL;
+       thread->machine.expected_fault_addr = 0;
+}
+#endif /* CONFIG_XNUPOST */
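A hedged sketch of how an XNUPOST test might use this pair: register the expectation, perform the access that may fault, then clear it. The handler signature is assumed to match expected_fault_handler_t (taking the saved state and returning whether the fault was consumed); both helpers are hypothetical.

#ifdef CONFIG_XNUPOST
/* Hypothetical test helpers. */
static bool
probe_fault_handler(arm_saved_state_t *state __unused)
{
	return true;    /* the fault was expected; let the test continue */
}

static void
probe_expected_fault(uintptr_t probe_addr)
{
	ml_expect_fault_begin(probe_fault_handler, probe_addr);
	(void)*(volatile uint8_t *)probe_addr;  /* may fault; the handler absorbs it */
	ml_expect_fault_end();
}
#endif /* CONFIG_XNUPOST */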
+
+void
+ml_hibernate_active_pre(void)
+{
+#if HIBERNATION
+	if (kIOHibernateStateWakingFromHibernate == gIOHibernateState) {
+		hibernate_rebuild_vm_structs();
+	}
+#endif /* HIBERNATION */
+}
+
+void
+ml_hibernate_active_post(void)
+{
+#if HIBERNATION
+       if (kIOHibernateStateWakingFromHibernate == gIOHibernateState) {
+               hibernate_machine_init();
+               hibernate_vm_lock_end();
+               current_cpu_datap()->cpu_hibernate = 0;
+       }
+#endif /* HIBERNATION */
+}
+
+/**
+ * Return a machine-dependent array of address space regions that should be
+ * reserved by the VM (pre-mapped in the address space). This prevents user
+ * processes from allocating or deallocating within these regions.
+ *
+ * @param vm_is64bit True if the process has a 64-bit address space.
+ * @param regions An out parameter representing an array of regions to reserve.
+ *
+ * @return The number of reserved regions returned through `regions`.
+ */
+size_t
+ml_get_vm_reserved_regions(bool vm_is64bit, struct vm_reserved_region **regions)
+{
+       assert(regions != NULL);
+
+       /**
+        * Reserved regions only apply to 64-bit address spaces. This is because
+        * we only expect to grow the maximum user VA address on 64-bit address spaces
+        * (we've essentially already reached the max for 32-bit spaces). The reserved
+        * regions should safely fall outside of the max user VA for 32-bit processes.
+        */
+       if (vm_is64bit) {
+               *regions = vm_reserved_regions;
+               return ARRAY_COUNT(vm_reserved_regions);
+       } else {
+               /* Don't reserve any VA regions on arm64_32 processes. */
+               *regions = NULL;
+               return 0;
+       }
+}
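A hedged sketch of a caller walking the reserved regions for a 64-bit address space. The field names vmrr_name/vmrr_addr/vmrr_size are assumed from xnu's struct vm_reserved_region; the logging helper is hypothetical.

/* Hypothetical caller. */
static void
log_reserved_regions(bool is64bit)
{
	struct vm_reserved_region *regions = NULL;
	size_t count = ml_get_vm_reserved_regions(is64bit, &regions);

	for (size_t i = 0; i < count; i++) {
		kprintf("reserved region %s: [0x%llx, 0x%llx)\n",
		    regions[i].vmrr_name,
		    (unsigned long long)regions[i].vmrr_addr,
		    (unsigned long long)(regions[i].vmrr_addr + regions[i].vmrr_size));
	}
}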
+
+/*
+ * These WFE recommendations are expected to be updated on a relatively
+ * infrequent cadence, possibly from a different cluster, hence false
+ * cacheline sharing isn't expected to be material.
+ */
+static uint64_t arm64_cluster_wfe_recs[MAX_CPU_CLUSTERS];
+
+uint32_t
+ml_update_cluster_wfe_recommendation(uint32_t wfe_cluster_id, uint64_t wfe_timeout_abstime_interval, __unused uint64_t wfe_hint_flags)
+{
+       assert(wfe_cluster_id < MAX_CPU_CLUSTERS);
+       assert(wfe_timeout_abstime_interval <= ml_wfe_hint_max_interval);
+       os_atomic_store(&arm64_cluster_wfe_recs[wfe_cluster_id], wfe_timeout_abstime_interval, relaxed);
+       return 0; /* Success */
+}
+
+uint64_t
+ml_cluster_wfe_timeout(uint32_t wfe_cluster_id)
+{
+	/*
+	 * This load and its consumer do not synchronize with updates to the
+	 * recommendation; races are acceptable.
+	 */
+       uint64_t wfet = os_atomic_load(&arm64_cluster_wfe_recs[wfe_cluster_id], relaxed);
+       return wfet;
+}
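A hedged sketch of the intended producer/consumer pairing: a power-management client publishes a per-cluster recommendation (which must stay within ml_wfe_hint_max_interval, per the assert above), and a spin/idle path consumes it with the same relaxed ordering because a stale value is harmless. Both helpers are hypothetical.

/* Hypothetical producer: recommend a 100us WFE bound for cluster 0. */
static void
publish_wfe_recommendation_example(void)
{
	uint64_t abstime_100us;

	nanoseconds_to_absolutetime(100 * NSEC_PER_USEC, &abstime_100us);
	(void)ml_update_cluster_wfe_recommendation(0 /* cluster */, abstime_100us, 0 /* flags */);
}

/* Hypothetical consumer: bound a WFE-based wait by the recommendation. */
static uint64_t
wfe_deadline_for_cluster(uint32_t cluster_id, uint64_t now)
{
	uint64_t wfet = ml_cluster_wfe_timeout(cluster_id);
	return wfet ? (now + wfet) : now;       /* 0 means no recommendation published */
}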