git.saurik.com Git - apple/xnu.git / commitdiff
xnu-6153.81.5 (tags: macos-10153, v6153.81.5)
author Apple <opensource@apple.com>
Thu, 30 Apr 2020 21:35:40 +0000 (21:35 +0000)
committer Apple <opensource@apple.com>
Thu, 30 Apr 2020 21:35:40 +0000 (21:35 +0000)
88 files changed:
bsd/conf/files.arm64
bsd/dev/arm64/dtrace_isa.c
bsd/dev/arm64/sysctl.c
bsd/kern/kern_core.c
bsd/kern/kern_memorystatus.c
bsd/kern/kern_sysctl.c
bsd/kern/kern_xxx.c
bsd/kern/policy_check.c
bsd/kern/sys_generic.c
bsd/net/dlil.c
bsd/net/if_headless.c
bsd/net/if_ipsec.c
bsd/netinet/flow_divert.c
bsd/netinet6/ip6_input.c
bsd/nfs/nfs_node.c
bsd/nfs/nfs_subs.c
bsd/nfs/nfs_vfsops.c
bsd/nfs/nfs_vnops.c
bsd/pthread/pthread_workqueue.c
bsd/tests/bsd_tests.c
bsd/tests/ctrr_test_sysctl.c
bsd/vfs/vfs_syscalls.c
config/MasterVersion
iokit/Kernel/IOUserClient.cpp
osfmk/arm/arm_init.c
osfmk/arm/atomic.h
osfmk/arm/cpu.c
osfmk/arm/cpu_common.c
osfmk/arm/cpu_data_internal.h
osfmk/arm/cpu_internal.h
osfmk/arm/cpuid.c
osfmk/arm/cpuid.h
osfmk/arm/data.s
osfmk/arm/machine_routines.h
osfmk/arm/machine_routines_common.c
osfmk/arm/pmap.c
osfmk/arm/pmap.h
osfmk/arm/proc_reg.h
osfmk/arm64/arm_vm_init.c
osfmk/arm64/cpu.c
osfmk/arm64/exception_asm.h
osfmk/arm64/genassym.c
osfmk/arm64/locore.s
osfmk/arm64/machine_routines.c
osfmk/arm64/machine_routines_asm.s
osfmk/arm64/monotonic.h
osfmk/arm64/monotonic_arm64.c
osfmk/arm64/pinst.s
osfmk/arm64/platform_tests.c
osfmk/arm64/proc_reg.h
osfmk/arm64/sleh.c
osfmk/arm64/start.s
osfmk/conf/files
osfmk/kdp/ml/arm/kdp_machdep.c
osfmk/kern/clock.c
osfmk/kern/host.c
osfmk/kern/ipc_kobject.c
osfmk/kern/ipc_kobject.h
osfmk/kern/kern_stackshot.c
osfmk/kern/mk_timer.c
osfmk/kern/processor.h
osfmk/kern/sched_amp.c [new file with mode: 0644]
osfmk/kern/sched_amp_common.c [new file with mode: 0644]
osfmk/kern/sched_amp_common.h [new file with mode: 0644]
osfmk/kern/sched_clutch.c
osfmk/kern/sched_clutch.h
osfmk/kern/sched_prim.c
osfmk/kern/sched_prim.h
osfmk/kern/task.c
osfmk/kern/thread.h
osfmk/mach/mach_host.defs
osfmk/mach/machine.h
osfmk/man/index.html [new file with mode: 0644]
osfmk/tests/pmap_tests.c
osfmk/vm/vm_map.c
osfmk/vm/vm_pageout.c
pexpert/arm/pe_identify_machine.c
pexpert/pexpert/arm64/arm64_common.h
pexpert/pexpert/arm64/board_config.h
san/kasan-arm64.c
security/mac_framework.h
security/mac_policy.h
security/mac_process.c
tests/Makefile
tests/atm_diagnostic_flag.c
tests/atm_diagnostic_flag.entitlements [new file with mode: 0644]
tests/atm_diagnostic_flag_entitled.c [new file with mode: 0644]
tests/monotonic_uncore.c [new file with mode: 0644]

index 7761c03acbc1027e78507c3a28db5dad8d53a42a..9d631c5fb57390f1312b1acde61ca49b2da9a976 100644 (file)
@@ -9,6 +9,9 @@ bsd/dev/arm/unix_signal.c       standard
 
 bsd/dev/arm64/cpu_in_cksum.s   standard
 
+#if defined(KERNEL_INTEGRITY_CTRR)
+bsd/tests/ctrr_test_sysctl.c           optional config_xnupost
+#endif /* defined(KERNEL_INTEGRITY_CTRR) */
 
 bsd/dev/arm64/dtrace_isa.c     optional config_dtrace
 bsd/dev/arm64/dtrace_subr_arm.c        optional config_dtrace
index 5714f7971009eb63ba13d94a5ce5971e101e7026..56d1729f5614407a79fbdd2e67e0a633854f965f 100644 (file)
@@ -54,6 +54,10 @@ typedef arm_saved_state_t savearea_t;
 extern lck_attr_t       *dtrace_lck_attr;
 extern lck_grp_t        *dtrace_lck_grp;
 
+#if XNU_MONITOR
+extern void * pmap_stacks_start;
+extern void * pmap_stacks_end;
+#endif
 
 struct frame {
        struct frame *backchain;
@@ -455,6 +459,14 @@ zero:
        }
 }
 
+#if XNU_MONITOR
+static inline boolean_t
+dtrace_frame_in_ppl_stack(struct frame * fp)
+{
+       return ((void *)fp >= pmap_stacks_start) &&
+              ((void *)fp < pmap_stacks_end);
+}
+#endif
 
 void
 dtrace_getpcstack(pc_t * pcstack, int pcstack_limit, int aframes,
@@ -464,6 +476,9 @@ dtrace_getpcstack(pc_t * pcstack, int pcstack_limit, int aframes,
        struct frame   *nextfp, *minfp, *stacktop;
        int             depth = 0;
        int             on_intr;
+#if XNU_MONITOR
+       int             on_ppl_stack;
+#endif
        int             last = 0;
        uintptr_t       pc;
        uintptr_t       caller = CPU->cpu_dtrace_caller;
@@ -471,6 +486,11 @@ dtrace_getpcstack(pc_t * pcstack, int pcstack_limit, int aframes,
        if ((on_intr = CPU_ON_INTR(CPU)) != 0) {
                stacktop = (struct frame *) dtrace_get_cpu_int_stack_top();
        }
+#if XNU_MONITOR
+       else if ((on_ppl_stack = dtrace_frame_in_ppl_stack(fp))) {
+               stacktop = (struct frame *) pmap_stacks_end;
+       }
+#endif
        else {
                stacktop = (struct frame *) (dtrace_get_kernel_stack(current_thread()) + kernel_stack_size);
        }
@@ -496,6 +516,14 @@ dtrace_getpcstack(pc_t * pcstack, int pcstack_limit, int aframes,
                                if (arm_kern_regs) {
                                        nextfp = (struct frame *)(saved_state64(arm_kern_regs)->fp);
 
+#if XNU_MONITOR
+                                       on_ppl_stack = dtrace_frame_in_ppl_stack(nextfp);
+
+                                       if (on_ppl_stack) {
+                                               minfp = pmap_stacks_start;
+                                               stacktop = pmap_stacks_end;
+                                       } else
+#endif
                                        {
                                                vm_offset_t kstack_base = dtrace_get_kernel_stack(current_thread());
 
@@ -517,6 +545,30 @@ dtrace_getpcstack(pc_t * pcstack, int pcstack_limit, int aframes,
                                        last = 1;
                                }
                        } else {
+#if XNU_MONITOR
+                               if ((!on_ppl_stack) && dtrace_frame_in_ppl_stack(nextfp)) {
+                                       /*
+                                        * We are switching from the kernel stack
+                                        * to the PPL stack.
+                                        */
+                                       on_ppl_stack = 1;
+                                       minfp = pmap_stacks_start;
+                                       stacktop = pmap_stacks_end;
+                               } else if (on_ppl_stack) {
+                                       /*
+                                        * We could be going from the PPL stack
+                                        * to the kernel stack.
+                                        */
+                                       vm_offset_t kstack_base = dtrace_get_kernel_stack(current_thread());
+
+                                       minfp = (struct frame *)kstack_base;
+                                       stacktop = (struct frame *)(kstack_base + kernel_stack_size);
+
+                                       if (nextfp <= minfp || nextfp >= stacktop) {
+                                               last = 1;
+                                       }
+                               } else
+#endif
                                {
                                        /*
                                         * This is the last frame we can process; indicate
index d67aa4a0b386025db2c5e27af85319511b869e7e..fd7055cb7aca1c7cc999ea5d1ddf322be25b7aa6 100644 (file)
@@ -46,6 +46,30 @@ SYSCTL_PROC(_machdep, OID_AUTO, wake_conttime,
     0, 0, sysctl_wake_conttime, "I",
     "Continuous Time at the last wakeup");
 
+#if defined(HAS_IPI)
+static int
+cpu_signal_deferred_timer(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
+{
+       int new_value = 0;
+       int changed   = 0;
+
+       int old_value = (int)ml_cpu_signal_deferred_get_timer();
+
+       int error = sysctl_io_number(req, old_value, sizeof(int), &new_value, &changed);
+
+       if (error == 0 && changed) {
+               ml_cpu_signal_deferred_adjust_timer((uint64_t)new_value);
+       }
+
+       return error;
+}
+
+SYSCTL_PROC(_machdep, OID_AUTO, deferred_ipi_timeout,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
+    0, 0,
+    cpu_signal_deferred_timer, "I", "Deferred IPI timeout (nanoseconds)");
+
+#endif /* defined(HAS_IPI) */
 
 /*
  * For source compatibility, here's some machdep.cpu mibs that
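
The new machdep.deferred_ipi_timeout OID (compiled in only when HAS_IPI is defined) exposes the deferred-IPI timeout, in nanoseconds, backed by ml_cpu_signal_deferred_get_timer() and ml_cpu_signal_deferred_adjust_timer(). A minimal userspace sketch of reading and adjusting it through sysctlbyname(3) follows; it is illustrative only, not part of this commit, and writing the value requires root:

    /*
     * Illustrative only: read, and optionally set, machdep.deferred_ipi_timeout.
     * The OID exists only on kernels built with HAS_IPI, so a lookup failure
     * simply means "not supported on this kernel".
     */
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/sysctl.h>

    int
    main(int argc, char *argv[])
    {
            int timeout_ns = 0;
            size_t len = sizeof(timeout_ns);

            if (sysctlbyname("machdep.deferred_ipi_timeout", &timeout_ns, &len, NULL, 0) != 0) {
                    perror("machdep.deferred_ipi_timeout");
                    return 1;
            }
            printf("deferred IPI timeout: %d ns\n", timeout_ns);

            if (argc > 1) {
                    /* Optional new value in nanoseconds; requires root. */
                    int new_ns = atoi(argv[1]);
                    if (sysctlbyname("machdep.deferred_ipi_timeout", NULL, NULL,
                        &new_ns, sizeof(new_ns)) != 0) {
                            perror("set machdep.deferred_ipi_timeout");
                            return 1;
                    }
            }
            return 0;
    }
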
index 77c38ccbeb4ad29c8d809a6d43a4160f0d82d2a7..46fbd3ee55282275697dcb5964bb774b9812929c 100644 (file)
 
 #include <security/audit/audit.h>
 
+#if CONFIG_MACF
+#include <security/mac_framework.h>
+#endif /* CONFIG_MACF */
+
 #if CONFIG_CSR
 #include <sys/codesign.h>
 #include <sys/csr.h>
@@ -221,7 +225,7 @@ collectth_state(thread_t th_act, void *tirp)
  *                             coredump_flags  Extra options (ignore rlimit, run fsync)
  *
  * Returns:    0                               Success
- *             EFAULT                          Failed
+ *             !0                              Failure errno
  *
  * IMPORTANT:  This function can only be called on the current process, due
  *             to assumptions below; see variable declaration section for
@@ -252,7 +256,7 @@ coredump(proc_t core_proc, uint32_t reserve_mb, int coredump_flags)
        int             error1 = 0;
        char            stack_name[MAXCOMLEN + 6];
        char            *alloced_name = NULL;
-       char            *name;
+       char            *name = NULL;
        mythread_state_flavor_t flavors[MAX_TSTATE_FLAVORS];
        vm_size_t       mapsize;
        int             i;
@@ -276,11 +280,16 @@ coredump(proc_t core_proc, uint32_t reserve_mb, int coredump_flags)
            ((sugid_coredump == 0) &&   /* Not dumping SUID/SGID binaries */
            ((kauth_cred_getsvuid(cred) != kauth_cred_getruid(cred)) ||
            (kauth_cred_getsvgid(cred) != kauth_cred_getrgid(cred))))) {
-#if CONFIG_AUDIT
-               audit_proc_coredump(core_proc, NULL, EFAULT);
-#endif
-               return EFAULT;
+               error = EFAULT;
+               goto out2;
+       }
+
+#if CONFIG_MACF
+       error = mac_proc_check_dump_core(core_proc);
+       if (error != 0) {
+               goto out2;
        }
+#endif
 
 #if CONFIG_CSR
        /* If the process is restricted, CSR isn't configured to allow
@@ -289,10 +298,8 @@ coredump(proc_t core_proc, uint32_t reserve_mb, int coredump_flags)
        if (cs_restricted(core_proc) &&
            csr_check(CSR_ALLOW_TASK_FOR_PID) &&
            csr_check(CSR_ALLOW_APPLE_INTERNAL)) {
-#if CONFIG_AUDIT
-               audit_proc_coredump(core_proc, NULL, EFAULT);
-#endif
-               return EFAULT;
+               error = EPERM;
+               goto out2;
        }
 #endif
 
@@ -306,7 +313,8 @@ coredump(proc_t core_proc, uint32_t reserve_mb, int coredump_flags)
 
        if (((coredump_flags & COREDUMP_IGNORE_ULIMIT) == 0) &&
            (mapsize >= core_proc->p_rlimit[RLIMIT_CORE].rlim_cur)) {
-               return EFAULT;
+               error = EFAULT;
+               goto out2;
        }
 
        (void) task_suspend_internal(task);
index afc9271dc01e625bec98b6aee0817067eb95cd86..81f60d5e88e6b377c4a196fee6430dfca13acbb3 100644 (file)
@@ -1321,7 +1321,11 @@ int max_jetsam_threads = JETSAM_THREADS_LIMIT;
  * - Raise the jetsam threshold ("clear-the-deck")
  * - Enable parallel jetsam on eligible devices
  */
+#if __AMP__
+int fast_jetsam_enabled = 1;
+#else /* __AMP__ */
 int fast_jetsam_enabled = 0;
+#endif /* __AMP__ */
 
 /* Routine to find the jetsam state structure for the current jetsam thread */
 static inline struct jetsam_thread_state *
index 0c10a3ac48f9d25b87769f37e4e0a989abe177db..5d2fcee095c3e03ebab8c1eadcdc2dc44e16b250 100644 (file)
@@ -2132,6 +2132,28 @@ SYSCTL_PROC(_kern_perfcontrol_callout, OID_AUTO, update_cycles,
     (void *)PERFCONTROL_STAT_CYCLES, PERFCONTROL_CALLOUT_STATE_UPDATE,
     sysctl_perfcontrol_callout_stat, "I", "");
 
+#if __AMP__
+extern int sched_amp_idle_steal;
+SYSCTL_INT(_kern, OID_AUTO, sched_amp_idle_steal,
+    CTLFLAG_KERN | CTLFLAG_RW | CTLFLAG_LOCKED,
+    &sched_amp_idle_steal, 0, "");
+extern int sched_amp_spill_steal;
+SYSCTL_INT(_kern, OID_AUTO, sched_amp_spill_steal,
+    CTLFLAG_KERN | CTLFLAG_RW | CTLFLAG_LOCKED,
+    &sched_amp_spill_steal, 0, "");
+extern int sched_amp_spill_count;
+SYSCTL_INT(_kern, OID_AUTO, sched_amp_spill_count,
+    CTLFLAG_KERN | CTLFLAG_RW | CTLFLAG_LOCKED,
+    &sched_amp_spill_count, 0, "");
+extern int sched_amp_spill_deferred_ipi;
+SYSCTL_INT(_kern, OID_AUTO, sched_amp_spill_deferred_ipi,
+    CTLFLAG_KERN | CTLFLAG_RW | CTLFLAG_LOCKED,
+    &sched_amp_spill_deferred_ipi, 0, "");
+extern int sched_amp_pcores_preempt_immediate_ipi;
+SYSCTL_INT(_kern, OID_AUTO, sched_amp_pcores_preempt_immediate_ipi,
+    CTLFLAG_KERN | CTLFLAG_RW | CTLFLAG_LOCKED,
+    &sched_amp_pcores_preempt_immediate_ipi, 0, "");
+#endif /* __AMP__ */
 #endif /* __arm__ || __arm64__ */
 
 #if __arm64__
index ee65deb6c97bf6f7ce27dc6439afb261cb940023..a07457cd6d6f146b50e47a28f006579db150d5da 100644 (file)
@@ -116,8 +116,11 @@ reboot(struct proc *p, struct reboot_args *uap, __unused int32_t *retval)
        }
 
        if (uap->opt & RB_PANIC && uap->msg != USER_ADDR_NULL) {
-               if (copyinstr(uap->msg, (void *)message, sizeof(message), (size_t *)&dummy)) {
+               int copy_error = copyinstr(uap->msg, (void *)message, sizeof(message), (size_t *)&dummy);
+               if (copy_error != 0 && copy_error != ENAMETOOLONG) {
                        strncpy(message, "user space RB_PANIC message copyin failed", sizeof(message) - 1);
+               } else {
+                       message[sizeof(message) - 1] = '\0';
                }
        }
 
index de77a23be622a9211f113e50446e1e5ec2a8e8f8..ba02e1540c759904aaebd7f1ee1fa59852a6ce8e 100644 (file)
@@ -121,7 +121,7 @@ common_hook(void)
        return rv;
 }
 
-#if (MAC_POLICY_OPS_VERSION != 58)
+#if (MAC_POLICY_OPS_VERSION != 59)
 # error "struct mac_policy_ops doesn't match definition in mac_policy.h"
 #endif
 /*
@@ -322,9 +322,9 @@ const static struct mac_policy_ops policy_ops = {
        CHECK_SET_HOOK(proc_check_setlcid)
        CHECK_SET_HOOK(proc_check_signal)
        CHECK_SET_HOOK(proc_check_wait)
+       CHECK_SET_HOOK(proc_check_dump_core)
 
        .mpo_reserved5 = (mpo_reserved_hook_t *)common_hook,
-       .mpo_reserved6 = (mpo_reserved_hook_t *)common_hook,
 
        CHECK_SET_HOOK(socket_check_accept)
        CHECK_SET_HOOK(socket_check_accepted)
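
The version check moving from 58 to 59 and the new CHECK_SET_HOOK(proc_check_dump_core) entry (which takes over the old mpo_reserved6 slot) correspond to the mac_proc_check_dump_core() call added in kern_core.c above: a MAC policy can now veto core-dump creation per process. Below is a hedged sketch of what a policy module's side of this might look like; the mpo_proc_check_dump_core_t typedef and the hook's argument list are assumptions inferred from the usual mpo_<name>_t convention and from the call site, not verified prototypes, and the cast mirrors the CHECK_SET_HOOK pattern used in this file.

    /*
     * Hedged sketch, not from this commit.  The hook signature (just the target
     * proc) is an assumption modeled on the mac_proc_check_dump_core(core_proc)
     * call site; the mpo_proc_check_dump_core_t typedef is assumed to follow the
     * usual mpo_<name>_t convention in mac_policy.h.
     */
    #include <sys/errno.h>
    #include <sys/proc.h>
    #include <security/mac_policy.h>

    static int
    example_proc_check_dump_core(struct proc *p)
    {
            /* Return 0 to allow the dump, or an errno value to veto it. */
            return (proc_pid(p) == 1) ? EPERM : 0;  /* e.g. never dump PID 1 */
    }

    const static struct mac_policy_ops example_policy_ops = {
            .mpo_proc_check_dump_core =
                (mpo_proc_check_dump_core_t *)example_proc_check_dump_core,
    };
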
index d9fb9d1f9cec0fb2ebf1a188b0bf928767506a30..bd2d1ad526c62446acec2259e285a978ea1948b7 100644 (file)
@@ -4024,6 +4024,104 @@ SYSCTL_PROC(_machdep_remotetime, OID_AUTO, conversion_params,
 #endif /* CONFIG_MACH_BRIDGE_RECV_TIME */
 
 #if DEVELOPMENT || DEBUG
+#if __AMP__
+#include <pexpert/pexpert.h>
+extern int32_t sysctl_get_bound_cpuid(void);
+extern void sysctl_thread_bind_cpuid(int32_t cpuid);
+static int
+sysctl_kern_sched_thread_bind_cpu SYSCTL_HANDLER_ARGS
+{
+#pragma unused(oidp, arg1, arg2)
+
+       if (!PE_parse_boot_argn("enable_skstb", NULL, 0)) {
+               return ENOENT;
+       }
+
+       int32_t cpuid = sysctl_get_bound_cpuid();
+
+       int32_t new_value;
+       int changed;
+       int error = sysctl_io_number(req, cpuid, sizeof cpuid, &new_value, &changed);
+       if (error) {
+               return error;
+       }
+
+       if (changed) {
+               sysctl_thread_bind_cpuid(new_value);
+       }
+
+       return error;
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, sched_thread_bind_cpu, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
+    0, 0, sysctl_kern_sched_thread_bind_cpu, "I", "");
+
+extern char sysctl_get_bound_cluster_type(void);
+extern void sysctl_thread_bind_cluster_type(char cluster_type);
+static int
+sysctl_kern_sched_thread_bind_cluster_type SYSCTL_HANDLER_ARGS
+{
+#pragma unused(oidp, arg1, arg2)
+       char buff[4];
+
+       if (!PE_parse_boot_argn("enable_skstb", NULL, 0)) {
+               return ENOENT;
+       }
+
+       int error = SYSCTL_IN(req, buff, 1);
+       if (error) {
+               return error;
+       }
+       char cluster_type = buff[0];
+
+       if (!req->newptr) {
+               goto out;
+       }
+
+       sysctl_thread_bind_cluster_type(cluster_type);
+out:
+       cluster_type = sysctl_get_bound_cluster_type();
+       buff[0] = cluster_type;
+
+       return SYSCTL_OUT(req, buff, 1);
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, sched_thread_bind_cluster_type, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED,
+    0, 0, sysctl_kern_sched_thread_bind_cluster_type, "A", "");
+
+extern char sysctl_get_task_cluster_type(void);
+extern void sysctl_task_set_cluster_type(char cluster_type);
+static int
+sysctl_kern_sched_task_set_cluster_type SYSCTL_HANDLER_ARGS
+{
+#pragma unused(oidp, arg1, arg2)
+       char buff[4];
+
+       if (!PE_parse_boot_argn("enable_skstsct", NULL, 0)) {
+               return ENOENT;
+       }
+
+       int error = SYSCTL_IN(req, buff, 1);
+       if (error) {
+               return error;
+       }
+       char cluster_type = buff[0];
+
+       if (!req->newptr) {
+               goto out;
+       }
+
+       sysctl_task_set_cluster_type(cluster_type);
+out:
+       cluster_type = sysctl_get_task_cluster_type();
+       buff[0] = cluster_type;
+
+       return SYSCTL_OUT(req, buff, 1);
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, sched_task_set_cluster_type, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED,
+    0, 0, sysctl_kern_sched_task_set_cluster_type, "A", "");
+#endif /* __AMP__ */
 #endif /* DEVELOPMENT || DEBUG */
 
 extern uint32_t task_exc_guard_default;
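
These three DEVELOPMENT/DEBUG-only sysctls let a test bind the calling thread to a specific cpuid or cluster type, or retag the whole task's cluster preference, on __AMP__ (asymmetric multiprocessing) configurations; they are inert unless the kernel was booted with the enable_skstb / enable_skstsct boot-args. As an illustration only (not part of this commit, and assuming 'P'/'E' are the characters the handler accepts for the performance and efficiency clusters), a root-owned userspace caller could drive kern.sched_thread_bind_cluster_type like this:

    /*
     * Illustrative sketch: bind the calling thread to a cluster type on a
     * DEVELOPMENT/DEBUG kernel booted with enable_skstb.  The handler consumes a
     * single character as the new value and echoes back the bound cluster type.
     */
    #include <stdio.h>
    #include <sys/sysctl.h>

    int
    main(void)
    {
            char new_type = 'P';   /* assumption: 'P' = performance, 'E' = efficiency */
            char bound = '\0';
            size_t len = sizeof(bound);

            if (sysctlbyname("kern.sched_thread_bind_cluster_type",
                &bound, &len, &new_type, sizeof(new_type)) != 0) {
                    perror("kern.sched_thread_bind_cluster_type");
                    return 1;
            }
            printf("thread now bound to cluster type '%c'\n", bound);
            return 0;
    }
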
index 8acee164d7e49b5a07134f78bba886e32cbafe7c..d38af044c1eed58024808316bd431ae3e6f1bbb0 100644 (file)
@@ -78,6 +78,7 @@
 #include <net/if_llatbl.h>
 #include <net/net_api_stats.h>
 #include <net/if_ports_used.h>
+#include <net/if_vlan_var.h>
 #include <netinet/in.h>
 #if INET
 #include <netinet/in_var.h>
index f7ebb17764d18f686a92dac68376b1c7e1349dac..02c9350963ef274a2ddcfba0dd59a1bb56fef7e5 100644 (file)
@@ -25,6 +25,8 @@
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
+extern void if_headless_init(void);
+
 void
 if_headless_init(void)
 {
index 756272a891aa0a18898a32d7673b3caf634d072b..0d1af9f9d2cb693624be3f96f92739e35b6ceeb4 100644 (file)
@@ -234,8 +234,6 @@ struct ipsec_pcb {
 #define IPSEC_FLAGS_KPIPE_ALLOCATED 1
 
 /* data movement refcounting functions */
-static boolean_t ipsec_data_move_begin(struct ipsec_pcb *pcb);
-static void ipsec_data_move_end(struct ipsec_pcb *pcb);
 static void ipsec_wait_data_move_drain(struct ipsec_pcb *pcb);
 
 /* Data path states */
@@ -2705,6 +2703,7 @@ ipsec_ctl_connect(kern_ctl_ref kctlref,
                bpfattach(pcb->ipsec_ifp, DLT_NULL, 0);
        }
 
+#if IPSEC_NEXUS
        /*
         * Mark the data path as ready.
         * If kpipe nexus is being used then the data path is marked ready only when a kpipe channel is connected.
@@ -2714,6 +2713,7 @@ ipsec_ctl_connect(kern_ctl_ref kctlref,
                IPSEC_SET_DATA_PATH_READY(pcb);
                lck_mtx_unlock(&pcb->ipsec_pcb_data_move_lock);
        }
+#endif
 
        /* The interface's resources are allocated; mark it as running */
        ifnet_set_flags(pcb->ipsec_ifp, IFF_RUNNING, IFF_RUNNING);
@@ -4083,34 +4083,6 @@ ipsec_set_ip6oa_for_interface(ifnet_t interface, struct ip6_out_args *ip6oa)
        }
 }
 
-static boolean_t
-ipsec_data_move_begin(struct ipsec_pcb *pcb)
-{
-       boolean_t ret = 0;
-
-       lck_mtx_lock_spin(&pcb->ipsec_pcb_data_move_lock);
-       if ((ret = IPSEC_IS_DATA_PATH_READY(pcb))) {
-               pcb->ipsec_pcb_data_move++;
-       }
-       lck_mtx_unlock(&pcb->ipsec_pcb_data_move_lock);
-
-       return ret;
-}
-
-static void
-ipsec_data_move_end(struct ipsec_pcb *pcb)
-{
-       lck_mtx_lock_spin(&pcb->ipsec_pcb_data_move_lock);
-       VERIFY(pcb->ipsec_pcb_data_move > 0);
-       /*
-        * if there's no more thread moving data, wakeup any
-        * drainers that's blocked waiting for this.
-        */
-       if (--pcb->ipsec_pcb_data_move == 0 && pcb->ipsec_pcb_drainers > 0) {
-               wakeup(&(pcb->ipsec_pcb_data_move));
-       }
-       lck_mtx_unlock(&pcb->ipsec_pcb_data_move_lock);
-}
 
 static void
 ipsec_data_move_drain(struct ipsec_pcb *pcb)
index 84f3f4f483e4b76050665e4f5feed971ef7a0722..5a5ab0961968eff80d17434cf173062412df746c 100644 (file)
@@ -2304,7 +2304,7 @@ flow_divert_handle_app_map_create(struct flow_divert_group *group, mbuf_t packet
        struct flow_divert_trie new_trie;
        int insert_error = 0;
        size_t nodes_mem_size;
-       int prefix_count = 0;
+       int prefix_count = -1;
        int signing_id_count = 0;
        size_t trie_memory_size = 0;
 
@@ -2320,9 +2320,10 @@ flow_divert_handle_app_map_create(struct flow_divert_group *group, mbuf_t packet
        memset(&new_trie, 0, sizeof(new_trie));
 
        /* Get the number of shared prefixes in the new set of signing ID strings */
-       flow_divert_packet_get_tlv(packet, offset, FLOW_DIVERT_TLV_PREFIX_COUNT, sizeof(prefix_count), &prefix_count, NULL);
+       error = flow_divert_packet_get_tlv(packet, offset, FLOW_DIVERT_TLV_PREFIX_COUNT, sizeof(prefix_count), &prefix_count, NULL);
 
-       if (prefix_count < 0) {
+       if (prefix_count < 0 || error) {
+               FDLOG(LOG_ERR, &nil_pcb, "Invalid prefix count (%d) or an error occurred while reading the prefix count: %d", prefix_count, error);
                lck_rw_done(&group->lck);
                return;
        }
@@ -2332,7 +2333,12 @@ flow_divert_handle_app_map_create(struct flow_divert_group *group, mbuf_t packet
            cursor >= 0;
            cursor = flow_divert_packet_find_tlv(packet, cursor, FLOW_DIVERT_TLV_SIGNING_ID, &error, 1)) {
                uint32_t sid_size = 0;
-               flow_divert_packet_get_tlv(packet, cursor, FLOW_DIVERT_TLV_SIGNING_ID, 0, NULL, &sid_size);
+               error = flow_divert_packet_get_tlv(packet, cursor, FLOW_DIVERT_TLV_SIGNING_ID, 0, NULL, &sid_size);
+               if (error || sid_size == 0) {
+                       FDLOG(LOG_ERR, &nil_pcb, "Failed to get the length of the signing identifier at offset %d: %d", cursor, error);
+                       signing_id_count = 0;
+                       break;
+               }
                new_trie.bytes_count += sid_size;
                signing_id_count++;
        }
@@ -2382,6 +2388,7 @@ flow_divert_handle_app_map_create(struct flow_divert_group *group, mbuf_t packet
 
        new_trie.bytes = (uint8_t *)(void *)((uint8_t *)new_trie.memory + nodes_mem_size + child_maps_mem_size);
        new_trie.bytes_free_next = 0;
+       memset(new_trie.bytes, 0, bytes_mem_size);
 
        /* The root is an empty node */
        new_trie.root = trie_node_alloc(&new_trie);
@@ -2391,10 +2398,20 @@ flow_divert_handle_app_map_create(struct flow_divert_group *group, mbuf_t packet
            cursor >= 0;
            cursor = flow_divert_packet_find_tlv(packet, cursor, FLOW_DIVERT_TLV_SIGNING_ID, &error, 1)) {
                uint32_t sid_size = 0;
-               flow_divert_packet_get_tlv(packet, cursor, FLOW_DIVERT_TLV_SIGNING_ID, 0, NULL, &sid_size);
+               error = flow_divert_packet_get_tlv(packet, cursor, FLOW_DIVERT_TLV_SIGNING_ID, 0, NULL, &sid_size);
+               if (error || sid_size == 0) {
+                       FDLOG(LOG_ERR, &nil_pcb, "Failed to get the length of the signing identifier at offset %d while building: %d", cursor, error);
+                       insert_error = EINVAL;
+                       break;
+               }
                if (new_trie.bytes_free_next + sid_size <= new_trie.bytes_count) {
                        uint16_t new_node_idx;
-                       flow_divert_packet_get_tlv(packet, cursor, FLOW_DIVERT_TLV_SIGNING_ID, sid_size, &TRIE_BYTE(&new_trie, new_trie.bytes_free_next), NULL);
+                       error = flow_divert_packet_get_tlv(packet, cursor, FLOW_DIVERT_TLV_SIGNING_ID, sid_size, &TRIE_BYTE(&new_trie, new_trie.bytes_free_next), NULL);
+                       if (error) {
+                               FDLOG(LOG_ERR, &nil_pcb, "Failed to read the signing identifier at offset %d: %d", cursor, error);
+                               insert_error = EINVAL;
+                               break;
+                       }
                        new_node_idx = flow_divert_trie_insert(&new_trie, new_trie.bytes_free_next, sid_size);
                        if (new_node_idx == NULL_TRIE_IDX) {
                                insert_error = EINVAL;
index dad053c63c76617a42162830c856145c6f403007..8d9241fc1a37869ff6b1422842f7c32877c07711 100644 (file)
@@ -1956,11 +1956,9 @@ ip6_notify_pmtu(struct inpcb *in6p, struct sockaddr_in6 *dst, u_int32_t *mtu)
        }
 
        if (sbappendaddr(&so->so_rcv, SA(dst), NULL, m_mtu, NULL) == 0) {
-               m_freem(m_mtu);
-               /* XXX: should count statistics */
-       } else {
-               sorwakeup(so);
+               return;
        }
+       sorwakeup(so);
 }
 
 /*
index 8f7da7ea03bff19f3b246f78a8c4c2abc55d01b7..90400cfa5a47f3ccbac7db46edfb33881fd9b4cf 100644 (file)
@@ -561,7 +561,6 @@ loop:
        {
                error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &np->n_vnode);
        }
-notsup:
        if (error) {
                FSDBG(266, 0, np, np->n_flag, 0xb1eb1e);
                nfs_node_unlock(np);
@@ -913,7 +912,6 @@ nfs_vnop_reclaim(
 {
        vnode_t vp = ap->a_vp;
        nfsnode_t np = VTONFS(vp);
-       vfs_context_t ctx = ap->a_context;
        struct nfs_open_file *nofp, *nextnofp;
        struct nfs_file_lock *nflp, *nextnflp;
        struct nfs_lock_owner *nlop, *nextnlop;
index 51d151ecfc7c470e32c629468dd47ae1f6695cc8..9c061a4329a8920e67b90b1de87ea7a788510471 100644 (file)
@@ -892,7 +892,7 @@ nfsm_chain_add_v2sattr_f(struct nfsm_chain *nmc, struct vnode_attr *vap, uint32_
  */
 int
 nfsm_chain_add_v3sattr_f(
-       struct nfsmount *nmp,
+       __unused struct nfsmount *nmp,
        struct nfsm_chain *nmc,
        struct vnode_attr *vap)
 {
@@ -1124,7 +1124,7 @@ get_auxiliary_groups(kauth_cred_t cred, gid_t groups[NGROUPS], int count)
 }
 
 int
-nfsm_rpchead2(struct nfsmount *nmp, int sotype, int prog, int vers, int proc, int auth_type,
+nfsm_rpchead2(__unused struct nfsmount *nmp, int sotype, int prog, int vers, int proc, int auth_type,
     kauth_cred_t cred, struct nfsreq *req, mbuf_t mrest, u_int64_t *xidp, mbuf_t *mreqp)
 {
        mbuf_t mreq, mb;
@@ -1320,7 +1320,7 @@ add_cred:
  */
 int
 nfs_parsefattr(
-       struct nfsmount *nmp,
+       __unused struct nfsmount *nmp,
        struct nfsm_chain *nmc,
        int nfsvers,
        struct nfs_vattr *nvap)
index 67b409bae2d4a6ea720e93b60094ba9aee7b8aa2..902680c6855018c31b848339073c67b1beb54b28 100644 (file)
@@ -526,11 +526,10 @@ nfsmout:
  * Return an NFS volume name from the mntfrom name.
  */
 static void
-nfs_get_volname(struct mount *mp, char *volname, size_t len, vfs_context_t ctx)
+nfs_get_volname(struct mount *mp, char *volname, size_t len, __unused vfs_context_t ctx)
 {
        const char *ptr, *cptr;
        const char *mntfrom = mp->mnt_vfsstat.f_mntfromname;
-       struct nfsmount *nmp = VFSTONFS(mp);
        size_t mflen;
 
 
index abb24e2fb1ccd9f79dbefc1bc3d75dd88339cc23..1df01abc0c60171168911d27c2585b4f698f8da7 100644 (file)
@@ -1238,7 +1238,7 @@ nfs_close(
        struct nfs_open_file *nofp,
        uint32_t accessMode,
        uint32_t denyMode,
-       vfs_context_t ctx)
+       __unused vfs_context_t ctx)
 {
 #if CONFIG_NFS4
        struct nfs_lock_owner *nlop;
@@ -1827,20 +1827,6 @@ nfsmout:
        return error;
 }
 
-static int
-nfs_parse_user_access(
-       mount_t mp,
-       enum vtype type)
-{
-       int user_access = R_OK;
-       if ((vfs_flags(mp) & MNT_RDONLY) == 0) {
-               user_access |= W_OK;
-       }
-       if (type == VDIR) {
-               user_access |= X_OK;
-       }
-       return user_access;
-}
 
 /*
  * NFS getattr call from vfs.
index 5d6361ccae5bf8fd2f47b73af9ddeef38fd40f50..10e17d52e71339ed94260facae0fbcb4d6c51321 100644 (file)
@@ -1878,6 +1878,7 @@ done:
                return fixedpri_rv;
        }
 
+
        return 0;
 }
 
index 6b1d11c5c4c2a8cc408c1eb64627687718e833d8..7ce85fdd793affdac72e34e0f1734e2fd4669064 100644 (file)
@@ -51,6 +51,9 @@ extern kern_return_t arm64_lock_test(void);
 #endif
 kern_return_t kalloc_test(void);
 kern_return_t ipi_test(void);
+#if defined(KERNEL_INTEGRITY_CTRR)
+extern kern_return_t ctrr_test(void);
+#endif
 #if __ARM_PAN_AVAILABLE__
 extern kern_return_t arm64_late_pan_test(void);
 #endif
@@ -63,6 +66,9 @@ struct xnupost_test bsd_post_tests[] = {
 #ifdef __arm64__
        XNUPOST_TEST_CONFIG_BASIC(arm64_lock_test),
 #endif
+#if defined(KERNEL_INTEGRITY_CTRR)
+       XNUPOST_TEST_CONFIG_BASIC(ctrr_test),
+#endif
 #if __ARM_PAN_AVAILABLE__
        XNUPOST_TEST_CONFIG_BASIC(arm64_late_pan_test),
 #endif
index bea84e1ab6d3976490f90e6f0c8286778e151d1f..ffa15504f7fa91addd76c203870bf19731168b42 100644 (file)
 
 #include <sys/sysctl.h>
 
+#if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
+extern kern_return_t ctrr_test(void);
+
+static int
+sysctl_run_ctrr_test(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
+{
+       unsigned int dummy;
+       int error, changed;
+       error = sysctl_io_number(req, 0, sizeof(dummy), &dummy, &changed);
+       if (error || !changed) {
+               return error;
+       }
+       return ctrr_test();
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, run_ctrr_test,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
+    0, 0, sysctl_run_ctrr_test, "I", "");
+#endif /* defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST) */
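
This new file adds a kern.run_ctrr_test OID so the CTRR power-on self test can be re-run on demand (on kernels built with KERNEL_INTEGRITY_CTRR and CONFIG_XNUPOST); the handler only fires when a new value is actually written. A trivial userspace trigger, shown for illustration only and not part of this commit, run as root:

    /* Illustrative sketch: write any integer to kern.run_ctrr_test to run the test. */
    #include <stdio.h>
    #include <sys/sysctl.h>

    int
    main(void)
    {
            int one = 1;

            if (sysctlbyname("kern.run_ctrr_test", NULL, NULL, &one, sizeof(one)) != 0) {
                    perror("kern.run_ctrr_test");
                    return 1;
            }
            return 0;
    }
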
index e1497887b2e011700690c12c14e2ca85fdb80001..84627d65ecb57986a1158b06487ec35fc5f493ac 100644 (file)
@@ -2800,6 +2800,7 @@ quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
                return error;
        }
        mp = nd.ni_vp->v_mount;
+       mount_ref(mp, 0);
        vnode_put(nd.ni_vp);
        nameidone(&nd);
 
@@ -2874,6 +2875,7 @@ quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
                break;
        } /* switch */
 
+       mount_drop(mp, 0);
        return error;
 }
 #else
index b123f4ec5dd49fea63d998948e7805290a880753..3a06932c552541acb1dd0635572042d98668f4ec 100644 (file)
@@ -1,4 +1,4 @@
-19.2.0
+19.3.0
 
 # The first line of this file contains the master version number for the kernel.
 # All other instances of the kernel version in xnu are derived from this file.
index ae3296b4e5d57f89c6dc37d388370975435fc22d..9b3cef8cec56465beeb3491eddb91c646153c9b6 100644 (file)
@@ -2072,7 +2072,8 @@ IOUserClient::_sendAsyncResult64(OSAsyncReference64 reference,
                replyMsg.m.msg64.notifyHdr.size = sizeof(IOAsyncCompletionContent)
                    + numArgs * sizeof(io_user_reference_t);
                replyMsg.m.msg64.notifyHdr.type = kIOAsyncCompletionNotificationType;
-               bcopy(reference, replyMsg.m.msg64.notifyHdr.reference, sizeof(OSAsyncReference64));
+               /* Copy reference except for reference[0], which is left as 0 from the earlier bzero */
+               bcopy(&reference[1], &replyMsg.m.msg64.notifyHdr.reference[1], sizeof(OSAsyncReference64) - sizeof(reference[0]));
 
                replyMsg.m.msg64.asyncContent.result = result;
                if (numArgs) {
@@ -2089,7 +2090,8 @@ IOUserClient::_sendAsyncResult64(OSAsyncReference64 reference,
                    + numArgs * sizeof(uint32_t);
                replyMsg.m.msg32.notifyHdr.type = kIOAsyncCompletionNotificationType;
 
-               for (idx = 0; idx < kOSAsyncRefCount; idx++) {
+               /* Skip reference[0] which is left as 0 from the earlier bzero */
+               for (idx = 1; idx < kOSAsyncRefCount; idx++) {
                        replyMsg.m.msg32.notifyHdr.reference[idx] = REF32(reference[idx]);
                }
 
index 9f2b60169165682de9b68261bf1f327491e51a74..965ba291d5a20344bcba8bb5bd8cc2e8a1607df0 100644 (file)
@@ -101,7 +101,16 @@ int             debug_task;
 boolean_t up_style_idle_exit = 0;
 
 
+#if HAS_NEX_PG
+uint32_t nex_pg = 1;
+extern void set_nex_pg(void);
+#endif
 
+#if HAS_BP_RET
+/* Enable both branch target retention (0x2) and branch direction retention (0x1) across sleep */
+uint32_t bp_ret = 3;
+extern void set_bp_ret(void);
+#endif
 
 #if INTERRUPT_MASKED_DEBUG
 boolean_t interrupt_masked_debug = 1;
@@ -433,7 +442,15 @@ arm_init(
        PE_parse_boot_argn("interrupt_masked_debug_timeout", &interrupt_masked_timeout, sizeof(interrupt_masked_timeout));
 #endif
 
+#if HAS_NEX_PG
+       PE_parse_boot_argn("nexpg", &nex_pg, sizeof(nex_pg));
+       set_nex_pg(); // Apply NEX powergating settings to boot CPU
+#endif
 
+#if HAS_BP_RET
+       PE_parse_boot_argn("bpret", &bp_ret, sizeof(bp_ret));
+       set_bp_ret(); // Apply branch predictor retention settings to boot CPU
+#endif
 
        PE_parse_boot_argn("immediate_NMI", &force_immediate_debug_halt, sizeof(force_immediate_debug_halt));
 
@@ -629,6 +646,14 @@ arm_init_cpu(
        mt_wake_per_core();
 #endif /* MONOTONIC && defined(__arm64__) */
 
+#if defined(KERNEL_INTEGRITY_CTRR)
+       if (cpu_data_ptr->cluster_master) {
+               lck_spin_lock(&ctrr_cpu_start_lck);
+               ctrr_cluster_locked[cpu_data_ptr->cpu_cluster_id] = 1;
+               thread_wakeup(&ctrr_cluster_locked[cpu_data_ptr->cpu_cluster_id]);
+               lck_spin_unlock(&ctrr_cpu_start_lck);
+       }
+#endif
 
        slave_main(NULL);
 }
index a6b4c2b8c6f568609bb3e47b4f82589fafff1c06..0382aa23349723c06bb2f8bb3c907ca063ba8006 100644 (file)
 #define OS_ATOMIC_USE_LLSC  0
 #endif
 
+#if defined(__ARM_ARCH_8_4__) && defined(__arm64__)
+/* on armv8.4 16-byte aligned load/store pair is atomic */
+#undef os_atomic_load_is_plain
+#define os_atomic_load_is_plain(p) \
+               (sizeof(*(p)) <= 16 && _Alignof(typeof(*(p))) >= sizeof(*(p)))
+#endif
 
 /*
  * On armv7 & arm64, we do provide fine grained dependency injection, so
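
The added block widens os_atomic_load_is_plain() on ARMv8.4 targets: a naturally aligned object of up to 16 bytes now qualifies for a plain load, since the architecture makes an aligned 16-byte load/store pair single-copy atomic. A minimal sketch of the kind of type that now passes the predicate (the struct below is illustrative, not taken from this diff):

    /* Illustrative only: a 16-byte, 16-byte-aligned pair satisfies the predicate. */
    typedef struct {
            void          *ptr;
            unsigned long  gen;
    } __attribute__((aligned(16))) versioned_ptr_t;

    _Static_assert(sizeof(versioned_ptr_t) <= 16 &&
        _Alignof(versioned_ptr_t) >= sizeof(versioned_ptr_t),
        "eligible for a plain (single load-pair) atomic load on ARMv8.4");
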
index 72e8c780048ac24c1779447851976500f99c4f1e..e641e72d9404b4b4466a506c9a08d8cf1248f1bd 100644 (file)
@@ -377,6 +377,7 @@ cpu_data_init(cpu_data_t *cpu_data_ptr)
        cpu_data_ptr->cpu_CLW_active = 0x1UL;
 #endif
 
+#if !XNU_MONITOR
        pmap_cpu_data_t * pmap_cpu_data_ptr = &cpu_data_ptr->cpu_pmap_cpu_data;
 
        pmap_cpu_data_ptr->cpu_user_pmap = (struct pmap *) NULL;
@@ -386,6 +387,7 @@ cpu_data_init(cpu_data_t *cpu_data_ptr)
        for (i = 0; i < (sizeof(pmap_cpu_data_ptr->cpu_asid_high_bits) / sizeof(*pmap_cpu_data_ptr->cpu_asid_high_bits)); i++) {
                pmap_cpu_data_ptr->cpu_asid_high_bits[i] = 0;
        }
+#endif
        cpu_data_ptr->halt_status = CPU_NOT_HALTED;
 }
 
@@ -421,7 +423,9 @@ cpu_start(int cpu)
                cpu_data_ptr = CpuDataEntries[cpu].cpu_data_vaddr;
                cpu_data_ptr->cpu_reset_handler = (vm_offset_t) start_cpu_paddr;
 
+#if !XNU_MONITOR
                cpu_data_ptr->cpu_pmap_cpu_data.cpu_user_pmap = NULL;
+#endif
 
                if (cpu_data_ptr->cpu_processor->startup_thread != THREAD_NULL) {
                        first_thread = cpu_data_ptr->cpu_processor->startup_thread;
index 327434ece8c53a963414d7afbd9471b7cefd21a9..9d972f6e4e61a609b5d535dd8f57f820739bd04d 100644 (file)
@@ -68,6 +68,9 @@ unsigned int    real_ncpus = 1;
 boolean_t       idle_enable = FALSE;
 uint64_t        wake_abstime = 0x0ULL;
 
+#if defined(HAS_IPI)
+extern unsigned int gFastIPI;
+#endif /* defined(HAS_IPI) */
 
 cpu_data_t *
 cpu_datap(int cpu)
@@ -419,9 +422,25 @@ cpu_signal_internal(cpu_data_t *target_proc,
 
        if (!(target_proc->cpu_signal & SIGPdisabled)) {
                if (defer) {
+#if defined(HAS_IPI)
+                       if (gFastIPI) {
+                               ml_cpu_signal_deferred(target_proc->cpu_phys_id);
+                       } else {
+                               PE_cpu_signal_deferred(getCpuDatap()->cpu_id, target_proc->cpu_id);
+                       }
+#else
                        PE_cpu_signal_deferred(getCpuDatap()->cpu_id, target_proc->cpu_id);
+#endif /* defined(HAS_IPI) */
                } else {
+#if defined(HAS_IPI)
+                       if (gFastIPI) {
+                               ml_cpu_signal(target_proc->cpu_phys_id);
+                       } else {
+                               PE_cpu_signal(getCpuDatap()->cpu_id, target_proc->cpu_id);
+                       }
+#else
                        PE_cpu_signal(getCpuDatap()->cpu_id, target_proc->cpu_id);
+#endif /* defined(HAS_IPI) */
                }
        }
 
@@ -449,7 +468,15 @@ cpu_signal_cancel(cpu_data_t *target_proc)
 {
        /* TODO: Should we care about the state of a core as far as squashing deferred IPIs goes? */
        if (!(target_proc->cpu_signal & SIGPdisabled)) {
+#if defined(HAS_IPI)
+               if (gFastIPI) {
+                       ml_cpu_signal_retract(target_proc->cpu_phys_id);
+               } else {
+                       PE_cpu_signal_cancel(getCpuDatap()->cpu_id, target_proc->cpu_id);
+               }
+#else
                PE_cpu_signal_cancel(getCpuDatap()->cpu_id, target_proc->cpu_id);
+#endif /* defined(HAS_IPI) */
        }
 }
 
index 8b29c711a7c60225e6cd586c18ef23ed0117f3a6..98eac98c5b02eda767c7ad7dee6813a23ad44ff6 100644 (file)
@@ -271,7 +271,9 @@ typedef struct cpu_data {
        uint32_t                                cpu_l3_id;
        uint32_t                                cpu_l3_size;
 
+#if !XNU_MONITOR
        struct pmap_cpu_data                    cpu_pmap_cpu_data;
+#endif
        dbgwrap_thread_state_t                  halt_state;
        enum {
                CPU_NOT_HALTED = 0,
index 7a98926000e169d54cd097de7dd5c0c61218050e..8e4a31454475b45bb32aaab147b6fbc7702b95bd 100644 (file)
@@ -74,5 +74,10 @@ extern unsigned int real_ncpus;
 extern void arm64_ipi_test(void);
 #endif /* defined(CONFIG_XNUPOST) && __arm64__ */
 
+#if defined(KERNEL_INTEGRITY_CTRR)
+extern void init_ctrr_cpu_start_lock(void);
+extern lck_spin_t ctrr_cpu_start_lck;
+extern bool ctrr_cluster_locked[__ARM_CLUSTER_COUNT__];
+#endif /* defined(KERNEL_INTEGRITY_CTRR) */
 
 #endif  /* _ARM_CPU_INTERNAL_H_ */
index 73f9b0d83417ce766b017477eb9a45e661ba90e1..f976aea35e48c6a57d2b01b18c9fa9273a60fdb3 100644 (file)
@@ -185,6 +185,12 @@ cpuid_get_cpufamily(void)
                case CPU_PART_TEMPEST_ARUBA:
                        cpufamily = CPUFAMILY_ARM_VORTEX_TEMPEST;
                        break;
+#ifndef RC_HIDE_XNU_LIGHTNING
+               case CPU_PART_LIGHTNING:
+               case CPU_PART_THUNDER:
+                       cpufamily = CPUFAMILY_ARM_LIGHTNING_THUNDER;
+                       break;
+#endif /* !RC_HIDE_XNU_LIGHTNING */
                default:
                        cpufamily = CPUFAMILY_UNKNOWN;
                        break;
index 559cde9fcbfeb9f7f9f72138739d02ddd408633d..74aac691d69055da0965ee0167c62c6ebb1a6b42 100644 (file)
@@ -154,6 +154,14 @@ typedef union {
 /* H11G e-Core (ARMv8 architecture) */
 #define CPU_PART_TEMPEST_ARUBA      0x11
 
+#ifndef RC_HIDE_XNU_LIGHTNING
+/* H12 p-Core (ARMv8 architecture) */
+#define CPU_PART_LIGHTNING          0x12
+
+/* H12 e-Core (ARMv8 architecture) */
+#define CPU_PART_THUNDER            0x13
+
+#endif /* !RC_HIDE_XNU_LIGHTNING */
 
 /* Cache type identification */
 
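
The two new core IDs are mapped to CPUFAMILY_ARM_LIGHTNING_THUNDER by the cpuid_get_cpufamily() hunk above (the family constant itself lands in osfmk/mach/machine.h, which is also touched by this commit). For illustration only, userspace can observe the result through the existing hw.cpufamily sysctl; the #ifdef is just a guard for SDK headers that predate the constant:

    /* Illustrative sketch: report the CPU family seen by userspace. */
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/sysctl.h>
    #include <mach/machine.h>

    int
    main(void)
    {
            uint32_t family = 0;
            size_t len = sizeof(family);

            if (sysctlbyname("hw.cpufamily", &family, &len, NULL, 0) != 0) {
                    perror("hw.cpufamily");
                    return 1;
            }
    #ifdef CPUFAMILY_ARM_LIGHTNING_THUNDER
            if (family == CPUFAMILY_ARM_LIGHTNING_THUNDER) {
                    printf("Lightning/Thunder CPU family\n");
            }
    #endif
            printf("hw.cpufamily = 0x%x\n", family);
            return 0;
    }
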
index 917e68c2f5ad54e9f4b88c0f8314cc27ebc537ad..b7e66378a76572abc3a4526b08c472819e2bf227 100644 (file)
@@ -106,14 +106,14 @@ LEXT(vfptrash_data)
 #if __arm64__
         .section __DATA, __const
 
-#if defined(KERNEL_INTEGRITY_KTRR)
+#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
 /* reserve space for read only page tables */
         .align 14
 LEXT(ropagetable_begin)
         .space 14*16*1024,0
 #else
 LEXT(ropagetable_begin)
-#endif /* defined(KERNEL_INTEGRITY_KTRR)*/
+#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
 
 LEXT(ropagetable_end)
 
index db581e897246ca1c34231b59040126de3f8c84c4..759802bdd9f9da10a646d86593193c941d1af156 100644 (file)
@@ -613,10 +613,10 @@ unsigned long           monitor_call(uintptr_t callnum, uintptr_t arg1,
     uintptr_t arg2, uintptr_t arg3);
 #endif /* MONITOR */
 
-#if defined(KERNEL_INTEGRITY_KTRR)
+#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
 void rorgn_stash_range(void);
 void rorgn_lockdown(void);
-#endif /* defined(KERNEL_INTEGRITY_KTRR)*/
+#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
 
 #if __ARM_KERNEL_PROTECT__
 extern void set_vbar_el1(uint64_t);
index 02f73391024b7d2a63fa180ba0e69866b866491d..b433dd658d4502534954164cc2005ba3a053dff7 100644 (file)
@@ -55,7 +55,12 @@ extern boolean_t interrupt_masked_debug;
 extern uint64_t interrupt_masked_timeout;
 #endif
 
+#if !HAS_CONTINUOUS_HWCLOCK
 extern uint64_t mach_absolutetime_asleep;
+#else
+extern uint64_t wake_abstime;
+static uint64_t wake_conttime = UINT64_MAX;
+#endif
 
 static void
 sched_perfcontrol_oncore_default(perfcontrol_state_t new_thread_state __unused, going_on_core_t on __unused)
@@ -686,26 +691,49 @@ ml_get_abstime_offset(void)
 uint64_t
 ml_get_conttime_offset(void)
 {
+#if HAS_CONTINUOUS_HWCLOCK
+       return 0;
+#else
        return rtclock_base_abstime + mach_absolutetime_asleep;
+#endif
 }
 
 uint64_t
 ml_get_time_since_reset(void)
 {
+#if HAS_CONTINUOUS_HWCLOCK
+       if (wake_conttime == UINT64_MAX) {
+               return UINT64_MAX;
+       } else {
+               return mach_continuous_time() - wake_conttime;
+       }
+#else
        /* The timebase resets across S2R, so just return the raw value. */
        return ml_get_hwclock();
+#endif
 }
 
 void
 ml_set_reset_time(__unused uint64_t wake_time)
 {
+#if HAS_CONTINUOUS_HWCLOCK
+       wake_conttime = wake_time;
+#endif
 }
 
 uint64_t
 ml_get_conttime_wake_time(void)
 {
+#if HAS_CONTINUOUS_HWCLOCK
+       /*
+        * For now, we will reconstitute the timebase value from
+        * cpu_timebase_init and use it as the wake time.
+        */
+       return wake_abstime - ml_get_abstime_offset();
+#else /* HAS_CONTINUOUS_HWCLOCK */
        /* The wake time is simply our continuous time offset. */
        return ml_get_conttime_offset();
+#endif /* HAS_CONTINUOUS_HWCLOCK */
 }
 
 /*
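
On SoCs with HAS_CONTINUOUS_HWCLOCK the hardware timebase keeps counting across sleep, so ml_get_conttime_offset() collapses to 0 and the time since reset is instead reconstructed from the saved wake_conttime. The same distinction is visible from userspace through the long-standing mach_absolute_time()/mach_continuous_time() pair; the sketch below (illustrative, not part of this diff) prints their offset, which is the accumulated sleep time on parts without a continuous hardware clock and stays near zero where the clock never stops:

    /*
     * Illustrative sketch: mach_continuous_time() includes time asleep while
     * mach_absolute_time() classically does not, so their difference
     * approximates total sleep time (and is ~0 on continuous-hwclock parts).
     */
    #include <stdint.h>
    #include <stdio.h>
    #include <mach/mach_time.h>

    int
    main(void)
    {
            mach_timebase_info_data_t tb;
            mach_timebase_info(&tb);

            uint64_t asleep_ticks = mach_continuous_time() - mach_absolute_time();
            uint64_t asleep_ns = asleep_ticks * tb.numer / tb.denom;

            printf("accumulated sleep time: %llu ns\n", (unsigned long long)asleep_ns);
            return 0;
    }
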
index f40eeb88c51a76757a13b321df5819ab0941d6bf..003be491f679ff098aa08563668e41c3609caee3 100644 (file)
@@ -537,10 +537,135 @@ int pmap_stats_assert = 1;
 #endif /* DEVELOPMENT || DEBUG */
 
 
+#if XNU_MONITOR
+/*
+ * PPL External References.
+ */
+extern vm_offset_t   segPPLDATAB;
+extern unsigned long segSizePPLDATA;
+extern vm_offset_t   segPPLTEXTB;
+extern unsigned long segSizePPLTEXT;
+#if __APRR_SUPPORTED__
+extern vm_offset_t   segPPLTRAMPB;
+extern unsigned long segSizePPLTRAMP;
+extern void ppl_trampoline_start;
+extern void ppl_trampoline_end;
+#endif
+extern vm_offset_t   segPPLDATACONSTB;
+extern unsigned long segSizePPLDATACONST;
+
+
+/*
+ * PPL Global Variables
+ */
+
+#if (DEVELOPMENT || DEBUG)
+/* Indicates if the PPL will enforce mapping policies; set by -unsafe_kernel_text */
+SECURITY_READ_ONLY_LATE(boolean_t) pmap_ppl_disable = FALSE;
+#else
+const boolean_t pmap_ppl_disable = FALSE;
+#endif
+
+/* Indicates if the PPL has started applying APRR. */
+boolean_t pmap_ppl_locked_down MARK_AS_PMAP_DATA = FALSE;
+
+/*
+ * The PPL cannot invoke the kernel in order to allocate memory, so we must
+ * maintain a list of free pages that the PPL owns.  The kernel can give the PPL
+ * additional pages.
+ */
+decl_simple_lock_data(, pmap_ppl_free_page_lock MARK_AS_PMAP_DATA);
+void ** pmap_ppl_free_page_list MARK_AS_PMAP_DATA = NULL;
+uint64_t pmap_ppl_free_page_count MARK_AS_PMAP_DATA = 0;
+uint64_t pmap_ppl_pages_returned_to_kernel_count_total = 0;
+
+struct pmap_cpu_data_array_entry pmap_cpu_data_array[MAX_CPUS] MARK_AS_PMAP_DATA;
+
+#ifdef CPU_CLUSTER_OFFSETS
+const uint64_t pmap_cluster_offsets[] = CPU_CLUSTER_OFFSETS;
+_Static_assert((sizeof(pmap_cluster_offsets) / sizeof(pmap_cluster_offsets[0])) == __ARM_CLUSTER_COUNT__,
+    "pmap_cluster_offsets[] count does not match __ARM_CLUSTER_COUNT__");
+#endif
+
+extern void *pmap_stacks_start;
+extern void *pmap_stacks_end;
+SECURITY_READ_ONLY_LATE(pmap_paddr_t) pmap_stacks_start_pa = 0;
+SECURITY_READ_ONLY_LATE(pmap_paddr_t) pmap_stacks_end_pa = 0;
+SECURITY_READ_ONLY_LATE(pmap_paddr_t) ppl_cpu_save_area_start = 0;
+SECURITY_READ_ONLY_LATE(pmap_paddr_t) ppl_cpu_save_area_end = 0;
+
+/* Allocation data/locks for pmap structures. */
+decl_simple_lock_data(, pmap_free_list_lock MARK_AS_PMAP_DATA);
+SECURITY_READ_ONLY_LATE(unsigned long) pmap_array_count = 0;
+SECURITY_READ_ONLY_LATE(void *) pmap_array_begin = NULL;
+SECURITY_READ_ONLY_LATE(void *) pmap_array_end = NULL;
+SECURITY_READ_ONLY_LATE(pmap_t) pmap_array = NULL;
+pmap_t pmap_free_list MARK_AS_PMAP_DATA = NULL;
+
+/* Allocation data/locks/structs for task ledger structures. */
+#define PMAP_LEDGER_DATA_BYTES \
+       (((sizeof(task_ledgers) / sizeof(int)) * sizeof(struct ledger_entry)) + sizeof(struct ledger))
+
+/*
+ * The maximum number of ledgers allowed is the maximum number of tasks
+ * allowed on the system plus some more, i.e. ~10% of total tasks = 200.
+ */
+#define MAX_PMAP_LEDGERS (MAX_ASID + 200)
+
+typedef struct pmap_ledger_data {
+       char pld_data[PMAP_LEDGER_DATA_BYTES];
+} pmap_ledger_data_t;
+
+typedef struct pmap_ledger {
+       union {
+               struct pmap_ledger_data ple_data;
+               struct pmap_ledger * next;
+       };
+
+       struct pmap_ledger ** back_ptr;
+} pmap_ledger_t;
+
+SECURITY_READ_ONLY_LATE(bool) pmap_ledger_alloc_initialized = false;
+decl_simple_lock_data(, pmap_ledger_lock MARK_AS_PMAP_DATA);
+SECURITY_READ_ONLY_LATE(void *) pmap_ledger_refcnt_begin = NULL;
+SECURITY_READ_ONLY_LATE(void *) pmap_ledger_refcnt_end = NULL;
+SECURITY_READ_ONLY_LATE(os_refcnt_t *) pmap_ledger_refcnt = NULL;
+SECURITY_READ_ONLY_LATE(void *) pmap_ledger_ptr_array_begin = NULL;
+SECURITY_READ_ONLY_LATE(void *) pmap_ledger_ptr_array_end = NULL;
+SECURITY_READ_ONLY_LATE(pmap_ledger_t * *) pmap_ledger_ptr_array = NULL;
+uint64_t pmap_ledger_ptr_array_free_index MARK_AS_PMAP_DATA = 0;
+pmap_ledger_t * pmap_ledger_free_list MARK_AS_PMAP_DATA = NULL;
+
+#define pmap_ledger_debit(p, e, a) ledger_debit_nocheck((p)->ledger, e, a)
+#define pmap_ledger_credit(p, e, a) ledger_credit_nocheck((p)->ledger, e, a)
+
+static inline void
+pmap_check_ledger_fields(ledger_t ledger)
+{
+       if (ledger == NULL) {
+               return;
+       }
+
+       thread_t cur_thread = current_thread();
+       ledger_check_new_balance(cur_thread, ledger, task_ledgers.alternate_accounting);
+       ledger_check_new_balance(cur_thread, ledger, task_ledgers.alternate_accounting_compressed);
+       ledger_check_new_balance(cur_thread, ledger, task_ledgers.internal);
+       ledger_check_new_balance(cur_thread, ledger, task_ledgers.internal_compressed);
+       ledger_check_new_balance(cur_thread, ledger, task_ledgers.page_table);
+       ledger_check_new_balance(cur_thread, ledger, task_ledgers.phys_footprint);
+       ledger_check_new_balance(cur_thread, ledger, task_ledgers.phys_mem);
+       ledger_check_new_balance(cur_thread, ledger, task_ledgers.tkm_private);
+       ledger_check_new_balance(cur_thread, ledger, task_ledgers.wired_mem);
+}
+
+#define pmap_ledger_check_balance(p) pmap_check_ledger_fields((p)->ledger)
+
+#else /* XNU_MONITOR */
 
 #define pmap_ledger_debit(p, e, a) ledger_debit((p)->ledger, e, a)
 #define pmap_ledger_credit(p, e, a) ledger_credit((p)->ledger, e, a)
 
+#endif /* !XNU_MONITOR */
 
 #if DEVELOPMENT || DEBUG
 int panic_on_unsigned_execute = 0;
@@ -799,6 +924,29 @@ typedef u_int16_t pp_attr_t;
 #define PP_ATTR_REFFAULT                0x1000
 #define PP_ATTR_MODFAULT                0x2000
 
+#if XNU_MONITOR
+/*
+ * Denotes that a page is owned by the PPL.  This is modified/checked with the
+ * PVH lock held, to avoid ownership related races.  This does not need to be a
+ * PP_ATTR bit (as we have the lock), but for now this is a convenient place to
+ * put the bit.
+ */
+#define PP_ATTR_MONITOR                 0x4000
+
+/*
+ * Denotes that a page *cannot* be owned by the PPL.  This is required in order
+ * to temporarily 'pin' kernel pages that are used to store PPL output parameters.
+ * Otherwise a malicious or buggy caller could pass PPL-owned memory for these
+ * parameters and in so doing stage a write gadget against the PPL.
+ */
+#define PP_ATTR_NO_MONITOR              0x8000
+
+/*
+ * All of the bits owned by the PPL; kernel requests to set or clear these bits
+ * are illegal.
+ */
+#define PP_ATTR_PPL_OWNED_BITS          (PP_ATTR_MONITOR | PP_ATTR_NO_MONITOR)
+#endif
 
 SECURITY_READ_ONLY_LATE(pp_attr_t*)     pp_attr_table;
 
@@ -840,6 +988,14 @@ static bitmap_t asid_bitmap[BITMAP_LEN(MAX_ASID)] MARK_AS_PMAP_DATA;
 SECURITY_READ_ONLY_LATE(pmap_t) sharedpage_pmap;
 #endif
 
+#if XNU_MONITOR
+/*
+ * We define our target as 8 pages; enough for 2 page table pages, a PTD page,
+ * and a PV page; in essence, twice as many pages as may be necessary to satisfy
+ * a single pmap_enter request.
+ */
+#define PMAP_MIN_FREE_PPL_PAGES 8
+#endif
 
 #define pa_index(pa)                                                                    \
        (atop((pa) - vm_first_phys))
@@ -1105,6 +1261,25 @@ SECURITY_READ_ONLY_LATE(pmap_t) sharedpage_pmap;
 #define pa_clear_reference(x)                                                           \
        pa_clear_bits(x, PP_ATTR_REFERENCED)
 
+#if XNU_MONITOR
+#define pa_set_monitor(x) \
+       pa_set_bits((x), PP_ATTR_MONITOR)
+
+#define pa_clear_monitor(x) \
+       pa_clear_bits((x), PP_ATTR_MONITOR)
+
+#define pa_test_monitor(x) \
+       pa_test_bits((x), PP_ATTR_MONITOR)
+
+#define pa_set_no_monitor(x) \
+       pa_set_bits((x), PP_ATTR_NO_MONITOR)
+
+#define pa_clear_no_monitor(x) \
+       pa_clear_bits((x), PP_ATTR_NO_MONITOR)
+
+#define pa_test_no_monitor(x) \
+       pa_test_bits((x), PP_ATTR_NO_MONITOR)
+#endif
 
 #define IS_INTERNAL_PAGE(pai) \
        ppattr_test_bits(&pp_attr_table[pai], PP_ATTR_INTERNAL)
@@ -1292,11 +1467,61 @@ lck_grp_t pmap_lck_grp;
 #define current_pmap()                                                                  \
        (vm_map_pmap(current_thread()->map))
 
+#if XNU_MONITOR
+/*
+ * PPL-related macros.
+ */
+#define ARRAY_ELEM_PTR_IS_VALID(_ptr_, _elem_size_, _array_begin_, _array_end_) \
+       (((_ptr_) >= (typeof(_ptr_))_array_begin_) && \
+        ((_ptr_) < (typeof(_ptr_))_array_end_) && \
+        !((((void *)(_ptr_)) - ((void *)_array_begin_)) % (_elem_size_)))
+
+#define PMAP_PTR_IS_VALID(x) ARRAY_ELEM_PTR_IS_VALID(x, sizeof(struct pmap), pmap_array_begin, pmap_array_end)
+
+#define USER_PMAP_IS_VALID(x) (PMAP_PTR_IS_VALID(x) && (os_atomic_load(&(x)->ref_count, relaxed) > 0))
+
+#define VALIDATE_USER_PMAP(x)                                                           \
+       if (__improbable(!USER_PMAP_IS_VALID(x)))                                       \
+               panic("%s: invalid pmap %p", __func__, (x));
+
+#define VALIDATE_PMAP(x)                                                                \
+       if (__improbable(((x) != kernel_pmap) && !USER_PMAP_IS_VALID(x)))               \
+               panic("%s: invalid pmap %p", __func__, (x));
+
+#define VALIDATE_LEDGER_PTR(x) \
+       if (__improbable(!ARRAY_ELEM_PTR_IS_VALID(x, sizeof(void *), pmap_ledger_ptr_array_begin, pmap_ledger_ptr_array_end))) \
+               panic("%s: invalid ledger ptr %p", __func__, (x));
+
+#define ARRAY_ELEM_INDEX(x, _elem_size_, _array_begin_) ((uint64_t)((((void *)(x)) - (_array_begin_)) / (_elem_size_)))
+
+static uint64_t
+pmap_ledger_validate(void * ledger)
+{
+       uint64_t array_index;
+       pmap_ledger_t ** ledger_ptr_array_ptr = ((pmap_ledger_t*)ledger)->back_ptr;
+       VALIDATE_LEDGER_PTR(ledger_ptr_array_ptr);
+       array_index = ARRAY_ELEM_INDEX(ledger_ptr_array_ptr, sizeof(pmap_ledger_t *), pmap_ledger_ptr_array_begin);
+
+       if (array_index >= MAX_PMAP_LEDGERS) {
+               panic("%s: ledger %p array index invalid, index was %#llx", __func__, ledger, array_index);
+       }
+
+       pmap_ledger_t *ledger_ptr = *ledger_ptr_array_ptr;
+
+       if (__improbable(ledger_ptr != ledger)) {
+               panic("%s: ledger pointer mismatch, %p != %p", __func__, ledger, ledger_ptr);
+       }
+
+       return array_index;
+}
+
+#else /* XNU_MONITOR */
 
 #define VALIDATE_USER_PMAP(x)
 #define VALIDATE_PMAP(x)
 #define VALIDATE_LEDGER(x)
 
+#endif
 
 #if DEVELOPMENT || DEBUG
 
@@ -1469,9 +1694,43 @@ static void pmap_unpin_kernel_pages(vm_offset_t kva, size_t nbytes);
 static void pmap_trim_self(pmap_t pmap);
 static void pmap_trim_subord(pmap_t subord);
 
+#if __APRR_SUPPORTED__
+static uint64_t pte_to_xprr_perm(pt_entry_t pte);
+static pt_entry_t xprr_perm_to_pte(uint64_t perm);
+#endif /* __APRR_SUPPORTED__*/
+
+#if XNU_MONITOR
+static pmap_paddr_t pmap_alloc_page_for_kern(void);
+static void pmap_alloc_page_for_ppl(void);
+
+
+/*
+ * This macro generates prototypes for the *_internal functions, which
+ * represent the PPL interface.  When the PPL is enabled, this will also
+ * generate prototypes for the PPL entrypoints (*_ppl), as well as generating
+ * the entrypoints.
+ */
+#define GEN_ASM_NAME(__function_name) _##__function_name##_ppl
+
+#define PMAP_SUPPORT_PROTOTYPES_WITH_ASM_INTERNAL(__return_type, __function_name, __function_args, __function_index, __assembly_function_name) \
+       static __return_type __function_name##_internal __function_args; \
+       extern __return_type __function_name##_ppl __function_args; \
+       __asm__ (".text \n" \
+                ".align 2 \n" \
+                ".globl " #__assembly_function_name "\n" \
+                #__assembly_function_name ":\n" \
+                "mov x15, " #__function_index "\n" \
+                "b _aprr_ppl_enter\n")
+
+#define PMAP_SUPPORT_PROTOTYPES_WITH_ASM(__return_type, __function_name, __function_args, __function_index, __assembly_function_name) \
+       PMAP_SUPPORT_PROTOTYPES_WITH_ASM_INTERNAL(__return_type, __function_name, __function_args, __function_index, __assembly_function_name)
 
+#define PMAP_SUPPORT_PROTOTYPES(__return_type, __function_name, __function_args, __function_index) \
+       PMAP_SUPPORT_PROTOTYPES_WITH_ASM(__return_type, __function_name, __function_args, __function_index, GEN_ASM_NAME(__function_name))
+#else /* XNU_MONITOR */
 #define PMAP_SUPPORT_PROTOTYPES(__return_type, __function_name, __function_args, __function_index) \
        static __return_type __function_name##_internal __function_args
+#endif /* XNU_MONITOR */
 
 PMAP_SUPPORT_PROTOTYPES(
        kern_return_t,
@@ -1628,7 +1887,7 @@ PMAP_SUPPORT_PROTOTYPES(
        void,
        pmap_set_nested, (pmap_t pmap), PMAP_SET_NESTED_INDEX);
 
-#if MACH_ASSERT
+#if MACH_ASSERT || XNU_MONITOR
 PMAP_SUPPORT_PROTOTYPES(
        void,
        pmap_set_process, (pmap_t pmap,
@@ -1647,12 +1906,22 @@ PMAP_SUPPORT_PROTOTYPES(
        uint64_t size,
        unsigned int option), PMAP_UNNEST_OPTIONS_INDEX);
 
+#if XNU_MONITOR
+PMAP_SUPPORT_PROTOTYPES(
+       void,
+       pmap_cpu_data_init, (unsigned int cpu_number), PMAP_CPU_DATA_INIT_INDEX);
+#endif
 
 PMAP_SUPPORT_PROTOTYPES(
        void,
        phys_attribute_set, (ppnum_t pn,
        unsigned int bits), PHYS_ATTRIBUTE_SET_INDEX);
 
+#if XNU_MONITOR
+PMAP_SUPPORT_PROTOTYPES(
+       void,
+       pmap_mark_page_as_ppl_page, (pmap_paddr_t pa), PMAP_MARK_PAGE_AS_PMAP_PAGE_INDEX);
+#endif
 
 PMAP_SUPPORT_PROTOTYPES(
        void,
@@ -1673,6 +1942,11 @@ PMAP_SUPPORT_PROTOTYPES(
        void,
        pmap_clear_user_ttb, (void), PMAP_CLEAR_USER_TTB_INDEX);
 
+#if XNU_MONITOR
+PMAP_SUPPORT_PROTOTYPES(
+       uint64_t,
+       pmap_release_ppl_pages_to_kernel, (void), PMAP_RELEASE_PAGES_TO_KERNEL_INDEX);
+#endif
 
 PMAP_SUPPORT_PROTOTYPES(
        void,
@@ -1686,10 +1960,21 @@ PMAP_SUPPORT_PROTOTYPES(
        addr64_t nstart,
        uint64_t size), PMAP_TRIM_INDEX);
 
+#if HAS_APPLE_PAC && XNU_MONITOR
+PMAP_SUPPORT_PROTOTYPES(
+       void *,
+       pmap_sign_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator), PMAP_SIGN_USER_PTR);
+PMAP_SUPPORT_PROTOTYPES(
+       void *,
+       pmap_auth_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator), PMAP_AUTH_USER_PTR);
+#endif /* HAS_APPLE_PAC && XNU_MONITOR */
 
 
 
 
+#if XNU_MONITOR
+static void pmap_mark_page_as_ppl_page(pmap_paddr_t pa);
+#endif
 
 void pmap_footprint_suspend(vm_map_t    map,
     boolean_t   suspend);
@@ -1699,6 +1984,22 @@ PMAP_SUPPORT_PROTOTYPES(
        boolean_t suspend),
        PMAP_FOOTPRINT_SUSPEND_INDEX);
 
+#if XNU_MONITOR
+PMAP_SUPPORT_PROTOTYPES(
+       void,
+       pmap_ledger_alloc_init, (size_t),
+       PMAP_LEDGER_ALLOC_INIT_INDEX);
+
+PMAP_SUPPORT_PROTOTYPES(
+       ledger_t,
+       pmap_ledger_alloc, (void),
+       PMAP_LEDGER_ALLOC_INDEX);
+
+PMAP_SUPPORT_PROTOTYPES(
+       void,
+       pmap_ledger_free, (ledger_t),
+       PMAP_LEDGER_FREE_INDEX);
+#endif
 
 #if CONFIG_PGTRACE
 boolean_t pgtrace_enabled = 0;
@@ -1756,6 +2057,91 @@ long long alloc_pmap_pages_count __attribute__((aligned(8))) = 0LL;
 
 int pt_fake_zone_index = -1;            /* index of pmap fake zone */
 
+#if XNU_MONITOR
+/*
+ * Table of function pointers used for PPL dispatch.
+ */
+const void * const ppl_handler_table[PMAP_COUNT] = {
+       [ARM_FAST_FAULT_INDEX] = arm_fast_fault_internal,
+       [ARM_FORCE_FAST_FAULT_INDEX] = arm_force_fast_fault_internal,
+       [MAPPING_FREE_PRIME_INDEX] = mapping_free_prime_internal,
+       [MAPPING_REPLENISH_INDEX] = mapping_replenish_internal,
+       [PHYS_ATTRIBUTE_CLEAR_INDEX] = phys_attribute_clear_internal,
+       [PHYS_ATTRIBUTE_SET_INDEX] = phys_attribute_set_internal,
+       [PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX] = pmap_batch_set_cache_attributes_internal,
+       [PMAP_CHANGE_WIRING_INDEX] = pmap_change_wiring_internal,
+       [PMAP_CREATE_INDEX] = pmap_create_options_internal,
+       [PMAP_DESTROY_INDEX] = pmap_destroy_internal,
+       [PMAP_ENTER_OPTIONS_INDEX] = pmap_enter_options_internal,
+       [PMAP_EXTRACT_INDEX] = pmap_extract_internal,
+       [PMAP_FIND_PHYS_INDEX] = pmap_find_phys_internal,
+       [PMAP_INSERT_SHAREDPAGE_INDEX] = pmap_insert_sharedpage_internal,
+       [PMAP_IS_EMPTY_INDEX] = pmap_is_empty_internal,
+       [PMAP_MAP_CPU_WINDOWS_COPY_INDEX] = pmap_map_cpu_windows_copy_internal,
+       [PMAP_MARK_PAGE_AS_PMAP_PAGE_INDEX] = pmap_mark_page_as_ppl_page_internal,
+       [PMAP_NEST_INDEX] = pmap_nest_internal,
+       [PMAP_PAGE_PROTECT_OPTIONS_INDEX] = pmap_page_protect_options_internal,
+       [PMAP_PROTECT_OPTIONS_INDEX] = pmap_protect_options_internal,
+       [PMAP_QUERY_PAGE_INFO_INDEX] = pmap_query_page_info_internal,
+       [PMAP_QUERY_RESIDENT_INDEX] = pmap_query_resident_internal,
+       [PMAP_REFERENCE_INDEX] = pmap_reference_internal,
+       [PMAP_REMOVE_OPTIONS_INDEX] = pmap_remove_options_internal,
+       [PMAP_RETURN_INDEX] = pmap_return_internal,
+       [PMAP_SET_CACHE_ATTRIBUTES_INDEX] = pmap_set_cache_attributes_internal,
+       [PMAP_UPDATE_COMPRESSOR_PAGE_INDEX] = pmap_update_compressor_page_internal,
+       [PMAP_SET_NESTED_INDEX] = pmap_set_nested_internal,
+       [PMAP_SET_PROCESS_INDEX] = pmap_set_process_internal,
+       [PMAP_SWITCH_INDEX] = pmap_switch_internal,
+       [PMAP_SWITCH_USER_TTB_INDEX] = pmap_switch_user_ttb_internal,
+       [PMAP_CLEAR_USER_TTB_INDEX] = pmap_clear_user_ttb_internal,
+       [PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX] = pmap_unmap_cpu_windows_copy_internal,
+       [PMAP_UNNEST_OPTIONS_INDEX] = pmap_unnest_options_internal,
+       [PMAP_FOOTPRINT_SUSPEND_INDEX] = pmap_footprint_suspend_internal,
+       [PMAP_CPU_DATA_INIT_INDEX] = pmap_cpu_data_init_internal,
+       [PMAP_RELEASE_PAGES_TO_KERNEL_INDEX] = pmap_release_ppl_pages_to_kernel_internal,
+       [PMAP_SET_JIT_ENTITLED_INDEX] = pmap_set_jit_entitled_internal,
+       [PMAP_TRIM_INDEX] = pmap_trim_internal,
+       [PMAP_LEDGER_ALLOC_INIT_INDEX] = pmap_ledger_alloc_init_internal,
+       [PMAP_LEDGER_ALLOC_INDEX] = pmap_ledger_alloc_internal,
+       [PMAP_LEDGER_FREE_INDEX] = pmap_ledger_free_internal,
+#if HAS_APPLE_PAC && XNU_MONITOR
+       [PMAP_SIGN_USER_PTR] = pmap_sign_user_ptr_internal,
+       [PMAP_AUTH_USER_PTR] = pmap_auth_user_ptr_internal,
+#endif /* HAS_APPLE_PAC && XNU_MONITOR */
+};
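+
+/*
+ * Presumably _aprr_ppl_enter uses the index placed in x15 by the *_ppl
+ * trampolines to pick an entry from this table once execution is inside the
+ * PPL; adding a PPL entry point therefore means adding both a
+ * PMAP_SUPPORT_PROTOTYPES() declaration and a slot here.  The table is const
+ * and bounded by PMAP_COUNT, presumably so the dispatch targets cannot be
+ * rewritten once the PPL is locked down.
+ */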
+
+static uint64_t
+pmap_get_ppl_cpu_id(void)
+{
+       uint64_t mpidr_el1_value = 0;
+
+       /* We identify the CPU based on the constant bits of MPIDR_EL1. */
+       MRS(mpidr_el1_value, "MPIDR_EL1");
+
+#ifdef CPU_CLUSTER_OFFSETS
+       uint64_t cluster_id = (mpidr_el1_value & MPIDR_AFF1_MASK) >> MPIDR_AFF1_SHIFT;
+       assert(cluster_id < (sizeof(pmap_cluster_offsets) / sizeof(pmap_cluster_offsets[0])));
+
+       /* For multi-cluster configurations, AFF0 reflects the core number within the cluster. */
+       mpidr_el1_value = (mpidr_el1_value & MPIDR_AFF0_MASK) + pmap_cluster_offsets[cluster_id];
+#else
+       /*
+        * AFF2 is not constant (it can change for e-core versus p-core on H9),
+        * so mask it out.
+        */
+       mpidr_el1_value &= MPIDR_AFF0_MASK;
+#endif
+
+       if (mpidr_el1_value > MAX_CPUS) {
+               panic("%s: mpidr_el1_value=%#llx > MAX_CPUS=%#x",
+                   __FUNCTION__, mpidr_el1_value, MAX_CPUS);
+       }
+
+       return mpidr_el1_value;
+}
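+
+/*
+ * Worked example (cluster offsets are hypothetical): on a single-cluster
+ * part, a core with MPIDR_EL1.Aff0 == 3 yields PPL CPU id 3.  With
+ * CPU_CLUSTER_OFFSETS and pmap_cluster_offsets of, say, { 0, 4 }, a core in
+ * cluster 1 (Aff1 == 1) with Aff0 == 2 yields 2 + 4 == 6.  Either way the
+ * result must not exceed MAX_CPUS, which the panic above enforces.
+ */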
+
+
+#endif
 
 
 /*
@@ -1766,18 +2152,80 @@ pmap_cpu_data_init_internal(unsigned int cpu_number)
 {
        pmap_cpu_data_t * pmap_cpu_data = pmap_get_cpu_data();
 
+#if XNU_MONITOR
+       /* Verify cacheline-aligned */
+       assert(((vm_offset_t)pmap_cpu_data & ((1 << L2_CLINE) - 1)) == 0);
+       if (pmap_cpu_data->cpu_number != PMAP_INVALID_CPU_NUM) {
+               panic("%s: pmap_cpu_data->cpu_number=%u, "
+                   "cpu_number=%u",
+                   __FUNCTION__, pmap_cpu_data->cpu_number,
+                   cpu_number);
+       }
+#endif
        pmap_cpu_data->cpu_number = cpu_number;
 }
 
 void
 pmap_cpu_data_init(void)
 {
+#if XNU_MONITOR
+       pmap_cpu_data_init_ppl(cpu_number());
+#else
        pmap_cpu_data_init_internal(cpu_number());
+#endif
 }
 
 static void
 pmap_cpu_data_array_init(void)
 {
+#if XNU_MONITOR
+       unsigned int i = 0;
+       pmap_paddr_t ppl_cpu_save_area_cur = 0;
+       pt_entry_t template, *pte_p;
+       vm_offset_t stack_va = (vm_offset_t)pmap_stacks_start + ARM_PGBYTES;
+       assert((pmap_stacks_start != NULL) && (pmap_stacks_end != NULL));
+       pmap_stacks_start_pa = avail_start;
+
+       for (i = 0; i < MAX_CPUS; i++) {
+               for (vm_offset_t cur_va = stack_va; cur_va < (stack_va + PPL_STACK_SIZE); cur_va += ARM_PGBYTES) {
+                       assert(cur_va < (vm_offset_t)pmap_stacks_end);
+                       pte_p = pmap_pte(kernel_pmap, cur_va);
+                       assert(*pte_p == ARM_PTE_EMPTY);
+                       template = pa_to_pte(avail_start) | ARM_PTE_AF | ARM_PTE_SH(SH_OUTER_MEMORY) | ARM_PTE_TYPE |
+                           ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT) | xprr_perm_to_pte(XPRR_PPL_RW_PERM);
+#if __ARM_KERNEL_PROTECT__
+                       template |= ARM_PTE_NG;
+#endif /* __ARM_KERNEL_PROTECT__ */
+                       WRITE_PTE(pte_p, template);
+                       __builtin_arm_isb(ISB_SY);
+                       avail_start += ARM_PGBYTES;
+               }
+#if KASAN
+               kasan_map_shadow(stack_va, PPL_STACK_SIZE, false);
+#endif
+               pmap_cpu_data_array[i].cpu_data.cpu_id = i;
+               pmap_cpu_data_array[i].cpu_data.cpu_number = PMAP_INVALID_CPU_NUM;
+               pmap_cpu_data_array[i].cpu_data.ppl_state = PPL_STATE_KERNEL;
+               pmap_cpu_data_array[i].cpu_data.ppl_stack = (void*)(stack_va + PPL_STACK_SIZE);
+               stack_va += (PPL_STACK_SIZE + ARM_PGBYTES);
+       }
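+       /*
+        * Note that stack_va starts one page past pmap_stacks_start and
+        * advances by PPL_STACK_SIZE + ARM_PGBYTES per CPU, so consecutive
+        * PPL stacks are separated by an unmapped page (the guard pages
+        * mentioned in pmap_static_allocations_done()).  Only the stack
+        * pages themselves are entered into the kernel page tables, with
+        * PPL RW permissions and backing pages taken from avail_start.
+        */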
+       sync_tlb_flush();
+       pmap_stacks_end_pa = avail_start;
+
+       ppl_cpu_save_area_start = avail_start;
+       ppl_cpu_save_area_end = ppl_cpu_save_area_start;
+       ppl_cpu_save_area_cur = ppl_cpu_save_area_start;
+
+       for (i = 0; i < MAX_CPUS; i++) {
+               while ((ppl_cpu_save_area_end - ppl_cpu_save_area_cur) < sizeof(arm_context_t)) {
+                       avail_start += PAGE_SIZE;
+                       ppl_cpu_save_area_end = avail_start;
+               }
+
+               pmap_cpu_data_array[i].cpu_data.save_area = (arm_context_t *)phystokv(ppl_cpu_save_area_cur);
+               ppl_cpu_save_area_cur += sizeof(arm_context_t);
+       }
+#endif
 
        pmap_cpu_data_init();
 }
@@ -1787,11 +2235,208 @@ pmap_get_cpu_data(void)
 {
        pmap_cpu_data_t * pmap_cpu_data = NULL;
 
+#if XNU_MONITOR
+       uint64_t cpu_id = 0;
+
+       cpu_id = pmap_get_ppl_cpu_id();
+       pmap_cpu_data = &pmap_cpu_data_array[cpu_id].cpu_data;
+
+       if (pmap_cpu_data->cpu_id != cpu_id) {
+               panic("%s: CPU ID mismatch, cpu_id=%#llx, pmap_cpu_data->cpu_id=%#llx",
+                   __FUNCTION__, cpu_id, pmap_cpu_data->cpu_id);
+       }
+#else
        pmap_cpu_data = &getCpuDatap()->cpu_pmap_cpu_data;
+#endif
 
        return pmap_cpu_data;
 }
 
+#if XNU_MONITOR
+/*
+ * pmap_set_range_xprr_perm takes a range (specified using start and end) that
+ * falls within the physical aperture.  All mappings within this range have
+ * their protections changed from those specified by the expected_perm to those
+ * specified by the new_perm.
+ */
+static void
+pmap_set_range_xprr_perm(vm_address_t start,
+    vm_address_t end,
+    unsigned int expected_perm,
+    unsigned int new_perm)
+{
+#if (__ARM_VMSA__ == 7)
+#error This function is not supported on older ARM hardware
+#else
+       pmap_t pmap = NULL;
+
+       vm_address_t va = 0;
+       vm_address_t tte_start = 0;
+       vm_address_t tte_end = 0;
+
+       tt_entry_t *tte_p = NULL;
+       pt_entry_t *pte_p = NULL;
+       pt_entry_t *cpte_p = NULL;
+       pt_entry_t *bpte_p = NULL;
+       pt_entry_t *epte_p = NULL;
+
+       tt_entry_t tte = 0;
+       pt_entry_t cpte = 0;
+       pt_entry_t template = 0;
+
+       pmap = kernel_pmap;
+
+       va = start;
+
+       /*
+        * Validate our arguments; any invalid argument will be grounds for a
+        * panic.
+        */
+       if ((start | end) % ARM_PGBYTES) {
+               panic("%s: start or end not page aligned, "
+                   "start=%p, end=%p, new_perm=%u, expected_perm=%u",
+                   __FUNCTION__,
+                   (void *)start, (void *)end, new_perm, expected_perm);
+       }
+
+       if (start > end) {
+               panic("%s: start > end, "
+                   "start=%p, end=%p, new_perm=%u, expected_perm=%u",
+                   __FUNCTION__,
+                   (void *)start, (void *)end, new_perm, expected_perm);
+       }
+
+       if (start < gVirtBase) {
+               panic("%s: start is before physical aperture, "
+                   "start=%p, end=%p, new_perm=%u, expected_perm=%u",
+                   __FUNCTION__,
+                   (void *)start, (void *)end, new_perm, expected_perm);
+       }
+
+       if (end > static_memory_end) {
+               panic("%s: end is after physical aperture, "
+                   "start=%p, end=%p, new_perm=%u, expected_perm=%u",
+                   __FUNCTION__,
+                   (void *)start, (void *)end, new_perm, expected_perm);
+       }
+
+       if ((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM)) {
+               panic("%s: invalid XPRR index, "
+                   "start=%p, end=%p, new_perm=%u, expected_perm=%u",
+                   __FUNCTION__,
+                   (void *)start, (void *)end, new_perm, expected_perm);
+       }
+
+       /*
+        * Walk over the PTEs for the given range, and set the protections on
+        * those PTEs.
+        */
+       while (va < end) {
+               tte_start = va;
+               tte_end = ((va + pt_attr_twig_size(native_pt_attr)) & ~pt_attr_twig_offmask(native_pt_attr));
+
+               if (tte_end > end) {
+                       tte_end = end;
+               }
+
+               tte_p = pmap_tte(pmap, va);
+
+               /*
+                * The physical aperture should not have holes.
+                * The physical aperture should be contiguous.
+                * Do not make eye contact with the physical aperture.
+                */
+               if (tte_p == NULL) {
+                       panic("%s: physical aperture tte is NULL, "
+                           "start=%p, end=%p, new_perm=%u, expected_perm=%u",
+                           __FUNCTION__,
+                           (void *)start, (void *)end, new_perm, expected_perm);
+               }
+
+               tte = *tte_p;
+
+               if ((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
+                       /*
+                        * Walk over the given L3 page table page and update the
+                        * PTEs.
+                        */
+                       pte_p = (pt_entry_t *)ttetokv(tte);
+                       bpte_p = &pte_p[ptenum(va)];
+                       epte_p = bpte_p + ((tte_end - va) >> pt_attr_leaf_shift(native_pt_attr));
+
+                       for (cpte_p = bpte_p; cpte_p < epte_p;
+                           cpte_p += PAGE_SIZE / ARM_PGBYTES, va += PAGE_SIZE) {
+                               int pai = (int)pa_index(pte_to_pa(*cpte_p));
+                               LOCK_PVH(pai);
+                               cpte = *cpte_p;
+
+                               /*
+                                * Every PTE involved should be valid, should
+                                * not have the hint bit set, and should have
+                                * the expected APRR index.
+                                */
+                               if ((cpte & ARM_PTE_TYPE_MASK) ==
+                                   ARM_PTE_TYPE_FAULT) {
+                                       panic("%s: physical aperture PTE is invalid, va=%p, "
+                                           "start=%p, end=%p, new_perm=%u, expected_perm=%u",
+                                           __FUNCTION__,
+                                           (void *)va,
+                                           (void *)start, (void *)end, new_perm, expected_perm);
+                                       UNLOCK_PVH(pai);
+                                       continue;
+                               }
+
+                               if (cpte & ARM_PTE_HINT_MASK) {
+                                       panic("%s: physical aperture PTE has hint bit set, va=%p, cpte=0x%llx, "
+                                           "start=%p, end=%p, new_perm=%u, expected_perm=%u",
+                                           __FUNCTION__,
+                                           (void *)va, cpte,
+                                           (void *)start, (void *)end, new_perm, expected_perm);
+                               }
+
+                               if (pte_to_xprr_perm(cpte) != expected_perm) {
+                                       panic("%s: perm=%llu does not match expected_perm, cpte=0x%llx, "
+                                           "start=%p, end=%p, new_perm=%u, expected_perm=%u",
+                                           __FUNCTION__,
+                                           pte_to_xprr_perm(cpte), cpte,
+                                           (void *)start, (void *)end, new_perm, expected_perm);
+                               }
+
+                               template = cpte;
+                               template &= ~ARM_PTE_XPRR_MASK;
+                               template |= xprr_perm_to_pte(new_perm);
+
+                               WRITE_PTE_STRONG(cpte_p, template);
+                               UNLOCK_PVH(pai);
+                       }
+               } else {
+                       panic("%s: tte=0x%llx is not a table type entry, "
+                           "start=%p, end=%p, new_perm=%u, expected_perm=%u",
+                           __FUNCTION__,
+                           tte,
+                           (void *)start, (void *)end, new_perm, expected_perm);
+               }
+
+               va = tte_end;
+       }
+
+       PMAP_UPDATE_TLBS(pmap, start, end, false);
+#endif /* (__ARM_VMSA__ == 7) */
+}
+
+/*
+ * A convenience function for setting protections on a single page.
+ */
+static inline void
+pmap_set_xprr_perm(vm_address_t page_kva,
+    unsigned int expected_perm,
+    unsigned int new_perm)
+{
+       pmap_set_range_xprr_perm(page_kva, page_kva + PAGE_SIZE, expected_perm, new_perm);
+}
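+
+/*
+ * These helpers are what actually move physical-aperture mappings between the
+ * kernel and PPL views: pmap_static_allocations_done() applies the range form
+ * (directly and via pa_set_range_xprr_perm()) at bootstrap, while
+ * pmap_mark_page_as_ppl_page_internal() and pmap_mark_page_as_kernel_page()
+ * below use the single-page form as pages are donated to or reclaimed from
+ * the PPL at runtime.
+ */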
+#endif /* XNU_MONITOR */
 
 
 /* TODO */
@@ -1934,6 +2579,203 @@ pmap_pages_reclaim(
        }
 }
 
+#if XNU_MONITOR
+/*
+ * Return a PPL page to the free list.
+ */
+static void
+pmap_give_free_ppl_page(pmap_paddr_t paddr)
+{
+       assert((paddr & ARM_PGMASK) == 0);
+       void ** new_head = (void **)phystokv(paddr);
+       pmap_simple_lock(&pmap_ppl_free_page_lock);
+
+       void * cur_head = pmap_ppl_free_page_list;
+       *new_head = cur_head;
+       pmap_ppl_free_page_list = new_head;
+       pmap_ppl_free_page_count++;
+
+       pmap_simple_unlock(&pmap_ppl_free_page_lock);
+}
+
+/*
+ * Get a PPL page from the free list.
+ */
+static pmap_paddr_t
+pmap_get_free_ppl_page(void)
+{
+       pmap_paddr_t result = 0;
+
+       pmap_simple_lock(&pmap_ppl_free_page_lock);
+
+       if (pmap_ppl_free_page_list != NULL) {
+               void ** new_head = NULL;
+               new_head = *((void**)pmap_ppl_free_page_list);
+               result = kvtophys((vm_offset_t)pmap_ppl_free_page_list);
+               pmap_ppl_free_page_list = new_head;
+               pmap_ppl_free_page_count--;
+       } else {
+               result = 0L;
+       }
+
+       pmap_simple_unlock(&pmap_ppl_free_page_lock);
+       assert((result & ARM_PGMASK) == 0);
+
+       return result;
+}
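+
+/*
+ * The PPL page free list is threaded through the free pages themselves: the
+ * first word of each free page (accessed through its physical-aperture
+ * mapping) holds the kernel VA of the next free page, so no side allocation
+ * is needed.  pmap_ppl_free_page_count tracks the depth for the
+ * PMAP_MIN_FREE_PPL_PAGES watermark used below.
+ */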
+
+/*
+ * pmap_mark_page_as_ppl_page claims a page on behalf of the PPL by marking it
+ * as PPL-owned and only allowing the PPL to write to it.
+ */
+MARK_AS_PMAP_TEXT static void
+pmap_mark_page_as_ppl_page_internal(pmap_paddr_t pa)
+{
+       vm_offset_t kva = 0;
+       unsigned int pai = 0;
+       pp_attr_t attr;
+
+       /*
+        * Mark each page that we allocate as belonging to the monitor, as we
+        * intend to use it for monitor-y stuff (page tables, table pages, that
+        * sort of thing).
+        */
+       assert(!TEST_PAGE_RATIO_4);
+
+       if (!pa_valid(pa)) {
+               panic("%s: bad address, "
+                   "pa=%p",
+                   __func__,
+                   (void *)pa);
+       }
+
+       pai = (unsigned int)pa_index(pa);
+       LOCK_PVH(pai);
+
+       /* A page that the PPL already owns can't be given to the PPL. */
+       if (pa_test_monitor(pa)) {
+               panic("%s: page already belongs to PPL, "
+                   "pa=0x%llx",
+                   __FUNCTION__,
+                   pa);
+       }
+       /* The page cannot be mapped outside of the physical aperture. */
+       if (!pmap_verify_free((ppnum_t)atop(pa))) {
+               panic("%s: page is not free, "
+                   "pa=0x%llx",
+                   __FUNCTION__,
+                   pa);
+       }
+
+       do {
+               attr = pp_attr_table[pai];
+               if (attr & PP_ATTR_NO_MONITOR) {
+                       panic("%s: page excluded from PPL, "
+                           "pa=0x%llx",
+                           __FUNCTION__,
+                           pa);
+               }
+       } while (!OSCompareAndSwap16(attr, attr | PP_ATTR_MONITOR, &pp_attr_table[pai]));
+
+       UNLOCK_PVH(pai);
+
+       kva = phystokv(pa);
+       pmap_set_xprr_perm(kva, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);
+       bzero((void *)(kva & ~PAGE_MASK), PAGE_SIZE);
+
+       pmap_give_free_ppl_page(pa);
+}
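+
+/*
+ * To summarize the handoff above: a page given to the PPL must be free and
+ * must be neither already PPL-owned nor marked PP_ATTR_NO_MONITOR; it then
+ * has PP_ATTR_MONITOR set, has its physical-aperture mapping retyped from
+ * kernel RW to PPL RW, is scrubbed, and is pushed onto the PPL page free
+ * list.
+ */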
+
+static void
+pmap_mark_page_as_ppl_page(pmap_paddr_t pa)
+{
+       pmap_mark_page_as_ppl_page_ppl(pa);
+}
+
+static void
+pmap_mark_page_as_kernel_page(pmap_paddr_t pa)
+{
+       vm_offset_t kva = 0;
+       unsigned int pai = 0;
+
+       pai = (unsigned int)pa_index(pa);
+       LOCK_PVH(pai);
+
+       if (!pa_test_monitor(pa)) {
+               panic("%s: page is not a PPL page, "
+                   "pa=%p",
+                   __FUNCTION__,
+                   (void *)pa);
+       }
+
+       pa_clear_monitor(pa);
+       UNLOCK_PVH(pai);
+
+       kva = phystokv(pa);
+       pmap_set_xprr_perm(kva, XPRR_PPL_RW_PERM, XPRR_KERN_RW_PERM);
+}
+
+MARK_AS_PMAP_TEXT static pmap_paddr_t
+pmap_release_ppl_pages_to_kernel_internal(void)
+{
+       pmap_paddr_t pa = 0;
+
+       if (pmap_ppl_free_page_count <= PMAP_MIN_FREE_PPL_PAGES) {
+               goto done;
+       }
+
+       pa = pmap_get_free_ppl_page();
+
+       if (!pa) {
+               goto done;
+       }
+
+       pmap_mark_page_as_kernel_page(pa);
+
+done:
+       return pa;
+}
+
+static uint64_t
+pmap_release_ppl_pages_to_kernel(void)
+{
+       pmap_paddr_t pa          = 0;
+       vm_page_t    m           = VM_PAGE_NULL;
+       vm_page_t    local_freeq = VM_PAGE_NULL;
+       uint64_t     pmap_ppl_pages_returned_to_kernel_count = 0;
+
+       while (pmap_ppl_free_page_count > PMAP_MIN_FREE_PPL_PAGES) {
+               pa = pmap_release_ppl_pages_to_kernel_ppl();
+
+               if (!pa) {
+                       break;
+               }
+
+               /* If we retrieved a page, add it to the free queue. */
+               vm_object_lock(pmap_object);
+               m = vm_page_lookup(pmap_object, (pa - gPhysBase));
+               assert(m != VM_PAGE_NULL);
+               assert(VM_PAGE_WIRED(m));
+
+               m->vmp_busy = TRUE;
+               m->vmp_snext = local_freeq;
+               local_freeq = m;
+               pmap_ppl_pages_returned_to_kernel_count++;
+               pmap_ppl_pages_returned_to_kernel_count_total++;
+
+               vm_object_unlock(pmap_object);
+       }
+
+       if (local_freeq) {
+               /* We need to hold the object lock for freeing pages. */
+               vm_object_lock(pmap_object);
+               vm_page_free_list(local_freeq, TRUE);
+               vm_object_unlock(pmap_object);
+       }
+
+       return pmap_ppl_pages_returned_to_kernel_count;
+}
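+
+/*
+ * Pages are only released while the PPL free list stays above
+ * PMAP_MIN_FREE_PPL_PAGES; each page handed back is looked up in pmap_object,
+ * collected on a local queue, and returned to the VM free list in a single
+ * batch once the loop finishes.
+ */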
+#endif
 
 static kern_return_t
 pmap_pages_alloc(
@@ -1941,6 +2783,30 @@ pmap_pages_alloc(
        unsigned                size,
        unsigned                option)
 {
+#if XNU_MONITOR
+       if (size != PAGE_SIZE) {
+               panic("%s: size != PAGE_SIZE, "
+                   "pa=%p, size=%u, option=%u",
+                   __FUNCTION__,
+                   pa, size, option);
+       }
+
+       if (option & PMAP_PAGES_RECLAIM_NOWAIT) {
+               *pa = pmap_pages_reclaim();
+               assert(*pa);
+               return KERN_SUCCESS;
+       }
+
+       assert(option & PMAP_PAGES_ALLOCATE_NOWAIT);
+
+       *pa = pmap_get_free_ppl_page();
+
+       if (*pa == 0) {
+               return KERN_RESOURCE_SHORTAGE;
+       } else {
+               return KERN_SUCCESS;
+       }
+#else
        vm_page_t       m = VM_PAGE_NULL, m_prev;
 
        if (option & PMAP_PAGES_RECLAIM_NOWAIT) {
@@ -1979,14 +2845,114 @@ pmap_pages_alloc(
                m = NEXT_PAGE(m_prev);
                *(NEXT_PAGE_PTR(m_prev)) = VM_PAGE_NULL;
        }
-       vm_object_unlock(pmap_object);
-
-       OSAddAtomic(size >> PAGE_SHIFT, &inuse_pmap_pages_count);
-       OSAddAtomic64(size >> PAGE_SHIFT, &alloc_pmap_pages_count);
+       vm_object_unlock(pmap_object);
+
+       OSAddAtomic(size >> PAGE_SHIFT, &inuse_pmap_pages_count);
+       OSAddAtomic64(size >> PAGE_SHIFT, &alloc_pmap_pages_count);
+
+       return KERN_SUCCESS;
+#endif
+}
+
+#if XNU_MONITOR
+static pmap_paddr_t
+pmap_alloc_page_for_kern(void)
+{
+       pmap_paddr_t paddr = 0;
+       vm_page_t    m, m_prev;
+
+       while ((m = vm_page_grab()) == VM_PAGE_NULL) {
+               VM_PAGE_WAIT();
+       }
+
+       vm_page_lock_queues();
+       vm_page_wire(m, VM_KERN_MEMORY_PTE, TRUE);
+       vm_page_unlock_queues();
+
+       paddr = (pmap_paddr_t)ptoa(VM_PAGE_GET_PHYS_PAGE(m));
+
+       if (paddr == 0) {
+               panic("%s: paddr is 0",
+                   __FUNCTION__);
+       }
+
+       vm_object_lock(pmap_object);
+
+       while (m != VM_PAGE_NULL) {
+               vm_page_insert_wired(m, pmap_object, (vm_object_offset_t) ((ptoa(VM_PAGE_GET_PHYS_PAGE(m))) - gPhysBase), VM_KERN_MEMORY_PTE);
+               m_prev = m;
+               m = NEXT_PAGE(m_prev);
+               *(NEXT_PAGE_PTR(m_prev)) = VM_PAGE_NULL;
+       }
+
+       vm_object_unlock(pmap_object);
+
+       OSAddAtomic(1, &inuse_pmap_pages_count);
+       OSAddAtomic64(1, &alloc_pmap_pages_count);
+
+       return paddr;
+}
+
+static void
+pmap_alloc_page_for_ppl(void)
+{
+       pmap_mark_page_as_ppl_page(pmap_alloc_page_for_kern());
+}
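+
+/*
+ * pmap_alloc_page_for_ppl() is the kernel-side donation path: grab and wire a
+ * page, then trap into the PPL to claim it.  The PPL itself never waits for
+ * memory (pmap_pages_alloc() above only honors the NOWAIT options), so the
+ * call sites below (mapping_free_prime(), mapping_replenish(),
+ * pmap_create_options(), pmap_enter_options(), pmap_ledger_alloc()) loop on
+ * KERN_RESOURCE_SHORTAGE, donating one page per iteration until the PPL-side
+ * operation succeeds.
+ */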
+
+static pmap_t
+pmap_alloc_pmap(void)
+{
+       pmap_t pmap = PMAP_NULL;
+
+       pmap_simple_lock(&pmap_free_list_lock);
+
+       if (pmap_free_list != PMAP_NULL) {
+               pmap = pmap_free_list;
+               pmap_free_list = *((pmap_t *)pmap);
+
+               if (!PMAP_PTR_IS_VALID(pmap)) {
+                       panic("%s: allocated pmap is not valid, pmap=%p",
+                           __FUNCTION__, pmap);
+               }
+       }
+
+       pmap_simple_unlock(&pmap_free_list_lock);
+
+       return pmap;
+}
+
+static void
+pmap_free_pmap(pmap_t pmap)
+{
+       if (!PMAP_PTR_IS_VALID(pmap)) {
+               panic("%s: pmap is not valid, "
+                   "pmap=%p",
+                   __FUNCTION__,
+                   pmap);
+       }
 
-       return KERN_SUCCESS;
+       pmap_simple_lock(&pmap_free_list_lock);
+       *((pmap_t *)pmap) = pmap_free_list;
+       pmap_free_list = pmap;
+       pmap_simple_unlock(&pmap_free_list_lock);
 }
 
+static void
+pmap_bootstrap_pmap_free_list(void)
+{
+       pmap_t cur_head = PMAP_NULL;
+       unsigned long i = 0;
+
+       simple_lock_init(&pmap_free_list_lock, 0);
+
+       for (i = 0; i < pmap_array_count; i++) {
+               *((pmap_t *)(&pmap_array[i])) = cur_head;
+               cur_head = &pmap_array[i];
+       }
+
+       pmap_free_list = cur_head;
+}
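+
+/*
+ * With XNU_MONITOR, pmap structures are not zalloc'd: they come out of
+ * pmap_array, a fixed array carved out of bootstrap memory in pmap_bootstrap()
+ * and sized for MAX_ASID entries, with the free list threaded through the
+ * first word of each unused entry.  PMAP_PTR_IS_VALID() presumably rejects
+ * any pointer that does not land on an entry of this array.
+ */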
+#endif
 
 static void
 pmap_pages_free(
@@ -2009,6 +2975,11 @@ pmap_pages_free(
 
        pmap_simple_unlock(&pmap_pages_lock);
 
+#if XNU_MONITOR
+       (void)size;
+
+       pmap_give_free_ppl_page(pa);
+#else
        vm_page_t       m;
        pmap_paddr_t    pa_max;
 
@@ -2024,6 +2995,7 @@ pmap_pages_free(
                vm_page_unlock_queues();
                vm_object_unlock(pmap_object);
        }
+#endif
 }
 
 static inline void
@@ -2237,7 +3209,20 @@ pv_alloc(
                        pmap_paddr_t    pa;
                        kern_return_t   ret;
 
+#if XNU_MONITOR
+                       /*
+                        * The PPL has no guarantee that its allocation
+                        * will succeed, so steal pages if necessary to
+                        * ensure that we can free up a PV allocation.
+                        */
+                       ret = pmap_pages_alloc(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
+
+                       if (ret == KERN_RESOURCE_SHORTAGE) {
+                               ret = pmap_pages_alloc(&pa, PAGE_SIZE, PMAP_PAGES_RECLAIM_NOWAIT);
+                       }
+#else
                        ret = pmap_pages_alloc(&pa, PAGE_SIZE, 0);
+#endif
 
                        if (ret != KERN_SUCCESS) {
                                panic("%s: failed to alloc page, ret=%d, "
@@ -2306,6 +3291,7 @@ static inline void
 PV_ALLOC(pv_entry_t **pv_ep)
 {
        assert(*pv_ep == PV_ENTRY_NULL);
+#if !XNU_MONITOR
        if (pv_kern_free_count < pv_kern_low_water_mark) {
                /*
                 * If the kernel reserved pool is low, let non-kernel mappings wait for a page
@@ -2313,6 +3299,7 @@ PV_ALLOC(pv_entry_t **pv_ep)
                 */
                return;
        }
+#endif
        pmap_simple_lock(&pv_free_list_lock);
 
        if ((*pv_ep = pv_free_list) != 0) {
@@ -2396,7 +3383,27 @@ mapping_free_prime(void)
 {
        kern_return_t kr = KERN_FAILURE;
 
+#if XNU_MONITOR
+       unsigned int i = 0;
+
+       /*
+        * Allocate the needed PPL pages up front, to minimize the chance that
+        * we will need to call into the PPL multiple times.
+        */
+       for (i = 0; i < PV_ALLOC_INITIAL_TARGET; i += (PAGE_SIZE / sizeof(pv_entry_t))) {
+               pmap_alloc_page_for_ppl();
+       }
+
+       for (i = 0; i < PV_KERN_ALLOC_INITIAL_TARGET; i += (PAGE_SIZE / sizeof(pv_entry_t))) {
+               pmap_alloc_page_for_ppl();
+       }
+
+       while ((kr = mapping_free_prime_ppl()) == KERN_RESOURCE_SHORTAGE) {
+               pmap_alloc_page_for_ppl();
+       }
+#else
        kr = mapping_free_prime_internal();
+#endif
 
        if (kr != KERN_SUCCESS) {
                panic("%s: failed, kr=%d",
@@ -2437,8 +3444,14 @@ mapping_replenish_internal(uint32_t kern_target_count, uint32_t user_target_coun
                pv_cnt = 0;
                pv_eh = pv_et = PV_ENTRY_NULL;
 
+#if XNU_MONITOR
+               if ((ret = pmap_pages_alloc(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT)) != KERN_SUCCESS) {
+                       return ret;
+               }
+#else
                ret = pmap_pages_alloc(&pa, PAGE_SIZE, 0);
                assert(ret == KERN_SUCCESS);
+#endif
 
                pv_page_count++;
 
@@ -2475,7 +3488,14 @@ mapping_replenish(void)
        current_thread()->options |= TH_OPT_VMPRIV;
 
        for (;;) {
+#if XNU_MONITOR
+
+               while ((kr = mapping_replenish_ppl(pv_kern_low_water_mark, pv_low_water_mark)) == KERN_RESOURCE_SHORTAGE) {
+                       pmap_alloc_page_for_ppl();
+               }
+#else
                kr = mapping_replenish_internal(pv_kern_low_water_mark, pv_low_water_mark);
+#endif
 
                if (kr != KERN_SUCCESS) {
                        panic("%s: failed, kr=%d", __FUNCTION__, kr);
@@ -2847,9 +3867,111 @@ pmap_pte(
 
 #endif
 
+#if __APRR_SUPPORTED__
+/*
+ * Indicates whether the given PTE has special restrictions due to the current
+ * APRR settings.
+ */
+static boolean_t
+is_pte_aprr_protected(pt_entry_t pte)
+{
+       uint64_t aprr_el0_value;
+       uint64_t aprr_el1_value;
+       uint64_t aprr_index;
+
+       MRS(aprr_el0_value, APRR_EL0);
+       MRS(aprr_el1_value, APRR_EL1);
+       aprr_index = PTE_TO_APRR_INDEX(pte);
+
+       /* Check to see if this mapping had APRR restrictions. */
+       if ((APRR_EXTRACT_IDX_ATTR(aprr_el0_value, aprr_index) != APRR_EXTRACT_IDX_ATTR(APRR_EL0_RESET, aprr_index)) ||
+           (APRR_EXTRACT_IDX_ATTR(aprr_el1_value, aprr_index) != APRR_EXTRACT_IDX_ATTR(APRR_EL1_RESET, aprr_index))
+           ) {
+               return TRUE;
+       }
+
+       return FALSE;
+}
+#endif /* __APRR_SUPPORTED__ */
+
 
+#if __APRR_SUPPORTED__
+static boolean_t
+is_pte_xprr_protected(pt_entry_t pte)
+{
+#if __APRR_SUPPORTED__
+       return is_pte_aprr_protected(pte);
+#else /* __APRR_SUPPORTED__ */
+#error "XPRR configuration error"
+#endif /* __APRR_SUPPORTED__ */
+}
+#endif /* __APRR_SUPPORTED__*/
 
+#if __APRR_SUPPORTED__
+static uint64_t
+__unused pte_to_xprr_perm(pt_entry_t pte)
+{
+#if   __APRR_SUPPORTED__
+       switch (PTE_TO_APRR_INDEX(pte)) {
+       case APRR_FIRM_RX_INDEX:  return XPRR_FIRM_RX_PERM;
+       case APRR_FIRM_RO_INDEX:  return XPRR_FIRM_RO_PERM;
+       case APRR_PPL_RW_INDEX:   return XPRR_PPL_RW_PERM;
+       case APRR_KERN_RW_INDEX:  return XPRR_KERN_RW_PERM;
+       case APRR_FIRM_RW_INDEX:  return XPRR_FIRM_RW_PERM;
+       case APRR_KERN0_RW_INDEX: return XPRR_KERN0_RW_PERM;
+       case APRR_USER_JIT_INDEX: return XPRR_USER_JIT_PERM;
+       case APRR_USER_RW_INDEX:  return XPRR_USER_RW_PERM;
+       case APRR_PPL_RX_INDEX:   return XPRR_PPL_RX_PERM;
+       case APRR_KERN_RX_INDEX:  return XPRR_KERN_RX_PERM;
+       case APRR_PPL_RO_INDEX:   return XPRR_PPL_RO_PERM;
+       case APRR_KERN_RO_INDEX:  return XPRR_KERN_RO_PERM;
+       case APRR_KERN0_RX_INDEX: return XPRR_KERN0_RO_PERM;
+       case APRR_KERN0_RO_INDEX: return XPRR_KERN0_RO_PERM;
+       case APRR_USER_RX_INDEX:  return XPRR_USER_RX_PERM;
+       case APRR_USER_RO_INDEX:  return XPRR_USER_RO_PERM;
+       default:                  return XPRR_MAX_PERM;
+       }
+#else
+#error "XPRR configuration error"
+#endif /**/
+}
 
+#if __APRR_SUPPORTED__
+static uint64_t
+xprr_perm_to_aprr_index(uint64_t perm)
+{
+       switch (perm) {
+       case XPRR_FIRM_RX_PERM:  return APRR_FIRM_RX_INDEX;
+       case XPRR_FIRM_RO_PERM:  return APRR_FIRM_RO_INDEX;
+       case XPRR_PPL_RW_PERM:   return APRR_PPL_RW_INDEX;
+       case XPRR_KERN_RW_PERM:  return APRR_KERN_RW_INDEX;
+       case XPRR_FIRM_RW_PERM:  return APRR_FIRM_RW_INDEX;
+       case XPRR_KERN0_RW_PERM: return APRR_KERN0_RW_INDEX;
+       case XPRR_USER_JIT_PERM: return APRR_USER_JIT_INDEX;
+       case XPRR_USER_RW_PERM:  return APRR_USER_RW_INDEX;
+       case XPRR_PPL_RX_PERM:   return APRR_PPL_RX_INDEX;
+       case XPRR_KERN_RX_PERM:  return APRR_KERN_RX_INDEX;
+       case XPRR_PPL_RO_PERM:   return APRR_PPL_RO_INDEX;
+       case XPRR_KERN_RO_PERM:  return APRR_KERN_RO_INDEX;
+       case XPRR_KERN0_RX_PERM: return APRR_KERN0_RO_INDEX;
+       case XPRR_KERN0_RO_PERM: return APRR_KERN0_RO_INDEX;
+       case XPRR_USER_RX_PERM:  return APRR_USER_RX_INDEX;
+       case XPRR_USER_RO_PERM:  return APRR_USER_RO_INDEX;
+       default:                 return APRR_MAX_INDEX;
+       }
+}
+#endif /* __APRR_SUPPORTED__ */
+
+static pt_entry_t
+__unused xprr_perm_to_pte(uint64_t perm)
+{
+#if   __APRR_SUPPORTED__
+       return APRR_INDEX_TO_PTE(xprr_perm_to_aprr_index(perm));
+#else
+#error "XPRR configuration error"
+#endif /**/
+}
+#endif /* __APRR_SUPPORTED__*/
 
 
 /*
@@ -3279,6 +4401,30 @@ pmap_bootstrap(
 
        lck_grp_init(&pmap_lck_grp, "pmap", LCK_GRP_ATTR_NULL);
 
+#if XNU_MONITOR
+
+#if DEVELOPMENT || DEBUG
+       PE_parse_boot_argn("-unsafe_kernel_text", &pmap_ppl_disable, sizeof(pmap_ppl_disable));
+#endif
+
+       simple_lock_init(&pmap_ppl_free_page_lock, 0);
+
+#if __APRR_SUPPORTED__
+       if (((uintptr_t)(&ppl_trampoline_start)) % PAGE_SIZE) {
+               panic("%s: ppl_trampoline_start is not page aligned, "
+                   "vstart=%#lx",
+                   __FUNCTION__,
+                   vstart);
+       }
+
+       if (((uintptr_t)(&ppl_trampoline_end)) % PAGE_SIZE) {
+               panic("%s: ppl_trampoline_end is not page aligned, "
+                   "vstart=%#lx",
+                   __FUNCTION__,
+                   vstart);
+       }
+#endif /* __APRR_SUPPORTED__ */
+#endif /* XNU_MONITOR */
 
 #if DEVELOPMENT || DEBUG
        if (PE_parse_boot_argn("pmap_trace", &pmap_trace_mask, sizeof(pmap_trace_mask))) {
@@ -3353,6 +4499,28 @@ pmap_bootstrap(
        pmap_load_io_rgns();
        ptd_bootstrap(ptd_root_table, (unsigned int)(ptd_root_table_size / sizeof(pt_desc_t)));
 
+#if XNU_MONITOR
+       pmap_array_begin = (void *)phystokv(avail_start);
+       pmap_array = pmap_array_begin;
+       avail_start += round_page(MAX_ASID * sizeof(struct pmap));
+       pmap_array_end = (void *)phystokv(avail_start);
+
+       pmap_array_count = ((pmap_array_end - pmap_array_begin) / sizeof(struct pmap));
+
+       pmap_bootstrap_pmap_free_list();
+
+       pmap_ledger_ptr_array_begin = (void *)phystokv(avail_start);
+       pmap_ledger_ptr_array = pmap_ledger_ptr_array_begin;
+       avail_start += round_page(MAX_PMAP_LEDGERS * sizeof(void*));
+       pmap_ledger_ptr_array_end = (void *)phystokv(avail_start);
+
+       pmap_ledger_refcnt_begin = (void *)phystokv(avail_start);
+       pmap_ledger_refcnt = pmap_ledger_refcnt_begin;
+       avail_start += round_page(MAX_PMAP_LEDGERS * sizeof(os_refcnt_t));
+       pmap_ledger_refcnt_end = (void *)phystokv(avail_start);
+
+       simple_lock_init(&pmap_ledger_lock, 0);
+#endif
        pmap_cpu_data_array_init();
 
        vm_first_phys = gPhysBase;
@@ -3430,6 +4598,135 @@ pmap_bootstrap(
 #endif /* KASAN */
 }
 
+#if XNU_MONITOR
+
+static inline void
+pa_set_range_monitor(pmap_paddr_t start_pa, pmap_paddr_t end_pa)
+{
+       pmap_paddr_t cur_pa;
+       for (cur_pa = start_pa; cur_pa < end_pa; cur_pa += ARM_PGBYTES) {
+               assert(pa_valid(cur_pa));
+               pa_set_monitor(cur_pa);
+       }
+}
+
+static void
+pa_set_range_xprr_perm(pmap_paddr_t start_pa,
+    pmap_paddr_t end_pa,
+    unsigned int expected_perm,
+    unsigned int new_perm)
+{
+       vm_offset_t start_va = phystokv(start_pa);
+       vm_offset_t end_va = start_va + (end_pa - start_pa);
+
+       pa_set_range_monitor(start_pa, end_pa);
+       pmap_set_range_xprr_perm(start_va, end_va, expected_perm, new_perm);
+}
+
+void
+pmap_static_allocations_done(void)
+{
+       pmap_paddr_t monitor_start_pa;
+       pmap_paddr_t monitor_end_pa;
+
+       /*
+        * We allocate memory for bootstrap starting at topOfKernelData (which
+        * is at the end of the device tree and ramdisk data, if applicable).
+        * We use avail_start as a pointer to the first address that has not
+        * been reserved for bootstrap, so we know which pages to give to the
+        * virtual memory layer.
+        *
+        * These bootstrap allocations will be used primarily for page tables.
+        * If we wish to secure the page tables, we need to start by marking
+        * these bootstrap allocations as pages that we want to protect.
+        */
+       monitor_start_pa = BootArgs->topOfKernelData;
+       monitor_end_pa = BootArgs->topOfKernelData + BOOTSTRAP_TABLE_SIZE;
+
+       /* The bootstrap page tables are mapped RO at bootstrap. */
+       pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RO_PERM, XPRR_PPL_RO_PERM);
+
+       monitor_start_pa = BootArgs->topOfKernelData + BOOTSTRAP_TABLE_SIZE;
+       monitor_end_pa = avail_start;
+
+       /* The other bootstrap allocations are mapped RW at bootstrap. */
+       pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);
+
+       /* The RO page tables are mapped RW at bootstrap. */
+       monitor_start_pa = kvtophys((vm_offset_t)&ropagetable_begin);
+       monitor_end_pa = monitor_start_pa + ((vm_offset_t)&ropagetable_end - (vm_offset_t)&ropagetable_begin);
+       pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);
+
+       monitor_start_pa = kvtophys(segPPLDATAB);
+       monitor_end_pa = monitor_start_pa + segSizePPLDATA;
+
+       /* PPL data is RW for the PPL, RO for the kernel. */
+       pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);
+
+       monitor_start_pa = kvtophys(segPPLTEXTB);
+       monitor_end_pa = monitor_start_pa + segSizePPLTEXT;
+
+       /* PPL text is RX for the PPL, RO for the kernel. */
+       pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RX_PERM, XPRR_PPL_RX_PERM);
+
+#if __APRR_SUPPORTED__
+       monitor_start_pa = kvtophys(segPPLTRAMPB);
+       monitor_end_pa = monitor_start_pa + segSizePPLTRAMP;
+
+       /*
+        * The PPLTRAMP pages will be a mix of PPL RX/kernel RO and
+        * PPL RX/kernel RX.  However, all of these pages belong to the PPL.
+        */
+       pa_set_range_monitor(monitor_start_pa, monitor_end_pa);
+#endif
+
+       /*
+        * In order to support DTrace, the save areas for the PPL must be
+        * writable.  This is due to the fact that DTrace will try to update
+        * register state.
+        */
+       if (pmap_ppl_disable) {
+               vm_offset_t monitor_start_va = phystokv(ppl_cpu_save_area_start);
+               vm_offset_t monitor_end_va = monitor_start_va + (ppl_cpu_save_area_end - ppl_cpu_save_area_start);
+
+               pmap_set_range_xprr_perm(monitor_start_va, monitor_end_va, XPRR_PPL_RW_PERM, XPRR_KERN_RW_PERM);
+       }
+
+#if __APRR_SUPPORTED__
+       /* The trampoline must also be specially protected. */
+       pmap_set_range_xprr_perm((vm_offset_t)&ppl_trampoline_start, (vm_offset_t)&ppl_trampoline_end, XPRR_KERN_RX_PERM, XPRR_PPL_RX_PERM);
+#endif
+
+       if (segSizePPLDATACONST > 0) {
+               monitor_start_pa = kvtophys(segPPLDATACONSTB);
+               monitor_end_pa = monitor_start_pa + segSizePPLDATACONST;
+
+               pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RO_PERM, XPRR_PPL_RO_PERM);
+       }
+
+       /*
+        * Mark the original physical aperture mapping for the PPL stack pages RO as an additional security
+        * precaution.  The real RW mappings are at a different location with guard pages.
+        */
+       pa_set_range_xprr_perm(pmap_stacks_start_pa, pmap_stacks_end_pa, XPRR_PPL_RW_PERM, XPRR_PPL_RO_PERM);
+}
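+
+/*
+ * Roughly, the handoff performed above is: bootstrap page tables
+ * KERN_RO -> PPL_RO; the remaining bootstrap allocations and the RO
+ * page-table region KERN_RW -> PPL_RW; PPL data KERN_RW -> PPL_RW; PPL text
+ * KERN_RX -> PPL_RX; PPLDATACONST (if present) KERN_RO -> PPL_RO; and the
+ * original physical-aperture view of the PPL stacks dropped to PPL_RO.
+ */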
+
+
+void
+pmap_lockdown_ppl(void)
+{
+       /* Mark the PPL as being locked down. */
+
+#if __APRR_SUPPORTED__
+       pmap_ppl_locked_down = TRUE;
+       /* Force a trap into the PPL to update APRR_EL1. */
+       pmap_return(FALSE, FALSE);
+#else
+#error "XPRR configuration error"
+#endif /* __APRR_SUPPORTED__ */
+
+}
+#endif /* XNU_MONITOR */
 
 void
 pmap_virtual_space(
@@ -3644,6 +4941,151 @@ pmap_zone_init(
            PAGE_SIZE, "pmap");
 }
 
+#if XNU_MONITOR
+MARK_AS_PMAP_TEXT static void
+pmap_ledger_alloc_init_internal(size_t size)
+{
+       pmap_simple_lock(&pmap_ledger_lock);
+
+       if (pmap_ledger_alloc_initialized) {
+               panic("%s: already initialized, "
+                   "size=%lu",
+                   __func__,
+                   size);
+       }
+
+       if (size != sizeof(pmap_ledger_data_t)) {
+               panic("%s: size mismatch, expected %lu, "
+                   "size=%lu",
+                   __func__, PMAP_LEDGER_DATA_BYTES,
+                   size);
+       }
+
+       pmap_ledger_alloc_initialized = true;
+
+       pmap_simple_unlock(&pmap_ledger_lock);
+}
+
+MARK_AS_PMAP_TEXT static ledger_t
+pmap_ledger_alloc_internal(void)
+{
+       pmap_paddr_t paddr;
+       uint64_t vaddr, vstart, vend;
+       uint64_t index;
+
+       ledger_t new_ledger;
+       uint64_t array_index;
+
+       pmap_simple_lock(&pmap_ledger_lock);
+       if (pmap_ledger_free_list == NULL) {
+               paddr = pmap_get_free_ppl_page();
+
+               if (paddr == 0) {
+                       pmap_simple_unlock(&pmap_ledger_lock);
+                       return NULL;
+               }
+
+               vstart = phystokv(paddr);
+               vend = vstart + PAGE_SIZE;
+
+               for (vaddr = vstart; (vaddr < vend) && ((vaddr + sizeof(pmap_ledger_t)) <= vend); vaddr += sizeof(pmap_ledger_t)) {
+                       pmap_ledger_t *free_ledger;
+
+                       index = pmap_ledger_ptr_array_free_index++;
+
+                       if (index >= MAX_PMAP_LEDGERS) {
+                               panic("%s: pmap_ledger_ptr_array is full, index=%llu",
+                                   __func__, index);
+                       }
+
+                       free_ledger = (pmap_ledger_t*)vaddr;
+
+                       pmap_ledger_ptr_array[index] = free_ledger;
+                       free_ledger->back_ptr = &pmap_ledger_ptr_array[index];
+
+                       free_ledger->next = pmap_ledger_free_list;
+                       pmap_ledger_free_list = free_ledger;
+               }
+
+               pa_set_range_xprr_perm(paddr, paddr + PAGE_SIZE, XPRR_PPL_RW_PERM, XPRR_KERN_RW_PERM);
+       }
+
+       new_ledger = (ledger_t)pmap_ledger_free_list;
+       pmap_ledger_free_list = pmap_ledger_free_list->next;
+
+       array_index = pmap_ledger_validate(new_ledger);
+       os_ref_init(&pmap_ledger_refcnt[array_index], NULL);
+
+       pmap_simple_unlock(&pmap_ledger_lock);
+
+       return new_ledger;
+}
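+
+/*
+ * Ledgers are carved sizeof(pmap_ledger_t) at a time out of a PPL page whose
+ * physical-aperture mapping is then flipped back to kernel RW, presumably so
+ * the kernel can update ledger counters without trapping into the PPL.  The
+ * PPL still validates any ledger handed to it via pmap_ledger_validate()
+ * (presumably using the back_ptr recorded above) and tracks lifetime through
+ * pmap_ledger_refcnt.
+ */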
+
+MARK_AS_PMAP_TEXT static void
+pmap_ledger_free_internal(ledger_t ledger)
+{
+       pmap_ledger_t* free_ledger;
+
+       free_ledger = (pmap_ledger_t*)ledger;
+
+       pmap_simple_lock(&pmap_ledger_lock);
+       uint64_t array_index = pmap_ledger_validate(ledger);
+
+       if (os_ref_release(&pmap_ledger_refcnt[array_index]) != 0) {
+               panic("%s: ledger still referenced, "
+                   "ledger=%p",
+                   __func__,
+                   ledger);
+       }
+
+       free_ledger->next = pmap_ledger_free_list;
+       pmap_ledger_free_list = free_ledger;
+       pmap_simple_unlock(&pmap_ledger_lock);
+}
+
+
+static void
+pmap_ledger_retain(ledger_t ledger)
+{
+       pmap_simple_lock(&pmap_ledger_lock);
+       uint64_t array_index = pmap_ledger_validate(ledger);
+       os_ref_retain(&pmap_ledger_refcnt[array_index]);
+       pmap_simple_unlock(&pmap_ledger_lock);
+}
+
+static void
+pmap_ledger_release(ledger_t ledger)
+{
+       pmap_simple_lock(&pmap_ledger_lock);
+       uint64_t array_index = pmap_ledger_validate(ledger);
+       os_ref_release_live(&pmap_ledger_refcnt[array_index]);
+       pmap_simple_unlock(&pmap_ledger_lock);
+}
+
+void
+pmap_ledger_alloc_init(size_t size)
+{
+       pmap_ledger_alloc_init_ppl(size);
+}
+
+ledger_t
+pmap_ledger_alloc(void)
+{
+       ledger_t retval = NULL;
+
+       while ((retval = pmap_ledger_alloc_ppl()) == NULL) {
+               pmap_alloc_page_for_ppl();
+       }
+
+       return retval;
+}
+
+void
+pmap_ledger_free(ledger_t ledger)
+{
+       pmap_ledger_free_ppl(ledger);
+}
+#else /* XNU_MONITOR */
 __dead2
 void
 pmap_ledger_alloc_init(size_t size)
@@ -3669,6 +5111,7 @@ pmap_ledger_free(ledger_t ledger)
            "ledger=%p",
            __func__, ledger);
 }
+#endif /* XNU_MONITOR */
 
 /*
  *     Create and return a physical map.
@@ -3703,6 +5146,11 @@ pmap_create_options_internal(
                return PMAP_NULL;
        }
 
+#if XNU_MONITOR
+       if ((p = pmap_alloc_pmap()) == PMAP_NULL) {
+               return PMAP_NULL;
+       }
+#else
        /*
         *      Allocate a pmap struct from the pmap_zone.  Then allocate
         *      the translation table of the right size for the pmap.
@@ -3710,6 +5158,7 @@ pmap_create_options_internal(
        if ((p = (pmap_t) zalloc(pmap_zone)) == PMAP_NULL) {
                return PMAP_NULL;
        }
+#endif
 
        if (flags & PMAP_CREATE_64BIT) {
                p->min = MACH_VM_MIN_ADDRESS;
@@ -3743,6 +5192,12 @@ pmap_create_options_internal(
        }
 
 
+#if XNU_MONITOR
+       if (ledger) {
+               pmap_ledger_validate(ledger);
+               pmap_ledger_retain(ledger);
+       }
+#endif /* XNU_MONITOR */
 
        p->ledger = ledger;
 
@@ -3756,7 +5211,11 @@ pmap_create_options_internal(
        p->tte_index_max = tte_index_max;
 #endif
 
+#if XNU_MONITOR
+       p->tte = pmap_tt1_allocate(p, PMAP_ROOT_ALLOC_SIZE, PMAP_TT_ALLOCATE_NOWAIT);
+#else
        p->tte = pmap_tt1_allocate(p, PMAP_ROOT_ALLOC_SIZE, 0);
+#endif
        if (!(p->tte)) {
                goto tt1_alloc_fail;
        }
@@ -3803,7 +5262,15 @@ pmap_create_options_internal(
 tt1_alloc_fail:
        pmap_get_pt_ops(p)->free_id(p);
 id_alloc_fail:
+#if XNU_MONITOR
+       pmap_free_pmap(p);
+
+       if (ledger) {
+               pmap_ledger_release(ledger);
+       }
+#else
        zfree(pmap_zone, p);
+#endif
        return PMAP_NULL;
 }
 
@@ -3819,7 +5286,17 @@ pmap_create_options(
 
        ledger_reference(ledger);
 
+#if XNU_MONITOR
+       /*
+        * TODO: It should be valid for pmap_create_options_internal to fail; we could
+        * be out of ASIDs.
+        */
+       while ((pmap = pmap_create_options_ppl(ledger, size, flags)) == PMAP_NULL) {
+               pmap_alloc_page_for_ppl();
+       }
+#else
        pmap = pmap_create_options_internal(ledger, size, flags);
+#endif
 
        if (pmap == PMAP_NULL) {
                ledger_dereference(ledger);
@@ -3830,7 +5307,13 @@ pmap_create_options(
        return pmap;
 }
 
-#if MACH_ASSERT
+#if XNU_MONITOR
+/*
+ * This symbol remains in place when the PPL is enabled so that the dispatch
+ * table does not change from development to release configurations.
+ */
+#endif
+#if MACH_ASSERT || XNU_MONITOR
 MARK_AS_PMAP_TEXT static void
 pmap_set_process_internal(
        __unused pmap_t pmap,
@@ -3874,7 +5357,7 @@ pmap_set_process_internal(
        }
 #endif /* MACH_ASSERT */
 }
-#endif /* MACH_ASSERT*/
+#endif /* MACH_ASSERT || XNU_MONITOR */
 
 #if MACH_ASSERT
 void
@@ -3883,7 +5366,11 @@ pmap_set_process(
        int pid,
        char *procname)
 {
+#if XNU_MONITOR
+       pmap_set_process_ppl(pmap, pid, procname);
+#else
        pmap_set_process_internal(pmap, pid, procname);
+#endif
 }
 #endif /* MACH_ASSERT */
 
@@ -3999,10 +5486,22 @@ pmap_destroy_internal(
        pmap_check_ledgers(pmap);
 
        if (pmap->nested_region_asid_bitmap) {
+#if XNU_MONITOR
+               pmap_pages_free(kvtophys((vm_offset_t)(pmap->nested_region_asid_bitmap)), PAGE_SIZE);
+#else
                kfree(pmap->nested_region_asid_bitmap, pmap->nested_region_asid_bitmap_size * sizeof(unsigned int));
+#endif
        }
 
+#if XNU_MONITOR
+       if (pmap->ledger) {
+               pmap_ledger_release(pmap->ledger);
+       }
+
+       pmap_free_pmap(pmap);
+#else
        zfree(pmap_zone, pmap);
+#endif
 }
 
 void
@@ -4015,7 +5514,13 @@ pmap_destroy(
 
        ledger = pmap->ledger;
 
+#if XNU_MONITOR
+       pmap_destroy_ppl(pmap);
+
+       pmap_check_ledger_fields(ledger);
+#else
        pmap_destroy_internal(pmap);
+#endif
 
        ledger_dereference(ledger);
 
@@ -4040,7 +5545,11 @@ void
 pmap_reference(
        pmap_t pmap)
 {
+#if XNU_MONITOR
+       pmap_reference_ppl(pmap);
+#else
        pmap_reference_internal(pmap);
+#endif
 }
 
 static tt_entry_t *
@@ -4084,6 +5593,9 @@ pmap_tt1_allocate(
                return (tt_entry_t *)0;
        }
 
+#if XNU_MONITOR
+       assert(pa);
+#endif
 
        if (size < PAGE_SIZE) {
                va = phystokv(pa) + size;
@@ -4263,6 +5775,9 @@ pmap_tt_allocate(
                *ttp = (tt_entry_t *)phystokv(pa);
        }
 
+#if XNU_MONITOR
+       assert(*ttp);
+#endif
 
        return KERN_SUCCESS;
 }
@@ -4525,6 +6040,11 @@ pmap_remove_pv(
        pv_h = pai_to_pvh(pai);
        vm_offset_t pvh_flags = pvh_get_flags(pv_h);
 
+#if XNU_MONITOR
+       if (pvh_flags & PVH_FLAG_LOCKDOWN) {
+               panic("%d is locked down (%#lx), cannot remove", pai, pvh_flags);
+       }
+#endif
 
        if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
                if (__builtin_expect((cpte != pvh_ptep(pv_h)), 0)) {
@@ -4685,6 +6205,14 @@ pmap_remove_range_options(
                        //assert(!ARM_PTE_IS_COMPRESSED(spte));
                        pa = pte_to_pa(spte);
                        if (!pa_valid(pa)) {
+#if XNU_MONITOR || HAS_MILD_DSB
+                               unsigned int cacheattr = pmap_cache_attributes((ppnum_t)atop(pa));
+#endif
+#if XNU_MONITOR
+                               if (!pmap_ppl_disable && (cacheattr & PP_ATTR_MONITOR)) {
+                                       panic("%s: attempt to remove mapping of PPL-protected I/O address 0x%llx", __func__, (uint64_t)pa);
+                               }
+#endif
                                break;
                        }
                        pai = (int)pa_index(pa);
@@ -4985,7 +6513,13 @@ pmap_remove_options(
                        l = end;
                }
 
+#if XNU_MONITOR
+               remove_count += pmap_remove_options_ppl(pmap, va, l, options);
+
+               pmap_ledger_check_balance(pmap);
+#else
                remove_count += pmap_remove_options_internal(pmap, va, l, options);
+#endif
 
                va = l;
        }
@@ -5099,7 +6633,11 @@ pmap_switch(
        pmap_t pmap)
 {
        PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
+#if XNU_MONITOR
+       pmap_switch_ppl(pmap);
+#else
        pmap_switch_internal(pmap);
+#endif
        PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
 }
 
@@ -5167,6 +6705,11 @@ pmap_page_protect_options_internal(
        pv_h = pai_to_pvh(pai);
        pvh_flags = pvh_get_flags(pv_h);
 
+#if XNU_MONITOR
+       if (remove && (pvh_flags & PVH_FLAG_LOCKDOWN)) {
+               panic("%d is locked down (%#llx), cannot remove", pai, pvh_get_flags(pv_h));
+       }
+#endif
 
        pte_p = PT_ENTRY_NULL;
        pve_p = PV_ENTRY_NULL;
@@ -5194,6 +6737,12 @@ pmap_page_protect_options_internal(
 
 #ifdef PVH_FLAG_IOMMU
                if ((vm_offset_t)pte_p & PVH_FLAG_IOMMU) {
+#if XNU_MONITOR
+                       if (pvh_flags & PVH_FLAG_LOCKDOWN) {
+                               panic("pmap_page_protect: ppnum 0x%x locked down, cannot be owned by iommu 0x%llx, pve_p=%p",
+                                   ppnum, (uint64_t)pte_p & ~PVH_FLAG_IOMMU, pve_p);
+                       }
+#endif
                        if (remove) {
                                if (options & PMAP_OPTIONS_COMPRESSOR) {
                                        panic("pmap_page_protect: attempt to compress ppnum 0x%x owned by iommu 0x%llx, pve_p=%p",
@@ -5397,6 +6946,17 @@ pmap_page_protect_options_internal(
                                tmplate |= pt_attr_leaf_xn(pt_attr);
                        }
 
+#if __APRR_SUPPORTED__
+                       if (__improbable(is_pte_xprr_protected(spte))) {
+                               panic("pmap_page_protect: modifying an xPRR mapping pte_p=%p pmap=%p prot=%d options=%u, pv_h=%p, pveh_p=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx ppnum: 0x%x",
+                                   pte_p, pmap, prot, options, pv_h, pveh_p, pve_p, (uint64_t)spte, (uint64_t)tmplate, (uint64_t)va, ppnum);
+                       }
+
+                       if (__improbable(is_pte_xprr_protected(tmplate))) {
+                               panic("pmap_page_protect: creating an xPRR mapping pte_p=%p pmap=%p prot=%d options=%u, pv_h=%p, pveh_p=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx ppnum: 0x%x",
+                                   pte_p, pmap, prot, options, pv_h, pveh_p, pve_p, (uint64_t)spte, (uint64_t)tmplate, (uint64_t)va, ppnum);
+                       }
+#endif /* __APRR_SUPPORTED__*/
 
                        if (*pte_p != ARM_PTE_TYPE_FAULT &&
                            !ARM_PTE_IS_COMPRESSED(*pte_p, pte_p) &&
@@ -5480,7 +7040,11 @@ pmap_page_protect_options(
 
        PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, ppnum, prot);
 
+#if XNU_MONITOR
+       pmap_page_protect_options_ppl(ppnum, prot, options);
+#else
        pmap_page_protect_options_internal(ppnum, prot, options);
+#endif
 
        PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END);
 }
@@ -5729,6 +7293,18 @@ pmap_protect_options_internal(
                        /* We do not expect to write fast fault the entry. */
                        pte_set_was_writeable(tmplate, false);
 
+#if __APRR_SUPPORTED__
+                       if (__improbable(is_pte_xprr_protected(spte) && (pte_to_xprr_perm(spte) != XPRR_USER_JIT_PERM))) {
+                               /* Only test for PPL protection here; user JIT mappings may be mutated by this function. */
+                               panic("%s: modifying a PPL mapping pte_p=%p pmap=%p prot=%d options=%u, pte=0x%llx, tmplate=0x%llx",
+                                   __func__, pte_p, pmap, prot, options, (uint64_t)spte, (uint64_t)tmplate);
+                       }
+
+                       if (__improbable(is_pte_xprr_protected(tmplate))) {
+                               panic("%s: creating an xPRR mapping pte_p=%p pmap=%p prot=%d options=%u, pte=0x%llx, tmplate=0x%llx",
+                                   __func__, pte_p, pmap, prot, options, (uint64_t)spte, (uint64_t)tmplate);
+                       }
+#endif /* __APRR_SUPPORTED__*/
                        WRITE_PTE_FAST(pte_p, tmplate);
 
                        if (managed) {
@@ -5798,7 +7374,11 @@ pmap_protect_options(
                        l = e;
                }
 
+#if XNU_MONITOR
+               pmap_protect_options_ppl(pmap, beg, l, prot, options, args);
+#else
                pmap_protect_options_internal(pmap, beg, l, prot, options, args);
+#endif
 
                beg = l;
        }
@@ -5979,6 +7559,11 @@ pmap_enter_pv(
 
        vm_offset_t pvh_flags = pvh_get_flags(pv_h);
 
+#if XNU_MONITOR
+       if (pvh_flags & PVH_FLAG_LOCKDOWN) {
+               panic("%d is locked down (%#lx), cannot enter", pai, pvh_flags);
+       }
+#endif
 
 #ifdef PVH_FLAG_CPU
        /* An IOMMU mapping may already be present for a page that hasn't yet
@@ -6384,6 +7969,22 @@ Pmap_enter_loop:
                pte &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
                pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits);
 
+#if XNU_MONITOR
+               /* The regular old kernel is not allowed to remap PPL pages. */
+               if (pa_test_monitor(pa)) {
+                       panic("%s: page belongs to PPL, "
+                           "pmap=%p, v=0x%llx, pn=%u, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
+                           __FUNCTION__,
+                           pmap, v, pn, prot, fault_type, flags, wired, options);
+               }
+
+               if (pvh_get_flags(pai_to_pvh(pai)) & PVH_FLAG_LOCKDOWN) {
+                       panic("%s: page locked down, "
+                           "pmap=%p, v=0x%llx, pn=%u, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
+                           __FUNCTION__,
+                           pmap, v, pn, prot, fault_type, flags, wired, options);
+               }
+#endif
 
 
                if (pte == *pte_p) {
@@ -6483,6 +8084,22 @@ Pmap_enter_loop:
 
                pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits);
 
+#if XNU_MONITOR
+               if (!pmap_ppl_disable && (wimg_bits & PP_ATTR_MONITOR)) {
+                       uint64_t xprr_perm = pte_to_xprr_perm(pte);
+                       pte &= ~ARM_PTE_XPRR_MASK;
+                       switch (xprr_perm) {
+                       case XPRR_KERN_RO_PERM:
+                               pte |= xprr_perm_to_pte(XPRR_PPL_RO_PERM);
+                               break;
+                       case XPRR_KERN_RW_PERM:
+                               pte |= xprr_perm_to_pte(XPRR_PPL_RW_PERM);
+                               break;
+                       default:
+                               panic("Unsupported xPRR perm %llu for pte 0x%llx", xprr_perm, (uint64_t)pte);
+                       }
+               }
+#endif
                pmap_enter_pte(pmap, pte_p, pte, v);
        }
 
@@ -6538,7 +8155,25 @@ pmap_enter_options(
        PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
            VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v), pn, prot);
 
+#if XNU_MONITOR
+       if (options & PMAP_OPTIONS_NOWAIT) {
+               /* If NOWAIT was requested, just return the result. */
+               kr = pmap_enter_options_ppl(pmap, v, pn, prot, fault_type, flags, wired, options);
+       } else {
+               /*
+                * If NOWAIT was not requested, loop until the enter does not
+                * fail due to lack of resources.
+                */
+               while ((kr = pmap_enter_options_ppl(pmap, v, pn, prot, fault_type, flags, wired, options | PMAP_OPTIONS_NOWAIT)) == KERN_RESOURCE_SHORTAGE) {
+                       pv_water_mark_check();
+                       pmap_alloc_page_for_ppl();
+               }
+       }
+
+       pmap_ledger_check_balance(pmap);
+#else
        kr = pmap_enter_options_internal(pmap, v, pn, prot, fault_type, flags, wired, options);
+#endif
        pv_water_mark_check();
 
        PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr);
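
Editor's note: the XNU_MONITOR branch above shows a retry idiom that recurs later in this diff (pmap_nest, pmap_insert_sharedpage): when a PPL entry point fails with KERN_RESOURCE_SHORTAGE, the kernel donates a page to the PPL and retries. A minimal sketch of that idiom, factored as a hypothetical helper purely for illustration (the helper name is invented; pmap_enter_options_ppl, pv_water_mark_check and pmap_alloc_page_for_ppl are the calls shown above):

    /* Hypothetical wrapper: keep feeding the PPL pages until the guarded
     * call stops failing for lack of resources, mirroring the loop above. */
    static kern_return_t
    pmap_enter_with_ppl_refill(pmap_t pmap, vm_map_address_t v, ppnum_t pn,
        vm_prot_t prot, vm_prot_t fault_type, unsigned int flags,
        boolean_t wired, unsigned int options)
    {
            kern_return_t kr;

            while ((kr = pmap_enter_options_ppl(pmap, v, pn, prot, fault_type,
                flags, wired, options | PMAP_OPTIONS_NOWAIT)) == KERN_RESOURCE_SHORTAGE) {
                    pv_water_mark_check();       /* keep the PV free list topped up */
                    pmap_alloc_page_for_ppl();   /* donate a kernel page for the PPL to use */
            }
            return kr;
    }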
@@ -6614,7 +8249,13 @@ pmap_change_wiring(
        vm_map_address_t v,
        boolean_t wired)
 {
+#if XNU_MONITOR
+       pmap_change_wiring_ppl(pmap, v, wired);
+
+       pmap_ledger_check_balance(pmap);
+#else
        pmap_change_wiring_internal(pmap, v, wired);
+#endif
 }
 
 MARK_AS_PMAP_TEXT static ppnum_t
@@ -6657,7 +8298,11 @@ pmap_find_phys(
        }
 
        if (not_in_kdp) {
+#if XNU_MONITOR
+               return pmap_find_phys_ppl(pmap, va);
+#else
                return pmap_find_phys_internal(pmap, va);
+#endif
        } else {
                return pmap_vtophys(pmap, va);
        }
@@ -6804,7 +8449,11 @@ pmap_extract(
                return pa;
        }
 
+#if XNU_MONITOR
+       return pmap_extract_ppl(pmap, va);
+#else
        return pmap_extract_internal(pmap, va);
+#endif
 }
 
 /*
@@ -6986,7 +8635,14 @@ pmap_expand(
                                if (options & PMAP_OPTIONS_NOWAIT) {
                                        return KERN_RESOURCE_SHORTAGE;
                                }
+#if XNU_MONITOR
+                               panic("%s: failed to allocate tt, "
+                                   "pmap=%p, v=%p, options=0x%x, level=%u",
+                                   __FUNCTION__,
+                                   pmap, (void *)v, options, level);
+#else
                                VM_PAGE_WAIT();
+#endif
                        }
                        PMAP_LOCK(pmap);
                        if ((pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL)) {
@@ -7051,6 +8707,13 @@ void
 pmap_gc(
        void)
 {
+#if XNU_MONITOR
+       /*
+        * We cannot invoke the scheduler from the PPL, so for now we elide the
+        * GC logic if the PPL is enabled.
+        */
+#endif
+#if !XNU_MONITOR
        pmap_t  pmap, pmap_next;
        boolean_t       gc_wait;
 
@@ -7085,6 +8748,7 @@ pmap_gc(
                }
                pmap_simple_unlock(&pmaps_lock);
        }
+#endif
 }
 
 /*
@@ -7093,7 +8757,11 @@ pmap_gc(
 uint64_t
 pmap_release_pages_fast(void)
 {
+#if XNU_MONITOR
+       return pmap_release_ppl_pages_to_kernel();
+#else /* XNU_MONITOR */
        return 0;
+#endif
 }
 
 /*
@@ -7227,6 +8895,14 @@ phys_attribute_clear_internal(
        pmap_paddr_t    pa = ptoa(pn);
        vm_prot_t       allow_mode = VM_PROT_ALL;
 
+#if XNU_MONITOR
+       if (bits & PP_ATTR_PPL_OWNED_BITS) {
+               panic("%s: illegal request, "
+                   "pn=%u, bits=%#x, options=%#x, arg=%p",
+                   __FUNCTION__,
+                   pn, bits, options, arg);
+       }
+#endif
 
        if ((bits & PP_ATTR_MODIFIED) &&
            (options & PMAP_OPTIONS_NOFLUSH) &&
@@ -7288,7 +8964,11 @@ phys_attribute_clear(
         */
        PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits);
 
+#if XNU_MONITOR
+       phys_attribute_clear_ppl(pn, bits, options, arg);
+#else
        phys_attribute_clear_internal(pn, bits, options, arg);
+#endif
 
        PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END);
 }
@@ -7308,6 +8988,14 @@ phys_attribute_set_internal(
        pmap_paddr_t    pa = ptoa(pn);
        assert(pn != vm_page_fictitious_addr);
 
+#if XNU_MONITOR
+       if (bits & PP_ATTR_PPL_OWNED_BITS) {
+               panic("%s: illegal request, "
+                   "pn=%u, bits=%#x",
+                   __FUNCTION__,
+                   pn, bits);
+       }
+#endif
 
        pa_set_bits(pa, bits);
 
@@ -7319,7 +9007,11 @@ phys_attribute_set(
        ppnum_t pn,
        unsigned int bits)
 {
+#if XNU_MONITOR
+       phys_attribute_set_ppl(pn, bits);
+#else
        phys_attribute_set_internal(pn, bits);
+#endif
 }
 
 
@@ -7572,10 +9264,19 @@ pmap_clear_noencrypt(
 #endif
 }
 
+#if XNU_MONITOR
+boolean_t
+pmap_is_monitor(ppnum_t pn)
+{
+       assert(pa_valid(ptoa(pn)));
+       return phys_attribute_test(pn, PP_ATTR_MONITOR);
+}
+#endif
 
 void
 pmap_lock_phys_page(ppnum_t pn)
 {
+#if !XNU_MONITOR
        int             pai;
        pmap_paddr_t    phys = ptoa(pn);
 
@@ -7583,6 +9284,9 @@ pmap_lock_phys_page(ppnum_t pn)
                pai = (int)pa_index(phys);
                LOCK_PVH(pai);
        } else
+#else
+       (void)pn;
+#endif
        { simple_lock(&phys_backup_lock, LCK_GRP_NULL);}
 }
 
@@ -7590,6 +9294,7 @@ pmap_lock_phys_page(ppnum_t pn)
 void
 pmap_unlock_phys_page(ppnum_t pn)
 {
+#if !XNU_MONITOR
        int             pai;
        pmap_paddr_t    phys = ptoa(pn);
 
@@ -7597,6 +9302,9 @@ pmap_unlock_phys_page(ppnum_t pn)
                pai = (int)pa_index(phys);
                UNLOCK_PVH(pai);
        } else
+#else
+       (void)pn;
+#endif
        { simple_unlock(&phys_backup_lock);}
 }
 
@@ -7683,7 +9391,11 @@ pmap_switch_user_ttb(
        pmap_t pmap)
 {
        PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH_USER_TTB) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
+#if XNU_MONITOR
+       pmap_switch_user_ttb_ppl(pmap);
+#else
        pmap_switch_user_ttb_internal(pmap);
+#endif
        PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH_USER_TTB) | DBG_FUNC_END);
 }
 
@@ -7700,7 +9412,11 @@ pmap_clear_user_ttb_internal(void)
 void
 pmap_clear_user_ttb(void)
 {
+#if XNU_MONITOR
+       pmap_clear_user_ttb_ppl();
+#else
        pmap_clear_user_ttb_internal();
+#endif
 }
 
 /*
@@ -7817,6 +9533,16 @@ arm_force_fast_fault_internal(
                        }
                }
 
+#if MACH_ASSERT && XNU_MONITOR
+               if (is_pte_xprr_protected(spte)) {
+                       if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
+                               panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
+                                   "ppnum=0x%x, options=0x%x, allow_mode=0x%x",
+                                   __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
+                                   ppnum, options, allow_mode);
+                       }
+               }
+#endif /* MACH_ASSERT && XNU_MONITOR */
 
                if (update_pte) {
                        if (*pte_p != ARM_PTE_TYPE_FAULT &&
@@ -7928,7 +9654,11 @@ arm_force_fast_fault(
                return FALSE;   /* Not a managed page. */
        }
 
+#if XNU_MONITOR
+       return arm_force_fast_fault_ppl(ppnum, allow_mode, options);
+#else
        return arm_force_fast_fault_internal(ppnum, allow_mode, options);
+#endif
 }
 
 /*
@@ -8021,6 +9751,16 @@ arm_clear_fast_fault(
                        }
                }
 
+#if MACH_ASSERT && XNU_MONITOR
+               if (is_pte_xprr_protected(spte)) {
+                       if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
+                               panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
+                                   "ppnum=0x%x, fault_type=0x%x",
+                                   __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
+                                   ppnum, fault_type);
+                       }
+               }
+#endif /* MACH_ASSERT && XNU_MONITOR */
 
                if (spte != tmplate) {
                        if (spte != ARM_PTE_TYPE_FAULT) {
@@ -8099,17 +9839,51 @@ arm_fast_fault_internal(
 
                        if (!pa_valid(pa)) {
                                PMAP_UNLOCK(pmap);
+#if XNU_MONITOR
+                               if (pmap_cache_attributes((ppnum_t)atop(pa)) & PP_ATTR_MONITOR) {
+                                       return KERN_PROTECTION_FAILURE;
+                               } else
+#endif
                                return result;
                        }
                        pai = (int)pa_index(pa);
                        LOCK_PVH(pai);
+#if __APRR_SUPPORTED__
+                       if (*ptep == spte) {
+                               /*
+                                * Double-check the spte value, as we care
+                                * about the AF bit.
+                                */
+                               break;
+                       }
+                       UNLOCK_PVH(pai);
+#else /* !__APRR_SUPPORTED__ */
                        break;
+#endif /* !__APRR_SUPPORTED__ */
                }
        } else {
                PMAP_UNLOCK(pmap);
                return result;
        }
 
+#if __APRR_SUPPORTED__
+       /* Check to see if this mapping had APRR restrictions. */
+       if (is_pte_xprr_protected(spte)) {
+               /*
+                * We have faulted on an XPRR managed mapping; decide if the access should be
+                * reattempted or if it should cause an exception. Now that all JIT-entitled
+                * task threads always have MPRR enabled, we're only here because of
+                * an AF fault or an actual permission fault. AF faults will have result
+                * changed to KERN_SUCCESS below upon arm_clear_fast_fault return.
+                */
+               if (was_af_fault && (spte & ARM_PTE_AF)) {
+                       result = KERN_SUCCESS;
+                       goto out;
+               } else {
+                       result = KERN_PROTECTION_FAILURE;
+               }
+       }
+#endif /* __APRR_SUPPORTED__ */
 
        if ((IS_REFFAULT_PAGE(pai)) ||
            ((fault_type & VM_PROT_WRITE) && IS_MODFAULT_PAGE(pai))) {
@@ -8140,6 +9914,9 @@ arm_fast_fault_internal(
                }
        }
 
+#if __APRR_SUPPORTED__
+out:
+#endif /* __APRR_SUPPORTED__ */
        UNLOCK_PVH(pai);
        PMAP_UNLOCK(pmap);
        return result;
@@ -8182,7 +9959,11 @@ arm_fast_fault(
        }
 #endif
 
+#if XNU_MONITOR
+       result = arm_fast_fault_ppl(pmap, va, fault_type, was_af_fault, from_user);
+#else
        result = arm_fast_fault_internal(pmap, va, fault_type, was_af_fault, from_user);
+#endif
 
 #if (__ARM_VMSA__ == 7)
 done:
@@ -8304,7 +10085,27 @@ pmap_map_cpu_windows_copy_internal(
        vm_offset_t     cpu_copywindow_vaddr = 0;
        bool            need_strong_sync = false;
 
+#if XNU_MONITOR || HAS_MILD_DSB
+       unsigned int    cacheattr = (!pa_valid(ptoa(pn)) ? pmap_cache_attributes(pn) : 0);
+       need_strong_sync = ((cacheattr & PMAP_IO_RANGE_STRONG_SYNC) != 0);
+#endif
+
+#if XNU_MONITOR
+#ifdef  __ARM_COHERENT_IO__
+       if (pa_valid(ptoa(pn)) && !pmap_ppl_disable) {
+               panic("%s: attempted to map a managed page, "
+                   "pn=%u, prot=0x%x, wimg_bits=0x%x",
+                   __FUNCTION__,
+                   pn, prot, wimg_bits);
+       }
+       if (!pmap_ppl_disable && (cacheattr & PP_ATTR_MONITOR)) {
+               panic("%s: attempt to map PPL-protected I/O address 0x%llx", __func__, (uint64_t)ptoa(pn));
+       }
 
+#else /* __ARM_COHERENT_IO__ */
+#error CPU copy windows are not properly supported with both the PPL and incoherent IO
+#endif /* __ARM_COHERENT_IO__ */
+#endif /* XNU_MONITOR */
        cpu_num = pmap_cpu_data->cpu_number;
 
        for (i = 0; i < CPUWINDOWS_MAX; i++) {
@@ -8350,7 +10151,11 @@ pmap_map_cpu_windows_copy(
        vm_prot_t prot,
        unsigned int wimg_bits)
 {
+#if XNU_MONITOR
+       return pmap_map_cpu_windows_copy_ppl(pn, prot, wimg_bits);
+#else
        return pmap_map_cpu_windows_copy_internal(pn, prot, wimg_bits);
+#endif
 }
 
 MARK_AS_PMAP_TEXT static void
@@ -8378,7 +10183,11 @@ void
 pmap_unmap_cpu_windows_copy(
        unsigned int index)
 {
+#if XNU_MONITOR
+       return pmap_unmap_cpu_windows_copy_ppl(index);
+#else
        return pmap_unmap_cpu_windows_copy_internal(index);
+#endif
 }
 
 /*
@@ -8398,7 +10207,11 @@ void
 pmap_set_nested(
        pmap_t pmap)
 {
+#if XNU_MONITOR
+       pmap_set_nested_ppl(pmap);
+#else
        pmap_set_nested_internal(pmap);
+#endif
 }
 
 /*
@@ -8727,9 +10540,72 @@ pmap_trim(
        addr64_t nstart,
        uint64_t size)
 {
+#if XNU_MONITOR
+       pmap_trim_ppl(grand, subord, vstart, nstart, size);
+
+       pmap_ledger_check_balance(grand);
+       pmap_ledger_check_balance(subord);
+#else
        pmap_trim_internal(grand, subord, vstart, nstart, size);
+#endif
 }
 
+#if HAS_APPLE_PAC && XNU_MONITOR
+static void *
+pmap_sign_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator)
+{
+       void *res = NULL;
+       boolean_t current_intr_state = ml_set_interrupts_enabled(FALSE);
+
+       ml_set_kernelkey_enabled(FALSE);
+       switch (key) {
+       case ptrauth_key_asia:
+               res = ptrauth_sign_unauthenticated(value, ptrauth_key_asia, discriminator);
+               break;
+       case ptrauth_key_asda:
+               res = ptrauth_sign_unauthenticated(value, ptrauth_key_asda, discriminator);
+               break;
+       default:
+               panic("attempt to sign user pointer without process independent key");
+       }
+       ml_set_kernelkey_enabled(TRUE);
+
+       ml_set_interrupts_enabled(current_intr_state);
+
+       return res;
+}
+
+void *
+pmap_sign_user_ptr(void *value, ptrauth_key key, uint64_t discriminator)
+{
+       return pmap_sign_user_ptr_internal(value, key, discriminator);
+}
+
+static void *
+pmap_auth_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator)
+{
+       if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
+               panic("attempt to auth user pointer without process independent key");
+       }
+
+       void *res = NULL;
+       boolean_t current_intr_state = ml_set_interrupts_enabled(FALSE);
+
+       ml_set_kernelkey_enabled(FALSE);
+       res = ml_auth_ptr_unchecked(value, key, discriminator);
+       ml_set_kernelkey_enabled(TRUE);
+
+       ml_set_interrupts_enabled(current_intr_state);
+
+       return res;
+}
+
+void *
+pmap_auth_user_ptr(void *value, ptrauth_key key, uint64_t discriminator)
+{
+       return pmap_auth_user_ptr_internal(value, key, discriminator);
+}
+#endif /* HAS_APPLE_PAC && XNU_MONITOR */
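
Editor's note: a hedged usage sketch for the two entry points above. The calling routine, the pointer being signed, and the discriminator string are illustrative, not taken from this diff; ptrauth_string_discriminator() from <ptrauth.h> is assumed to be available for deriving a constant discriminator.

    /* Illustrative only: sign a user-visible pointer with a process-independent
     * key, then authenticate it before handing it back out. */
    static void *
    sign_and_check_example(void *user_ptr)
    {
            /* Requires <ptrauth.h>; the string is a made-up example. */
            uint64_t disc = ptrauth_string_discriminator("example.handler");

            void *signed_ptr = pmap_sign_user_ptr(user_ptr, ptrauth_key_asia, disc);
            /* ... signed_ptr is stored somewhere user-visible ... */
            return pmap_auth_user_ptr(signed_ptr, ptrauth_key_asia, disc);
    }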
 
 /*
  *     kern_return_t pmap_nest(grand, subord, vstart, size)
@@ -8776,6 +10652,9 @@ pmap_nest_internal(
        __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
        assert(pmap_get_pt_attr(subord) == pt_attr);
 
+#if XNU_MONITOR
+       expand_options |= PMAP_TT_ALLOCATE_NOWAIT;
+#endif
 
        if (((size | vstart | nstart) & (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL) {
                panic("pmap_nest() pmap %p unaligned nesting request 0x%llx, 0x%llx, 0x%llx\n", grand, vstart, nstart, size);
@@ -8792,7 +10671,29 @@ pmap_nest_internal(
        if (subord->nested_region_asid_bitmap == NULL) {
                nested_region_asid_bitmap_size  = (unsigned int)(size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY);
 
+#if XNU_MONITOR
+               pmap_paddr_t pa = 0;
+
+               if ((nested_region_asid_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE) {
+                       panic("%s: nested_region_asid_bitmap_size=%u will not fit in a page, "
+                           "grand=%p, subord=%p, vstart=0x%llx, nstart=0x%llx, size=%llx",
+                           __FUNCTION__,
+                           nested_region_asid_bitmap_size,
+                           grand, subord, vstart, nstart, size);
+               }
+
+               kr = pmap_pages_alloc(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
+
+               if (kr != KERN_SUCCESS) {
+                       return kr;
+               }
+
+               assert(pa);
+
+               nested_region_asid_bitmap = (unsigned int *)phystokv(pa);
+#else
                nested_region_asid_bitmap = kalloc(nested_region_asid_bitmap_size * sizeof(unsigned int));
+#endif
                bzero(nested_region_asid_bitmap, nested_region_asid_bitmap_size * sizeof(unsigned int));
 
                PMAP_LOCK(subord);
@@ -8805,7 +10706,11 @@ pmap_nest_internal(
                }
                PMAP_UNLOCK(subord);
                if (nested_region_asid_bitmap != NULL) {
+#if XNU_MONITOR
+                       pmap_pages_free(kvtophys((vm_offset_t)nested_region_asid_bitmap), PAGE_SIZE);
+#else
                        kfree(nested_region_asid_bitmap, nested_region_asid_bitmap_size * sizeof(unsigned int));
+#endif
                }
        }
        if ((subord->nested_region_subord_addr + subord->nested_region_size) < nend) {
@@ -8820,7 +10725,29 @@ pmap_nest_internal(
                /* We explicitly add 1 to the bitmap allocation size in order to avoid issues with truncation. */
                new_nested_region_asid_bitmap_size  = (unsigned int)((new_size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY)) + 1;
 
+#if XNU_MONITOR
+               pmap_paddr_t pa = 0;
+
+               if ((new_nested_region_asid_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE) {
+                       panic("%s: new_nested_region_asid_bitmap_size=%u will not fit in a page, "
+                           "grand=%p, subord=%p, vstart=0x%llx, nstart=0x%llx, size=%llx",
+                           __FUNCTION__,
+                           new_nested_region_asid_bitmap_size,
+                           grand, subord, vstart, nstart, size);
+               }
+
+               kr = pmap_pages_alloc(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
+
+               if (kr != KERN_SUCCESS) {
+                       return kr;
+               }
+
+               assert(pa);
+
+               new_nested_region_asid_bitmap = (unsigned int *)phystokv(pa);
+#else
                new_nested_region_asid_bitmap = kalloc(new_nested_region_asid_bitmap_size * sizeof(unsigned int));
+#endif
                PMAP_LOCK(subord);
                if (subord->nested_region_size < new_size) {
                        bzero(new_nested_region_asid_bitmap, new_nested_region_asid_bitmap_size * sizeof(unsigned int));
@@ -8834,9 +10761,17 @@ pmap_nest_internal(
                }
                PMAP_UNLOCK(subord);
                if (nested_region_asid_bitmap != NULL)
+#if XNU_MONITOR
+               {pmap_pages_free(kvtophys((vm_offset_t)nested_region_asid_bitmap), PAGE_SIZE);}
+#else
                { kfree(nested_region_asid_bitmap, nested_region_asid_bitmap_size * sizeof(unsigned int));}
+#endif
                if (new_nested_region_asid_bitmap != NULL)
+#if XNU_MONITOR
+               {pmap_pages_free(kvtophys((vm_offset_t)new_nested_region_asid_bitmap), PAGE_SIZE);}
+#else
                { kfree(new_nested_region_asid_bitmap, new_nested_region_asid_bitmap_size * sizeof(unsigned int));}
+#endif
        }
 
        PMAP_LOCK(subord);
@@ -9016,7 +10951,16 @@ pmap_nest(
            VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord),
            VM_KERNEL_ADDRHIDE(vstart));
 
+#if XNU_MONITOR
+       while ((kr = pmap_nest_ppl(grand, subord, vstart, nstart, size)) == KERN_RESOURCE_SHORTAGE) {
+               pmap_alloc_page_for_ppl();
+       }
+
+       pmap_ledger_check_balance(grand);
+       pmap_ledger_check_balance(subord);
+#else
        kr = pmap_nest_internal(grand, subord, vstart, nstart, size);
+#endif
 
        PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, kr);
 
@@ -9197,7 +11141,11 @@ pmap_unnest_options(
        PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
            VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(vaddr));
 
+#if XNU_MONITOR
+       kr = pmap_unnest_options_ppl(grand, vaddr, size, option);
+#else
        kr = pmap_unnest_options_internal(grand, vaddr, size, option);
+#endif
 
        PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, kr);
 
@@ -9471,6 +11419,11 @@ pmap_update_compressor_page_internal(ppnum_t pn, unsigned int prev_cacheattr, un
 
        LOCK_PVH(pai);
 
+#if XNU_MONITOR
+       if (__improbable(pa_test_monitor(paddr))) {
+               panic("%s invoked on PPL page 0x%08x", __func__, pn);
+       }
+#endif
 
        pmap_update_cache_attributes_locked(pn, new_cacheattr);
 
@@ -9485,7 +11438,11 @@ pmap_map_compressor_page(ppnum_t pn)
 #if __ARM_PTE_PHYSMAP__
        unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
        if (cacheattr != VM_WIMG_DEFAULT) {
+#if XNU_MONITOR
+               pmap_update_compressor_page_ppl(pn, cacheattr, VM_WIMG_DEFAULT);
+#else
                pmap_update_compressor_page_internal(pn, cacheattr, VM_WIMG_DEFAULT);
+#endif
        }
 #endif
        return (void*)phystokv(ptoa(pn));
@@ -9497,7 +11454,11 @@ pmap_unmap_compressor_page(ppnum_t pn __unused, void *kva __unused)
 #if __ARM_PTE_PHYSMAP__
        unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
        if (cacheattr != VM_WIMG_DEFAULT) {
+#if XNU_MONITOR
+               pmap_update_compressor_page_ppl(pn, VM_WIMG_DEFAULT, cacheattr);
+#else
                pmap_update_compressor_page_internal(pn, VM_WIMG_DEFAULT, cacheattr);
+#endif
        }
 #endif
 }
@@ -9540,6 +11501,11 @@ pmap_batch_set_cache_attributes_internal(
 
        if (doit) {
                LOCK_PVH(pai);
+#if XNU_MONITOR
+               if (pa_test_monitor(paddr)) {
+                       panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
+               }
+#endif
        }
 
        do {
@@ -9611,7 +11577,11 @@ pmap_batch_set_cache_attributes(
        boolean_t doit,
        unsigned int *res)
 {
+#if XNU_MONITOR
+       return pmap_batch_set_cache_attributes_ppl(pn, cacheattr, page_cnt, page_index, doit, res);
+#else
        return pmap_batch_set_cache_attributes_internal(pn, cacheattr, page_cnt, page_index, doit, res);
+#endif
 }
 
 MARK_AS_PMAP_TEXT static void
@@ -9640,6 +11610,13 @@ pmap_set_cache_attributes_priv(
 
        LOCK_PVH(pai);
 
+#if XNU_MONITOR
+       if (external && pa_test_monitor(paddr)) {
+               panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
+       } else if (!external && !pa_test_monitor(paddr)) {
+               panic("%s invoked on non-PPL page 0x%llx", __func__, (uint64_t)paddr);
+       }
+#endif
 
        do {
                pp_attr_current = pp_attr_table[pai];
@@ -9681,7 +11658,11 @@ pmap_set_cache_attributes(
        ppnum_t pn,
        unsigned int cacheattr)
 {
+#if XNU_MONITOR
+       pmap_set_cache_attributes_ppl(pn, cacheattr);
+#else
        pmap_set_cache_attributes_internal(pn, cacheattr);
+#endif
 }
 
 MARK_AS_PMAP_TEXT void
@@ -9705,7 +11686,11 @@ pmap_update_cache_attributes_locked(
 
        tmplate = *pte_p;
        tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
+#if XNU_MONITOR
+       tmplate |= (wimg_to_pte(attributes) & ~ARM_PTE_XPRR_MASK);
+#else
        tmplate |= wimg_to_pte(attributes);
+#endif
 #if (__ARM_VMSA__ > 7)
        if (tmplate & ARM_PTE_HINT_MASK) {
                panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
@@ -9817,8 +11802,13 @@ pmap_create_sharedpage(
        kern_return_t   kr;
        pmap_paddr_t    pa = 0;
 
+#if XNU_MONITOR
+       pa = pmap_alloc_page_for_kern();
+       assert(pa);
+#else
 
        (void) pmap_pages_alloc(&pa, PAGE_SIZE, 0);
+#endif
 
        memset((char *) phystokv(pa), 0, PAGE_SIZE);
 
@@ -9895,6 +11885,9 @@ pmap_insert_sharedpage_internal(
        int options = 0;
 
        VALIDATE_PMAP(pmap);
+#if XNU_MONITOR
+       options |= PMAP_OPTIONS_NOWAIT;
+#endif /* XNU_MONITOR */
 
 #if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
 #error We assume a single page.
@@ -9938,6 +11931,11 @@ pmap_insert_sharedpage_internal(
                kr = pmap_expand(pmap, sharedpage_vaddr, options, PMAP_TT_L2_LEVEL);
 
                if (kr != KERN_SUCCESS) {
+#if XNU_MONITOR
+                       if (kr == KERN_RESOURCE_SHORTAGE) {
+                               return kr;
+                       } else
+#endif
                        {
                                panic("Failed to pmap_expand for commpage, pmap=%p", pmap);
                        }
@@ -10029,7 +12027,24 @@ void
 pmap_insert_sharedpage(
        pmap_t pmap)
 {
+#if XNU_MONITOR
+       kern_return_t kr = KERN_FAILURE;
+
+       while ((kr = pmap_insert_sharedpage_ppl(pmap)) == KERN_RESOURCE_SHORTAGE) {
+               pmap_alloc_page_for_ppl();
+       }
+
+       pmap_ledger_check_balance(pmap);
+
+       if (kr != KERN_SUCCESS) {
+               panic("%s: failed to insert the shared page, kr=%d, "
+                   "pmap=%p",
+                   __FUNCTION__, kr,
+                   pmap);
+       }
+#else
        pmap_insert_sharedpage_internal(pmap);
+#endif
 }
 
 static boolean_t
@@ -10139,7 +12154,11 @@ pmap_is_empty(
        vm_map_offset_t va_start,
        vm_map_offset_t va_end)
 {
+#if XNU_MONITOR
+       return pmap_is_empty_ppl(pmap, va_start, va_end);
+#else
        return pmap_is_empty_internal(pmap, va_start, va_end);
+#endif
 }
 
 vm_map_offset_t
@@ -10265,6 +12284,124 @@ pmap_flush(
        return;
 }
 
+#if XNU_MONITOR
+
+/*
+ * Enforce that the address range described by kva and nbytes is not currently
+ * PPL-owned, and won't become PPL-owned while pinned.  This is to prevent
+ * unintentionally writing to PPL-owned memory.
+ */
+static void
+pmap_pin_kernel_pages(vm_offset_t kva, size_t nbytes)
+{
+       vm_offset_t end;
+       if (os_add_overflow(kva, nbytes, &end)) {
+               panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
+       }
+       for (vm_offset_t ckva = kva; ckva < end; ckva = round_page(ckva + 1)) {
+               pmap_paddr_t pa = kvtophys(ckva);
+               if (!pa_valid(pa)) {
+                       panic("%s(%p): invalid physical page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
+               }
+               pp_attr_t attr;
+               unsigned int pai = (unsigned int)pa_index(pa);
+               if (ckva == phystokv(pa)) {
+                       panic("%s(%p): attempt to pin static mapping for page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
+               }
+               do {
+                       attr = pp_attr_table[pai] & ~PP_ATTR_NO_MONITOR;
+                       if (attr & PP_ATTR_MONITOR) {
+                               panic("%s(%p): physical page 0x%llx belongs to PPL", __func__, (void*)kva, (uint64_t)pa);
+                       }
+               } while (!OSCompareAndSwap16(attr, attr | PP_ATTR_NO_MONITOR, &pp_attr_table[pai]));
+       }
+}
+
+static void
+pmap_unpin_kernel_pages(vm_offset_t kva, size_t nbytes)
+{
+       vm_offset_t end;
+       if (os_add_overflow(kva, nbytes, &end)) {
+               panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
+       }
+       for (vm_offset_t ckva = kva; ckva < end; ckva = round_page(ckva + 1)) {
+               pmap_paddr_t pa = kvtophys(ckva);
+               if (!pa_valid(pa)) {
+                       panic("%s(%p): invalid physical page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
+               }
+               if (!(pp_attr_table[pa_index(pa)] & PP_ATTR_NO_MONITOR)) {
+                       panic("%s(%p): physical page 0x%llx not pinned", __func__, (void*)kva, (uint64_t)pa);
+               }
+               assert(!(pp_attr_table[pa_index(pa)] & PP_ATTR_MONITOR));
+               pa_clear_no_monitor(pa);
+       }
+}
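
Editor's note: a minimal usage sketch for the pinning helpers above, assuming a caller that writes into a kernel buffer whose pages must not be handed to the PPL while the write is in flight. The caller and buffer are hypothetical; only pmap_pin_kernel_pages()/pmap_unpin_kernel_pages() come from this diff.

    /* Hypothetical caller: bracket a write so the buffer's pages cannot gain
     * PP_ATTR_MONITOR (become PPL-owned) mid-operation. */
    static void
    fill_kernel_buffer_example(vm_offset_t buf_kva, size_t nbytes)
    {
            pmap_pin_kernel_pages(buf_kva, nbytes);    /* panics if any page already belongs to the PPL */
            memset((void *)buf_kva, 0, nbytes);        /* safe: PP_ATTR_NO_MONITOR blocks a PPL handoff */
            pmap_unpin_kernel_pages(buf_kva, nbytes);  /* clears PP_ATTR_NO_MONITOR again */
    }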
+
+/*
+ * Lock down a page, making all mappings read-only, and preventing
+ * further mappings or removal of this particular kva's mapping.
+ * Effectively, it makes the page at kva immutable.
+ */
+MARK_AS_PMAP_TEXT static void
+pmap_ppl_lockdown_page(vm_address_t kva)
+{
+       pmap_paddr_t pa = kvtophys(kva);
+       unsigned int pai = (unsigned int)pa_index(pa);
+       LOCK_PVH(pai);
+       pv_entry_t **pv_h  = pai_to_pvh(pai);
+
+       if (pa_test_monitor(pa)) {
+               panic("%#lx: page %llx belongs to PPL", kva, pa);
+       }
+
+       if (pvh_get_flags(pv_h) & (PVH_FLAG_LOCKDOWN | PVH_FLAG_EXEC)) {
+               panic("%#lx: already locked down/executable (%#llx)", kva, pvh_get_flags(pv_h));
+       }
+
+       pt_entry_t *pte_p = pmap_pte(kernel_pmap, kva);
+
+       if (pte_p == PT_ENTRY_NULL) {
+               panic("%#lx: NULL pte", kva);
+       }
+
+       pt_entry_t tmplate = *pte_p;
+       if ((tmplate & ARM_PTE_APMASK) != ARM_PTE_AP(AP_RWNA)) {
+               panic("%#lx: not a kernel r/w page (%#llx)", kva, tmplate & ARM_PTE_APMASK);
+       }
+
+       pvh_set_flags(pv_h, pvh_get_flags(pv_h) | PVH_FLAG_LOCKDOWN);
+
+       pmap_set_ptov_ap(pai, AP_RONA, FALSE);
+
+       UNLOCK_PVH(pai);
+
+       pmap_page_protect_options_internal((ppnum_t)atop(pa), VM_PROT_READ, 0);
+}
+
+/*
+ * Release a page from being locked down to the PPL, making it writable
+ * to the kernel once again.
+ */
+MARK_AS_PMAP_TEXT static void
+pmap_ppl_unlockdown_page(vm_address_t kva)
+{
+       pmap_paddr_t pa = kvtophys(kva);
+       unsigned int pai = (unsigned int)pa_index(pa);
+       LOCK_PVH(pai);
+       pv_entry_t **pv_h  = pai_to_pvh(pai);
+
+       vm_offset_t pvh_flags = pvh_get_flags(pv_h);
+
+       if (!(pvh_flags & PVH_FLAG_LOCKDOWN)) {
+               panic("unlockdown attempt on not locked down virtual %#lx/pai %d", kva, pai);
+       }
+
+       pvh_set_flags(pv_h, pvh_flags & ~PVH_FLAG_LOCKDOWN);
+       pmap_set_ptov_ap(pai, AP_RWNA, FALSE);
+       UNLOCK_PVH(pai);
+}
+
+#else /* XNU_MONITOR */
 
 static void __unused
 pmap_pin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
@@ -10276,6 +12413,7 @@ pmap_unpin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
 {
 }
 
+#endif /* !XNU_MONITOR */
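
Editor's note: the lockdown/unlockdown pair in the XNU_MONITOR branch above lets a page of kernel data be frozen read-only under PPL control. A hedged sketch of the intended call pattern; the routine and the table it publishes are invented, and since both helpers are MARK_AS_PMAP_TEXT static, any real caller must itself live in PPL text within this file.

    /* Hypothetical PPL-side routine: publish an immutable table to the kernel. */
    MARK_AS_PMAP_TEXT static void
    publish_readonly_table_example(vm_address_t table_kva)
    {
            /* ... fill in the table while it is still kernel r/w ... */
            pmap_ppl_lockdown_page(table_kva);      /* all mappings become read-only */

            /* much later, only if the table is ever retired: */
            pmap_ppl_unlockdown_page(table_kva);    /* restore kernel r/w access */
    }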
 
 
 #define PMAP_RESIDENT_INVALID   ((mach_vm_size_t)-1)
@@ -10378,7 +12516,11 @@ pmap_query_resident(
                if (l > end) {
                        l = end;
                }
+#if XNU_MONITOR
+               resident_bytes = pmap_query_resident_ppl(pmap, va, l, compressed_bytes_p);
+#else
                resident_bytes = pmap_query_resident_internal(pmap, va, l, compressed_bytes_p);
+#endif
                if (resident_bytes == PMAP_RESIDENT_INVALID) {
                        break;
                }
@@ -11403,7 +13545,11 @@ void
 pmap_set_jit_entitled(
        pmap_t pmap)
 {
+#if XNU_MONITOR
+       pmap_set_jit_entitled_ppl(pmap);
+#else
        pmap_set_jit_entitled_internal(pmap);
+#endif
 }
 
 MARK_AS_PMAP_TEXT static kern_return_t
@@ -11483,7 +13629,11 @@ pmap_query_page_info(
        vm_map_offset_t va,
        int             *disp_p)
 {
+#if XNU_MONITOR
+       return pmap_query_page_info_ppl(pmap, va, disp_p);
+#else
        return pmap_query_page_info_internal(pmap, va, disp_p);
+#endif
 }
 
 MARK_AS_PMAP_TEXT kern_return_t
@@ -11496,7 +13646,11 @@ pmap_return_internal(__unused boolean_t do_panic, __unused boolean_t do_recurse)
 kern_return_t
 pmap_return(boolean_t do_panic, boolean_t do_recurse)
 {
+#if XNU_MONITOR
+       return pmap_return_ppl(do_panic, do_recurse);
+#else
        return pmap_return_internal(do_panic, do_recurse);
+#endif
 }
 
 
@@ -11525,7 +13679,11 @@ pmap_footprint_suspend(
        vm_map_t map,
        boolean_t suspend)
 {
+#if XNU_MONITOR
+       pmap_footprint_suspend_ppl(map, suspend);
+#else
        pmap_footprint_suspend_internal(map, suspend);
+#endif
 }
 
 #if defined(__arm64__) && (DEVELOPMENT || DEBUG)
index e56129770d4359677d608bae3eb7fa962f104892..4cfec3ecfff9468b448d1dabd4581a9169341202 100644 (file)
 #define CPUWINDOWS_MAX              4
 
 struct pmap_cpu_data {
+#if XNU_MONITOR
+       uint64_t cpu_id;
+       void * ppl_kern_saved_sp;
+       void * ppl_stack;
+       arm_context_t * save_area;
+       unsigned int ppl_state;
+#endif
 #if defined(__arm64__)
        pmap_t cpu_nested_pmap;
 #else
@@ -212,6 +219,9 @@ extern void set_mmu_ttb_alternate(uint64_t);
 extern uint64_t get_tcr(void);
 extern void set_tcr(uint64_t);
 extern uint64_t pmap_get_arm64_prot(pmap_t, vm_offset_t);
+#if defined(HAS_VMSA_LOCK)
+extern void vmsa_lock(void);
+#endif
 #else
 extern uint32_t get_mmu_control(void);
 extern void set_mmu_control(uint32_t);
@@ -393,6 +403,10 @@ extern  void pmap_gc(void);
 #if defined(__arm64__)
 extern vm_offset_t pmap_extract(pmap_t pmap, vm_map_offset_t va);
 #endif
+#if HAS_APPLE_PAC && XNU_MONITOR
+extern void * pmap_sign_user_ptr(void *value, ptrauth_key key, uint64_t data);
+extern void * pmap_auth_user_ptr(void *value, ptrauth_key key, uint64_t data);
+#endif /* HAS_APPLE_PAC && XNU_MONITOR */
 
 /*
  * Interfaces implemented as macros.
@@ -538,6 +552,10 @@ boolean_t pmap_enforces_execute_only(pmap_t pmap);
 #define PMAP_LEDGER_ALLOC_INDEX 66
 #define PMAP_LEDGER_FREE_INDEX 67
 
+#if HAS_APPLE_PAC && XNU_MONITOR
+#define PMAP_SIGN_USER_PTR 68
+#define PMAP_AUTH_USER_PTR 69
+#endif /* HAS_APPLE_PAC && XNU_MONITOR */
 
 
 #define PMAP_COUNT 71
@@ -554,23 +572,82 @@ extern void pmap_cpu_data_init(void);
 /* Get the pmap per-CPU data for the current CPU. */
 extern pmap_cpu_data_t * pmap_get_cpu_data(void);
 
+#if XNU_MONITOR
+extern boolean_t pmap_ppl_locked_down;
+
+/*
+ * Denotes the bounds of the PPL stacks.  These are visible so that other code
+ * can check if addresses are part of the PPL stacks.
+ */
+extern void * pmap_stacks_start;
+extern void * pmap_stacks_end;
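
Editor's note: a minimal sketch of the kind of range check these exported bounds allow, per the comment above. The helper name is illustrative and is not declared by this header.

    /* Illustrative only: decide whether an address falls inside the PPL
     * stack region using the exported bounds above. */
    static inline boolean_t
    addr_is_on_ppl_stack(vm_offset_t addr)
    {
            return (addr >= (vm_offset_t)pmap_stacks_start) &&
                   (addr < (vm_offset_t)pmap_stacks_end);
    }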
+
+/* Asks if a page belongs to the monitor. */
+extern boolean_t pmap_is_monitor(ppnum_t pn);
+
+/*
+ * Indicates that we are done with our static bootstrap
+ * allocations, so the monitor may now mark the pages
+ * that it owns.
+ */
+extern void pmap_static_allocations_done(void);
+
+/*
+ * Indicates that we are done mutating sensitive state in the system, and that
+ * the PPL may now restrict write access to PPL-owned mappings.
+ */
+extern void pmap_lockdown_ppl(void);
+
+
+#ifdef KASAN
+#define PPL_STACK_SIZE (PAGE_SIZE << 2)
+#else
+#define PPL_STACK_SIZE PAGE_SIZE
+#endif
+
+/* One stack for each CPU, plus a guard page below each stack and above the last stack */
+#define PPL_STACK_REGION_SIZE ((MAX_CPUS * (PPL_STACK_SIZE + ARM_PGBYTES)) + ARM_PGBYTES)
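
Editor's note: a worked instance of the sizing above, purely for illustration; the CPU count and page size below are assumptions, not values taken from this diff.

    /* Assuming a non-KASAN build with 16K pages (ARM_PGBYTES == PAGE_SIZE == 16 KB)
     * and MAX_CPUS == 6:
     *   PPL_STACK_SIZE        = PAGE_SIZE                    = 16 KB
     *   PPL_STACK_REGION_SIZE = 6 * (16 KB + 16 KB) + 16 KB  = 208 KB
     * i.e. six stacks, each preceded by a guard page, plus one trailing guard. */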
+
+#define PPL_DATA_SEGMENT_SECTION_NAME "__PPLDATA,__data"
+#define PPL_TEXT_SEGMENT_SECTION_NAME "__PPLTEXT,__text,regular,pure_instructions"
+#define PPL_DATACONST_SEGMENT_SECTION_NAME "__PPLDATA,__const"
+
+#define MARK_AS_PMAP_DATA \
+       __PLACE_IN_SECTION(PPL_DATA_SEGMENT_SECTION_NAME)
+#define MARK_AS_PMAP_TEXT \
+       __attribute__((used, section(PPL_TEXT_SEGMENT_SECTION_NAME), noinline))
+#define MARK_AS_PMAP_RODATA \
+       __PLACE_IN_SECTION(PPL_DATACONST_SEGMENT_SECTION_NAME)
+
+#else /* XNU_MONITOR */
 
 #define MARK_AS_PMAP_TEXT
 #define MARK_AS_PMAP_DATA
 #define MARK_AS_PMAP_RODATA
 
+#endif /* !XNU_MONITOR */
 
 
 extern kern_return_t pmap_return(boolean_t do_panic, boolean_t do_recurse);
 
 extern lck_grp_t pmap_lck_grp;
 
+#if XNU_MONITOR
+extern void CleanPoC_DcacheRegion_Force_nopreempt(vm_offset_t va, unsigned length);
+#define pmap_force_dcache_clean(va, sz) CleanPoC_DcacheRegion_Force_nopreempt(va, sz)
+#define pmap_simple_lock(l)             simple_lock_nopreempt(l, &pmap_lck_grp)
+#define pmap_simple_unlock(l)           simple_unlock_nopreempt(l)
+#define pmap_simple_lock_try(l)         simple_lock_try_nopreempt(l, &pmap_lck_grp)
+#define pmap_lock_bit(l, i)             hw_lock_bit_nopreempt(l, i, &pmap_lck_grp)
+#define pmap_unlock_bit(l, i)           hw_unlock_bit_nopreempt(l, i)
+#else
 #define pmap_force_dcache_clean(va, sz) CleanPoC_DcacheRegion_Force(va, sz)
 #define pmap_simple_lock(l)             simple_lock(l, &pmap_lck_grp)
 #define pmap_simple_unlock(l)           simple_unlock(l)
 #define pmap_simple_lock_try(l)         simple_lock_try(l, &pmap_lck_grp)
 #define pmap_lock_bit(l, i)             hw_lock_bit(l, i, &pmap_lck_grp)
 #define pmap_unlock_bit(l, i)           hw_unlock_bit(l, i)
+#endif
 
 #endif /* #ifndef ASSEMBLER */
 
index 192bc9d692e7684eba3a127c1baf919a8be32a6c..c5921cede188851862270dcddb5be7a3377e8e66 100644 (file)
 #define __PLATFORM_WKDM_ALIGNMENT_BOUNDARY__ (64)
 #define __ARM_CLUSTER_COUNT__                2
 
+#elif defined (APPLEVORTEX)
+#define __ARM_ARCH__                         8
+#define __ARM_VMSA__                         8
+#define __ARM_SMP__                          1
+#define __ARM_VFP__                          4
+#define __ARM_COHERENT_CACHE__               1
+#define __ARM_COHERENT_IO__                  1
+#define __ARM_IC_NOALIAS_ICACHE__            1
+#define __ARM_DEBUG__                        7
+#define __ARM_ENABLE_SWAP__                  1
+#define __ARM_V8_CRYPTO_EXTENSIONS__         1
+#define __ARM_16K_PG__                       1
+#define __ARM64_PMAP_SUBPAGE_L1__            1
+#define __ARM_GLOBAL_SLEEP_BIT__             1
+#define __ARM_PAN_AVAILABLE__                1
+#define __ARM_WKDM_ISA_AVAILABLE__           1
+#define __PLATFORM_WKDM_ALIGNMENT_MASK__     (0x3FULL)
+#define __PLATFORM_WKDM_ALIGNMENT_BOUNDARY__ (64)
+#define __ARM_CLUSTER_COUNT__                2
+
+#elif defined (APPLELIGHTNING)
+#define __ARM_ARCH__                         8
+#define __ARM_VMSA__                         8
+#define __ARM_SMP__                          1
+#define __ARM_AMP__                          1
+#define __ARM_VFP__                          4
+#define __ARM_COHERENT_CACHE__               1
+#define __ARM_COHERENT_IO__                  1
+#define __ARM_IC_NOALIAS_ICACHE__            1
+#define __ARM_DEBUG__                        7
+#define __ARM_ENABLE_SWAP__                  1
+#define __ARM_V8_CRYPTO_EXTENSIONS__         1
+#define __ARM_16K_PG__                       1
+#define __ARM64_PMAP_SUBPAGE_L1__            1
+#define __ARM_GLOBAL_SLEEP_BIT__             1
+#define __ARM_PAN_AVAILABLE__                1
+#define __ARM_WKDM_ISA_AVAILABLE__           1
+#define __PLATFORM_WKDM_ALIGNMENT_MASK__     (0x3FULL)
+#define __PLATFORM_WKDM_ALIGNMENT_BOUNDARY__ (64)
+#define __ARM_CLUSTER_COUNT__                2
+#define
+#define __APCFG_SUPPORTED__                  1
+#define __ARM_RANGE_TLBI__                   1
+
 #elif defined (BCM2837)
 #define __ARM_ARCH__              8
 #define __ARM_VMSA__              8
 #define L2_SWAY         (L2_CSIZE - L2_NWAY)     /* set size 1<<L2_SWAY */
 #define L2_NSET         (L2_SWAY - L2_CLINE)     /* lines per way 1<<L2_NSET */
 
+#elif defined (APPLEVORTEX)
+
+/* I-Cache, 128KB 8-way for Vortex, 48KB 6-way for Tempest. */
+#define MMU_I_CLINE 6                      /* cache line size as 1<<MMU_I_CLINE (64) */
+
+/* D-Cache, 128KB 8-way for Vortex, 32KB 4-way for Tempest. */
+#define MMU_CSIZE   17                     /* cache size as 1<<MMU_CSIZE (128K) */
+#define MMU_CLINE   6                      /* cache line size is 1<<MMU_CLINE (64) */
+#define MMU_NWAY    3                      /* set associativity 1<<MMU_NWAY (8) */
+#define MMU_I7SET   6                      /* cp15 c7 set incrementer 1<<MMU_I7SET */
+#define MMU_I7WAY   30                     /* cp15 c7 way incrementer 1<<MMU_I7WAY */
+#define MMU_I9WAY   30                     /* cp15 c9 way incrementer 1<<MMU_I9WAY */
+
+#define MMU_SWAY    (MMU_CSIZE - MMU_NWAY) /* set size 1<<MMU_SWAY */
+#define MMU_NSET    (MMU_SWAY - MMU_CLINE) /* lines per way 1<<MMU_NSET */
+
+/* L2-Cache */
+#define __ARM_L2CACHE__ 1
+
+/*
+ * LLC (Vortex L2):  8MB, 128-byte lines, 16-way.
+ * LLC (Tempest L2): 2MB, 128-byte lines, 16-way.
+ */
+#define L2_CSIZE        __ARM_L2CACHE_SIZE_LOG__ /* cache size as 1<<L2_CSIZE */
+#define L2_CLINE        7                        /* cache line size as 1<<L2_CLINE (128) */
+#define L2_NWAY         4                        /* set associativity as 1<<L2_NWAY (16) */
+#define L2_I7SET        6                        /* TODO: cp15 c7 set incrementer 1<<L2_I7SET */
+#define L2_I7WAY        28                       /* TODO: cp15 c7 way incrementer 1<<L2_I7WAY */
+#define L2_I9WAY        28                       /* TODO: cp15 c9 way incrementer 1<<L2_I9WAY */
+
+#define L2_SWAY         (L2_CSIZE - L2_NWAY)     /* set size 1<<L2_SWAY */
+#define L2_NSET         (L2_SWAY - L2_CLINE)     /* lines per way 1<<L2_NSET */
+
+#elif defined (APPLELIGHTNING)
+
+/* I-Cache, 192KB for Lightning, 96KB for Thunder, 6-way. */
+#define MMU_I_CLINE 6                      /* cache line size as 1<<MMU_I_CLINE (64) */
+
+/* D-Cache, 128KB for Lightning, 8-way. 48KB for Thunder, 6-way. */
+#define MMU_CSIZE   17                     /* cache size as 1<<MMU_CSIZE (128K) */
+#define MMU_CLINE   6                      /* cache line size is 1<<MMU_CLINE (64) */
+#define MMU_NWAY    3                      /* set associativity 1<<MMU_NWAY (8) */
+#define MMU_I7SET   6                      /* cp15 c7 set incrementer 1<<MMU_I7SET */
+#define MMU_I7WAY   30                     /* cp15 c7 way incrementer 1<<MMU_I7WAY */
+#define MMU_I9WAY   30                     /* cp15 c9 way incrementer 1<<MMU_I9WAY */
+
+#define MMU_SWAY    (MMU_CSIZE - MMU_NWAY) /* set size 1<<MMU_SWAY */
+#define MMU_NSET    (MMU_SWAY - MMU_CLINE) /* lines per way 1<<MMU_NSET */
+
+/* L2-Cache */
+#define __ARM_L2CACHE__ 1
+
+/*
+ * LLC (Lightning L2):  8MB, 128-byte lines, 16-way.
+ * LLC (Thunder L2): 4MB, 128-byte lines, 16-way.
+ */
+#define L2_CSIZE        __ARM_L2CACHE_SIZE_LOG__ /* cache size as 1<<L2_CSIZE */
+#define L2_CLINE        7                        /* cache line size as 1<<L2_CLINE (128) */
+#define L2_NWAY         4                        /* set associativity as 1<<L2_NWAY (16) */
+#define L2_I7SET        6                        /* TODO: cp15 c7 set incrementer 1<<L2_I7SET */
+#define L2_I7WAY        28                       /* TODO: cp15 c7 way incrementer 1<<L2_I7WAY */
+#define L2_I9WAY        28                       /* TODO: cp15 c9 way incrementer 1<<L2_I9WAY */
+
+#define L2_SWAY         (L2_CSIZE - L2_NWAY)     /* set size 1<<L2_SWAY */
+#define L2_NSET         (L2_SWAY - L2_CLINE)     /* lines per way 1<<L2_NSET */
+
 #elif defined (BCM2837) /* Raspberry Pi 3 */
 
 /* I-Cache. We don't have detailed spec so we just follow the ARM technical reference. */
index 8f6a0cbb75dc0488f833044380bd5813d7a79fbe..acdb849eee2efdee12de05a0d3e7ae4d6a62c72e 100644 (file)
@@ -70,7 +70,18 @@ static_assert((KERNEL_PMAP_HEAP_RANGE_START & ~ARM_TT_ROOT_OFFMASK) > ARM_KERNEL
 static_assert((((~ARM_KERNEL_PROTECT_EXCEPTION_START) + 1) * 2ULL) <= (ARM_TT_ROOT_SIZE + ARM_TT_ROOT_INDEX_MASK));
 #endif /* __ARM_KERNEL_PROTECT__ */
 
+#if __APRR_SUPPORTED__ && XNU_MONITOR
+/*
+ * If APRR is supported, setting XN on L1/L2 table entries will shift the effective
+ * APRR index of L3 PTEs covering PPL-protected pages in the kernel dynamic region
+ * from PPL R/W to kernel R/W.  That will effectively remove PPL write protection
+ * from those pages.  Avoid setting XN at the table level for MONITOR-enabled builds
+ * that are backed by APRR.
+ */
+#define ARM_DYNAMIC_TABLE_XN ARM_TTE_TABLE_PXN
+#else
 #define ARM_DYNAMIC_TABLE_XN (ARM_TTE_TABLE_PXN | ARM_TTE_TABLE_XN)
+#endif
 
 #if KASAN
 extern vm_offset_t shadow_pbase;
@@ -194,6 +205,18 @@ SECURITY_READ_ONLY_LATE(vm_offset_t)          segLOWEST;
 SECURITY_READ_ONLY_LATE(static vm_offset_t)   segTEXTB;
 SECURITY_READ_ONLY_LATE(static unsigned long) segSizeTEXT;
 
+#if XNU_MONITOR
+SECURITY_READ_ONLY_LATE(vm_offset_t)          segPPLTEXTB;
+SECURITY_READ_ONLY_LATE(unsigned long)        segSizePPLTEXT;
+
+SECURITY_READ_ONLY_LATE(vm_offset_t)          segPPLTRAMPB;
+SECURITY_READ_ONLY_LATE(unsigned long)        segSizePPLTRAMP;
+
+SECURITY_READ_ONLY_LATE(vm_offset_t)          segPPLDATACONSTB;
+SECURITY_READ_ONLY_LATE(unsigned long)        segSizePPLDATACONST;
+SECURITY_READ_ONLY_LATE(void *)               pmap_stacks_start = NULL;
+SECURITY_READ_ONLY_LATE(void *)               pmap_stacks_end = NULL;
+#endif
 
 SECURITY_READ_ONLY_LATE(static vm_offset_t)   segDATACONSTB;
 SECURITY_READ_ONLY_LATE(static unsigned long) segSizeDATACONST;
@@ -204,6 +227,10 @@ SECURITY_READ_ONLY_LATE(static unsigned long) segSizeTEXTEXEC;
 SECURITY_READ_ONLY_LATE(static vm_offset_t)   segDATAB;
 SECURITY_READ_ONLY_LATE(static unsigned long) segSizeDATA;
 
+#if XNU_MONITOR
+SECURITY_READ_ONLY_LATE(vm_offset_t)          segPPLDATAB;
+SECURITY_READ_ONLY_LATE(unsigned long)        segSizePPLDATA;
+#endif
 
 SECURITY_READ_ONLY_LATE(vm_offset_t)          segBOOTDATAB;
 SECURITY_READ_ONLY_LATE(unsigned long)        segSizeBOOTDATA;
@@ -351,7 +378,7 @@ round_up_pte_hint_address(vm_offset_t address)
 vm_offset_t alloc_ptpage(boolean_t map_static) {
        vm_offset_t vaddr;
 
-#if !(defined(KERNEL_INTEGRITY_KTRR))
+#if !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR))
        map_static = FALSE;
 #endif
 
@@ -480,7 +507,7 @@ void dump_kva_space() {
 
 #endif /* DEBUG */
 
-#if __ARM_KERNEL_PROTECT__
+#if __ARM_KERNEL_PROTECT__ || XNU_MONITOR
 /*
  * arm_vm_map:
  *   root_ttp: The kernel virtual address for the root of the target page tables
@@ -555,7 +582,7 @@ arm_vm_map(tt_entry_t * root_ttp, vm_offset_t vaddr, pt_entry_t pte)
        *ptep = pte;
 }
 
-#endif // __ARM_KERNEL_PROTECT
+#endif // __ARM_KERNEL_PROTECT || XNU_MONITOR
 
 #if __ARM_KERNEL_PROTECT__
 
@@ -712,7 +739,7 @@ arm_vm_expand_kernel_el0_mappings(void)
 }
 #endif /* __ARM_KERNEL_PROTECT__ */
 
-#if defined(KERNEL_INTEGRITY_KTRR)
+#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
 extern void bootstrap_instructions;
 
 /*
@@ -777,7 +804,7 @@ static void arm_replace_identity_map(boot_args * args)
                ARM_PTE_AP(AP_RONA) |
                ARM_PTE_NX;
 }
-#endif /* defined(KERNEL_INTEGRITY_KTRR)*/
+#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
 
 tt_entry_t *arm_kva_to_tte(vm_offset_t);
 
@@ -791,6 +818,16 @@ arm_kva_to_tte(vm_offset_t va)
        return tte2;
 }
 
+#if XNU_MONITOR
+
+static inline pt_entry_t *
+arm_kva_to_pte(vm_offset_t va)
+{
+       tt_entry_t *tte2 = arm_kva_to_tte(va);
+       return L3_TABLE_VA(tte2) + L3_TABLE_INDEX(va);
+}
+
+#endif
 
 #define ARM64_GRANULE_ALLOW_BLOCK (1 << 0)
 #define ARM64_GRANULE_ALLOW_HINT (1 << 1)
@@ -1096,13 +1133,34 @@ arm_vm_prot_init(boot_args * args)
         * NO, stuff in this segment gets modified during startup (viz. mac_policy_init()/mac_policy_list)
         * Make RNX in prot_finalize
         */
+#if XNU_MONITOR
+       /* The ropagetable region will ultimately be owned by the PPL.  Set permissions
+        * on it separately to avoid applying mismatched block settings between this function,
+        * pmap_static_allocations_done(), and arm_vm_prot_finalize(). */
+       vm_offset_t segDATACONSTE = segDATACONSTB + segSizeDATACONST;
+
+       arm_vm_page_granular_RWNX(segDATACONSTB, (vm_offset_t)&ropagetable_begin - segDATACONSTB, ARM64_GRANULE_ALLOW_BLOCK);
+       arm_vm_page_granular_RWNX((vm_offset_t)&ropagetable_begin,
+           (vm_offset_t)&ropagetable_end - (vm_offset_t)&ropagetable_begin, ARM64_GRANULE_ALLOW_BLOCK);
+       arm_vm_page_granular_RWNX((vm_offset_t)&ropagetable_end,
+           segDATACONSTE - (vm_offset_t)&ropagetable_end, ARM64_GRANULE_ALLOW_BLOCK);
+#else
        arm_vm_page_granular_RWNX(segDATACONSTB, segSizeDATACONST, ARM64_GRANULE_ALLOW_BLOCK);
+#endif
 
        arm_vm_page_granular_ROX(segTEXTEXECB, segSizeTEXTEXEC, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT);
 
+#if XNU_MONITOR
+       arm_vm_page_granular_ROX(segPPLTEXTB, segSizePPLTEXT, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT);
+       arm_vm_page_granular_ROX(segPPLTRAMPB, segSizePPLTRAMP, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT);
+       arm_vm_page_granular_RNX(segPPLDATACONSTB, segSizePPLDATACONST, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT);
+#endif
 
        /* DATA segment will remain RWNX */
        arm_vm_page_granular_RWNX(segDATAB, segSizeDATA, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT);
+#if XNU_MONITOR
+       arm_vm_page_granular_RWNX(segPPLDATAB, segSizePPLDATA, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT);
+#endif
 
        arm_vm_page_granular_RWNX(segBOOTDATAB, segSizeBOOTDATA, 0);
        arm_vm_page_granular_RNX((vm_offset_t)&intstack_low_guard, PAGE_MAX_SIZE, 0);
@@ -1179,6 +1237,67 @@ arm_vm_physmap_slide(ptov_table_entry *temp_ptov_table, vm_map_address_t physmap
        ++ptov_index;
 }
 
+#if XNU_MONITOR
+
+SECURITY_READ_ONLY_LATE(static boolean_t) keep_linkedit = FALSE;
+
+static void
+arm_vm_physmap_init(boot_args *args, vm_map_address_t physmap_base, vm_map_address_t dynamic_memory_begin __unused)
+{
+       ptov_table_entry temp_ptov_table[PTOV_TABLE_SIZE];
+       bzero(temp_ptov_table, sizeof(temp_ptov_table));
+
+       // This is memory that will either be handed back to the VM layer via ml_static_mfree(),
+       // or will be available for general-purpose use.   Physical aperture mappings for this memory
+       // must be at page granularity, so that PPL ownership or cache attribute changes can be reflected
+       // in the physical aperture mappings.
+
+
+       // Slid region between gPhysBase and beginning of protected text
+       arm_vm_physmap_slide(temp_ptov_table, physmap_base, gVirtBase, segLOWEST - gVirtBase, AP_RWNA, 0);
+
+       // kext bootstrap segment
+       arm_vm_physmap_slide(temp_ptov_table, physmap_base, segKLDB, segSizeKLD, AP_RONA, 0);
+
+       // Early-boot data
+       arm_vm_physmap_slide(temp_ptov_table, physmap_base, segBOOTDATAB, segSizeBOOTDATA, AP_RONA, 0);
+
+#if KASAN_DYNAMIC_BLACKLIST
+       /* KASAN's dynamic blacklist needs to query the LINKEDIT segment at runtime.  As such, the
+        * kext bootstrap code will not jettison LINKEDIT on kasan kernels, so don't bother to relocate it. */
+       keep_linkedit = TRUE;
+#else
+       PE_parse_boot_argn("keepsyms", &keep_linkedit, sizeof(keep_linkedit));
+#endif
+       if (!keep_linkedit) {
+               // Kernel LINKEDIT
+               arm_vm_physmap_slide(temp_ptov_table, physmap_base, segLINKB, segSizeLINK, AP_RWNA, 0);
+
+               // Prelinked kernel LINKEDIT
+               arm_vm_physmap_slide(temp_ptov_table, physmap_base, segPLKLINKEDITB, segSizePLKLINKEDIT, AP_RWNA, 0);
+       }
+
+       // Prelinked kernel plists
+       arm_vm_physmap_slide(temp_ptov_table, physmap_base, segPRELINKINFOB, segSizePRELINKINFO, AP_RWNA, 0);
+
+       // Device tree, ramdisk, boot args
+       arm_vm_physmap_slide(temp_ptov_table, physmap_base, end_kern, (args->topOfKernelData - gPhysBase + gVirtBase) - end_kern, AP_RWNA, 0);
+       PE_slide_devicetree(temp_ptov_table[ptov_index - 1].va - end_kern);
+
+       // Remainder of physical memory
+       arm_vm_physmap_slide(temp_ptov_table, physmap_base, (args->topOfKernelData + BOOTSTRAP_TABLE_SIZE - gPhysBase + gVirtBase),
+                            real_avail_end - (args->topOfKernelData + BOOTSTRAP_TABLE_SIZE), AP_RWNA, 0);
+
+       assert((temp_ptov_table[ptov_index - 1].va + temp_ptov_table[ptov_index - 1].len) <= dynamic_memory_begin);
+
+       // Sort in descending order of segment length.  LUT traversal is linear, so largest (most likely used)
+       // segments should be placed earliest in the table to optimize lookup performance.
+       qsort(temp_ptov_table, PTOV_TABLE_SIZE, sizeof(temp_ptov_table[0]), cmp_ptov_entries);
+
+       memcpy(ptov_table, temp_ptov_table, sizeof(ptov_table));
+}
+
+#else
 
 static void
 arm_vm_physmap_init(boot_args *args, vm_map_address_t physmap_base, vm_map_address_t dynamic_memory_begin __unused)
@@ -1205,6 +1324,7 @@ arm_vm_physmap_init(boot_args *args, vm_map_address_t physmap_base, vm_map_addre
        memcpy(ptov_table, temp_ptov_table, sizeof(ptov_table));
 }
 
+#endif // XNU_MONITOR
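
Editor's note: arm_vm_physmap_init() above sorts temp_ptov_table by descending segment length before installing it, so the largest mappings are checked first during the linear lookup mentioned in the comment. The comparator (cmp_ptov_entries) is defined outside this hunk; the following is a sketch of what a descending-length comparator over ptov_table_entry could look like, written as an assumption about its shape rather than a copy of the in-tree function.

    /* Sketch: order ptov_table entries largest-first so the most frequently
     * hit translations are found earliest in the linear scan. */
    static int
    cmp_ptov_entries_sketch(const void *a, const void *b)
    {
            const ptov_table_entry *e1 = a;
            const ptov_table_entry *e2 = b;

            if (e1->len < e2->len) {
                    return 1;       /* shorter entry sorts later */
            } else if (e1->len > e2->len) {
                    return -1;      /* longer entry sorts earlier */
            }
            return 0;
    }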
 
 void
 arm_vm_prot_finalize(boot_args * args __unused)
@@ -1248,8 +1368,35 @@ arm_vm_prot_finalize(boot_args * args __unused)
        arm_vm_populate_kernel_el0_mappings();
 #endif /* __ARM_KERNEL_PROTECT__ */
 
+#if XNU_MONITOR
+       for (vm_offset_t va = segKLDB; va < (segKLDB + segSizeKLD); va += ARM_PGBYTES) {
+               pt_entry_t *pte = arm_kva_to_pte(va);
+               *pte = ARM_PTE_EMPTY;
+       }
+       /* Clear the original stack mappings; these pages should be mapped through ptov_table. */
+       for (vm_offset_t va = segBOOTDATAB; va < (segBOOTDATAB + segSizeBOOTDATA); va += ARM_PGBYTES) {
+               pt_entry_t *pte = arm_kva_to_pte(va);
+               *pte = ARM_PTE_EMPTY;
+       }
+       /* Clear the original PRELINKINFO mapping. This segment should be jettisoned during I/O Kit
+        * initialization before we reach this point. */
+       for (vm_offset_t va = segPRELINKINFOB; va < (segPRELINKINFOB + segSizePRELINKINFO); va += ARM_PGBYTES) {
+               pt_entry_t *pte = arm_kva_to_pte(va);
+               *pte = ARM_PTE_EMPTY;
+       }
+       if (!keep_linkedit) {
+               for (vm_offset_t va = segLINKB; va < (segLINKB + segSizeLINK); va += ARM_PGBYTES) {
+                       pt_entry_t *pte = arm_kva_to_pte(va);
+                       *pte = ARM_PTE_EMPTY;
+               }
+               for (vm_offset_t va = segPLKLINKEDITB; va < (segPLKLINKEDITB + segSizePLKLINKEDIT); va += ARM_PGBYTES) {
+                       pt_entry_t *pte = arm_kva_to_pte(va);
+                       *pte = ARM_PTE_EMPTY;
+               }
+       }
+#endif /* XNU_MONITOR */
 
-#if defined(KERNEL_INTEGRITY_KTRR)
+#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
        /*
         * __LAST,__pinst should no longer be executable.
         */
@@ -1262,7 +1409,20 @@ arm_vm_prot_finalize(boot_args * args __unused)
         */
 #endif
 
+#if XNU_MONITOR
+       vm_offset_t segDATACONSTE = segDATACONSTB + segSizeDATACONST;
+
+       /*
+        * For the moment, the RO pagetable allocation is part of the
+        * constant data segment, but it is technically owned by the
+        * PPL.  Hence, we should not reprotect it.
+        */
+       arm_vm_page_granular_RNX(segDATACONSTB, (vm_offset_t)&ropagetable_begin - segDATACONSTB, ARM64_GRANULE_ALLOW_BLOCK);
+       arm_vm_page_granular_RNX((vm_offset_t)&ropagetable_end,
+           segDATACONSTE - (vm_offset_t)&ropagetable_end, ARM64_GRANULE_ALLOW_BLOCK);
+#else
        arm_vm_page_granular_RNX(segDATACONSTB, segSizeDATACONST, ARM64_GRANULE_ALLOW_BLOCK);
+#endif
 
        __builtin_arm_dsb(DSB_ISH);
        flush_mmu_tlb();
@@ -1363,12 +1523,22 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
 
        physmap_base += physmap_slide;
 
+#if XNU_MONITOR
+       physmap_base = ROUND_TWIG(physmap_base);
+       static_memory_end = physmap_base + mem_size;
+#else
        static_memory_end = physmap_base + mem_size + (PTOV_TABLE_SIZE * ARM_TT_TWIG_SIZE); // worst possible case for block alignment
+#endif
 #if KASAN
        /* add the KASAN stolen memory to the physmap */
        dynamic_memory_begin = static_memory_end + (shadow_ptop - shadow_pbase);
 #else
        dynamic_memory_begin = static_memory_end;
+#endif
+#if XNU_MONITOR
+       pmap_stacks_start = (void*)dynamic_memory_begin;
+       dynamic_memory_begin += PPL_STACK_REGION_SIZE;
+       pmap_stacks_end = (void*)dynamic_memory_begin;
 #endif
        if (dynamic_memory_begin > VM_MAX_KERNEL_ADDRESS)
                panic("Unsupported memory configuration %lx\n", mem_size);
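
A rough model of the XNU_MONITOR carve-out above: the physmap base is twig-aligned and a fixed PPL stack region is reserved immediately after the static physmap, before KASAN or the dynamic region. Every constant below is an illustrative placeholder, not the kernel's:

    #include <stdint.h>
    #include <stdio.h>

    #define ARM_TT_TWIG_SIZE      0x2000000ULL    /* illustrative twig size */
    #define PPL_STACK_REGION_SIZE 0x100000ULL     /* illustrative */
    #define ROUND_TWIG(x) (((x) + ARM_TT_TWIG_SIZE - 1) & ~(ARM_TT_TWIG_SIZE - 1))

    int main(void)
    {
        uint64_t physmap_base = 0xfffffff000000000ULL; /* hypothetical slid base */
        uint64_t mem_size     = 0x80000000ULL;         /* hypothetical DRAM size */

        physmap_base = ROUND_TWIG(physmap_base);
        uint64_t static_memory_end    = physmap_base + mem_size;
        uint64_t dynamic_memory_begin = static_memory_end;

        /* PPL stacks sit between the static physmap and the dynamic region. */
        uint64_t pmap_stacks_start = dynamic_memory_begin;
        dynamic_memory_begin      += PPL_STACK_REGION_SIZE;
        uint64_t pmap_stacks_end   = dynamic_memory_begin;

        printf("PPL stacks: 0x%llx..0x%llx\n",
            (unsigned long long)pmap_stacks_start,
            (unsigned long long)pmap_stacks_end);
        return 0;
    }
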
@@ -1394,7 +1564,7 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
         */
        avail_start = boot_ttep + BOOTSTRAP_TABLE_SIZE;
 
-#if defined(KERNEL_INTEGRITY_KTRR)
+#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
        arm_replace_identity_map(args);
 #endif
 
@@ -1460,7 +1630,15 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
        segTEXTB         = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__TEXT", &segSizeTEXT);
        segDATACONSTB    = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__DATA_CONST", &segSizeDATACONST);
        segTEXTEXECB     = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__TEXT_EXEC", &segSizeTEXTEXEC);
+#if XNU_MONITOR
+       segPPLTEXTB      = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__PPLTEXT", &segSizePPLTEXT);
+       segPPLTRAMPB     = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__PPLTRAMP", &segSizePPLTRAMP);
+       segPPLDATACONSTB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__PPLDATA_CONST", &segSizePPLDATACONST);
+#endif
        segDATAB         = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__DATA", &segSizeDATA);
+#if XNU_MONITOR
+       segPPLDATAB      = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__PPLDATA", &segSizePPLDATA);
+#endif
 
        segBOOTDATAB    = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__BOOTDATA", &segSizeBOOTDATA);
        segLINKB         = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__LINKEDIT", &segSizeLINK);
@@ -1566,6 +1744,9 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
        set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK);
 
        flush_mmu_tlb();
+#if defined(HAS_VMSA_LOCK)
+       vmsa_lock();
+#endif
        kva_active = TRUE;
        // global table pointers may need to be different due to physical aperture remapping
        cpu_tte = (tt_entry_t*)(phystokv(cpu_ttep));
@@ -1582,6 +1763,28 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
        vm_kernel_etext = segTEXTB + segSizeTEXT + segSizeDATACONST + segSizeTEXTEXEC;
 
        dynamic_memory_begin = ROUND_TWIG(dynamic_memory_begin);
+#if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
+       // reserve a 32MB region without permission overrides to use later for a CTRR unit test
+       {
+               extern vm_offset_t ctrr_test_page;
+               tt_entry_t *new_tte;
+
+               ctrr_test_page = dynamic_memory_begin;
+               dynamic_memory_begin += ARM_TT_L2_SIZE;
+               cpu_l1_tte = cpu_tte + ((ctrr_test_page & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT);
+               assert((*cpu_l1_tte) & ARM_TTE_VALID);
+               cpu_l2_tte = ((tt_entry_t *) phystokv(((*cpu_l1_tte) & ARM_TTE_TABLE_MASK))) + ((ctrr_test_page & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT);
+               assert((*cpu_l2_tte) == ARM_TTE_EMPTY);
+               new_tte = (tt_entry_t *)alloc_ptpage(FALSE);
+               bzero(new_tte, ARM_PGBYTES);
+               *cpu_l2_tte = (kvtophys((vm_offset_t)new_tte) & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
+       }
+#endif /* defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST) */
+#if XNU_MONITOR
+       for (vm_offset_t cur = (vm_offset_t)pmap_stacks_start; cur < (vm_offset_t)pmap_stacks_end; cur += ARM_PGBYTES) {
+               arm_vm_map(cpu_tte, cur, ARM_PTE_EMPTY);
+       }
+#endif
        pmap_bootstrap(dynamic_memory_begin);
 
        disable_preemption();
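
The CTRR test-page reservation above walks the L1 table to find the L2 slot that covers the reserved VA before installing a fresh table there. A standalone sketch of the same index arithmetic, using illustrative 16K-granule shift/mask values rather than the ones in proc_reg.h:

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative constants; the real values live in proc_reg.h. */
    #define ARM_TT_L1_SHIFT       36
    #define ARM_TT_L1_INDEX_MASK  0x0000007000000000ULL
    #define ARM_TT_L2_SHIFT       25
    #define ARM_TT_L2_INDEX_MASK  0x0000000ffe000000ULL

    int main(void)
    {
        uint64_t ctrr_test_page = 0xfffffff044000000ULL;  /* hypothetical VA */

        /* Same arithmetic as the hunk: pick the L1 slot, then the L2 slot
         * within the table that the L1 entry points at. */
        uint64_t l1_index = (ctrr_test_page & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT;
        uint64_t l2_index = (ctrr_test_page & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT;

        printf("L1 index %llu, L2 index %llu\n",
            (unsigned long long)l1_index, (unsigned long long)l2_index);
        return 0;
    }
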
@@ -1708,6 +1911,9 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
         */
        avail_start = (avail_start + PAGE_MASK) & ~PAGE_MASK;
 
+#if XNU_MONITOR
+       pmap_static_allocations_done();
+#endif
        first_avail = avail_start;
        patch_low_glo_static_region(args->topOfKernelData, avail_start - args->topOfKernelData);
        enable_preemption();
index 2360e698264a7eaedcdc6aa166d8c748b4be786f..1ab9c9f1d63d61af8b48f4e78be7d6feca758724 100644 (file)
@@ -90,6 +90,9 @@ extern void typhoon_prepare_for_wfi(void);
 extern void typhoon_return_from_wfi(void);
 #endif
 
+#if HAS_RETENTION_STATE
+extern void arm64_retention_wfi(void);
+#endif
 
 vm_address_t   start_cpu_paddr;
 
@@ -402,7 +405,11 @@ cpu_idle(void)
                typhoon_prepare_for_wfi();
 #endif
                __builtin_arm_dsb(DSB_SY);
+#if HAS_RETENTION_STATE
+               arm64_retention_wfi();
+#else
                __builtin_arm_wfi();
+#endif
 
 #if defined(APPLETYPHOON)
                // <rdar://problem/15827409> CPU1 Stuck in WFIWT Because of MMU Prefetch
@@ -646,6 +653,7 @@ cpu_data_init(cpu_data_t *cpu_data_ptr)
                cpu_data_ptr->coresight_base[i] = 0;
        }
 
+#if !XNU_MONITOR
        pmap_cpu_data_t * pmap_cpu_data_ptr = &cpu_data_ptr->cpu_pmap_cpu_data;
 
        pmap_cpu_data_ptr->cpu_nested_pmap = (struct pmap *) NULL;
@@ -654,6 +662,7 @@ cpu_data_init(cpu_data_t *cpu_data_ptr)
        for (i = 0; i < (sizeof(pmap_cpu_data_ptr->cpu_asid_high_bits) / sizeof(*pmap_cpu_data_ptr->cpu_asid_high_bits)); i++) {
                pmap_cpu_data_ptr->cpu_asid_high_bits[i] = 0;
        }
+#endif
        cpu_data_ptr->halt_status = CPU_NOT_HALTED;
 #if __ARM_KERNEL_PROTECT__
        cpu_data_ptr->cpu_exc_vectors = (vm_offset_t)&exc_vectors_table;
@@ -681,6 +690,20 @@ cpu_data_register(cpu_data_t *cpu_data_ptr)
        return KERN_SUCCESS;
 }
 
+#if defined(KERNEL_INTEGRITY_CTRR)
+
+lck_spin_t ctrr_cpu_start_lck;
+bool ctrr_cluster_locked[__ARM_CLUSTER_COUNT__];
+
+void
+init_ctrr_cpu_start_lock(void)
+{
+       lck_grp_t *ctrr_cpu_start_lock_grp = lck_grp_alloc_init("ctrr_cpu_start_lock", 0);
+       assert(ctrr_cpu_start_lock_grp);
+       lck_spin_init(&ctrr_cpu_start_lck, ctrr_cpu_start_lock_grp, NULL);
+}
+
+#endif
 
 kern_return_t
 cpu_start(int cpu)
@@ -697,7 +720,9 @@ cpu_start(int cpu)
 
                cpu_data_ptr->cpu_reset_handler = (vm_offset_t) start_cpu_paddr;
 
+#if !XNU_MONITOR
                cpu_data_ptr->cpu_pmap_cpu_data.cpu_nested_pmap = NULL;
+#endif
 
                if (cpu_data_ptr->cpu_processor->startup_thread != THREAD_NULL) {
                        first_thread = cpu_data_ptr->cpu_processor->startup_thread;
@@ -711,6 +736,22 @@ cpu_start(int cpu)
 
                flush_dcache((vm_offset_t)&CpuDataEntries[cpu], sizeof(cpu_data_entry_t), FALSE);
                flush_dcache((vm_offset_t)cpu_data_ptr, sizeof(cpu_data_t), FALSE);
+#if defined(KERNEL_INTEGRITY_CTRR)
+               /* The first time a CPU starts, if it is not the cluster master and the cluster
+                * is not already locked, block until the cluster becomes locked. */
+               if (cpu_data_ptr->cpu_processor->active_thread == THREAD_NULL
+                   && !cpu_data_ptr->cluster_master) {
+                       lck_spin_lock(&ctrr_cpu_start_lck);
+                       if (ctrr_cluster_locked[cpu_data_ptr->cpu_cluster_id] == 0) {
+                               assert_wait(&ctrr_cluster_locked[cpu_data_ptr->cpu_cluster_id], THREAD_UNINT);
+                               lck_spin_unlock(&ctrr_cpu_start_lck);
+                               thread_block(THREAD_CONTINUE_NULL);
+                               assert(ctrr_cluster_locked[cpu_data_ptr->cpu_cluster_id] == 1);
+                       } else {
+                               lck_spin_unlock(&ctrr_cpu_start_lck);
+                       }
+               }
+#endif
                (void) PE_cpu_start(cpu_data_ptr->cpu_id, (vm_offset_t)NULL, (vm_offset_t)NULL);
        }
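
The KERNEL_INTEGRITY_CTRR block above makes a freshly started secondary CPU block until its cluster has been CTRR-locked; the matching thread_wakeup is added to rorgn_lockdown later in this diff. A simplified pthread model of that handshake (the kernel uses assert_wait/thread_block/thread_wakeup on the ctrr_cluster_locked slot, not a condition variable):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define CLUSTER_COUNT 2

    static pthread_mutex_t lck = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  cv  = PTHREAD_COND_INITIALIZER;
    static bool ctrr_cluster_locked[CLUSTER_COUNT];

    static void *secondary_cpu_start(void *arg)
    {
        int cluster_id = *(int *)arg;

        pthread_mutex_lock(&lck);
        while (!ctrr_cluster_locked[cluster_id]) {
            pthread_cond_wait(&cv, &lck);        /* assert_wait + thread_block */
        }
        pthread_mutex_unlock(&lck);
        printf("cluster %d locked; CPU may continue\n", cluster_id);
        return NULL;
    }

    static void cluster_master_lockdown(int cluster_id)
    {
        pthread_mutex_lock(&lck);
        ctrr_cluster_locked[cluster_id] = true;  /* done in rorgn_lockdown() */
        pthread_cond_broadcast(&cv);             /* thread_wakeup() */
        pthread_mutex_unlock(&lck);
    }

    int main(void)
    {
        pthread_t t;
        int cluster_id = 0;

        pthread_create(&t, NULL, secondary_cpu_start, &cluster_id);
        cluster_master_lockdown(0);
        pthread_join(t, NULL);
        return 0;
    }
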
 
index 41bfa1f68aca851f5663225f606c799f258620d9..60aa8b83fd6ae1b62e4482e8c7a3c86dcc4949e5 100644 (file)
 #include <pexpert/arm64/board_config.h>
 #endif
 
+#if XNU_MONITOR
+/* Exit path defines; for controlling PPL -> kernel transitions. */
+#define PPL_EXIT_DISPATCH   0 /* This is a clean exit after a PPL request. */
+#define PPL_EXIT_PANIC_CALL 1 /* The PPL has called panic. */
+#define PPL_EXIT_BAD_CALL   2 /* The PPL request failed. */
+#define PPL_EXIT_EXCEPTION  3 /* The PPL took an exception. */
+
+#define KERNEL_MODE_ELR      ELR_GL11
+#define KERNEL_MODE_FAR      FAR_GL11
+#define KERNEL_MODE_ESR      ESR_GL11
+#define KERNEL_MODE_SPSR     SPSR_GL11
+#define KERNEL_MODE_ASPSR    ASPSR_GL11
+#define KERNEL_MODE_VBAR     VBAR_GL11
+#define KERNEL_MODE_TPIDR    TPIDR_GL11
+
+#define GUARDED_MODE_ELR     ELR_EL1
+#define GUARDED_MODE_FAR     FAR_EL1
+#define GUARDED_MODE_ESR     ESR_EL1
+#define GUARDED_MODE_SPSR    SPSR_EL1
+#define GUARDED_MODE_ASPSR   ASPSR_EL1
+#define GUARDED_MODE_VBAR    VBAR_EL1
+#define GUARDED_MODE_TPIDR   TPIDR_EL1
+
+/*
+ * GET_PMAP_CPU_DATA
+ *
+ * Retrieves the PPL per-CPU data for the current CPU.
+ *   arg0 - Address of the PPL per-CPU data is returned through this
+ *   arg1 - Scratch register
+ *   arg2 - Scratch register
+ *
+ */
+.macro GET_PMAP_CPU_DATA
+/* Get the CPU ID. */
+mrs            $0, MPIDR_EL1
+#ifdef CPU_CLUSTER_OFFSETS
+ubfx           $1, $0, MPIDR_AFF1_SHIFT, MPIDR_AFF1_WIDTH
+cmp            $1, __ARM_CLUSTER_COUNT__
+b.hs   .
+adrp   $2, EXT(pmap_cluster_offsets)@page
+add            $2, $2, EXT(pmap_cluster_offsets)@pageoff
+ldr            $1, [$2, $1, lsl #3]
+and            $0, $0, MPIDR_AFF0_MASK
+add            $0, $0, $1
+#else
+and            $0, $0, MPIDR_AFF0_MASK
+#endif
+
+/* Get the PPL CPU data array. */
+adrp   $1, EXT(pmap_cpu_data_array)@page
+add            $1, $1, EXT(pmap_cpu_data_array)@pageoff
+
+/*
+ * Sanity check the CPU ID (this is not a panic because this pertains to
+ * the hardware configuration; this should only fail if our
+ * understanding of the hardware is incorrect).
+ */
+cmp            $0, MAX_CPUS
+b.hs   .
+
+mov            $2, PMAP_CPU_DATA_ARRAY_ENTRY_SIZE
+/* Get the PPL per-CPU data. */
+madd   $0, $0, $2, $1
+.endmacro
+#endif /* XNU_MONITOR */
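
GET_PMAP_CPU_DATA turns the MPIDR affinity fields into an index into pmap_cpu_data_array, optionally biased by a per-cluster offset table when CPU_CLUSTER_OFFSETS is defined. A C rendering of the same computation; the sizes, masks and offset table below are placeholders:

    #include <stdint.h>
    #include <stdio.h>

    #define MAX_CPUS         6
    #define MPIDR_AFF0_MASK  0xffULL
    #define MPIDR_AFF1_SHIFT 8
    #define MPIDR_AFF1_MASK  (0xffULL << MPIDR_AFF1_SHIFT)

    struct pmap_cpu_data_entry { uint64_t pad[8]; };  /* only the size matters here */
    static struct pmap_cpu_data_entry pmap_cpu_data_array[MAX_CPUS];

    /* Hypothetical per-cluster offsets into the array. */
    static const uint64_t pmap_cluster_offsets[] = { 0, 2 };

    static struct pmap_cpu_data_entry *
    get_pmap_cpu_data(uint64_t mpidr)
    {
        uint64_t cluster = (mpidr & MPIDR_AFF1_MASK) >> MPIDR_AFF1_SHIFT;
        uint64_t cpu = (mpidr & MPIDR_AFF0_MASK) + pmap_cluster_offsets[cluster];
        if (cpu >= MAX_CPUS) {
            return NULL;      /* the macro spins (b.hs .) on a bad CPU ID */
        }
        return &pmap_cpu_data_array[cpu];  /* madd: base + cpu * entry size */
    }

    int main(void)
    {
        printf("%p\n", (void *)get_pmap_cpu_data(0x101)); /* cluster 1, core 1 */
        return 0;
    }
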
 
 /*
  * INIT_SAVED_STATE_FLAVORS
index c47c6ab1a82bffc166ceae7d1b350864701fa1dc..511460bdcd7e52d84b2c97aae543a8ceaca5c880 100644 (file)
@@ -257,6 +257,14 @@ main(int     argc,
 
        DECLARE("SR_RESTORE_TCR_EL1", offsetof(struct sysreg_restore, tcr_el1));
 
+#if XNU_MONITOR
+       DECLARE("PMAP_CPU_DATA_PPL_STATE", offsetof(struct pmap_cpu_data, ppl_state));
+       DECLARE("PMAP_CPU_DATA_ARRAY_ENTRY_SIZE", sizeof(struct pmap_cpu_data_array_entry));
+       DECLARE("PMAP_CPU_DATA_PPL_STACK", offsetof(struct pmap_cpu_data, ppl_stack));
+       DECLARE("PMAP_CPU_DATA_KERN_SAVED_SP", offsetof(struct pmap_cpu_data, ppl_kern_saved_sp));
+       DECLARE("PMAP_CPU_DATA_SAVE_AREA", offsetof(struct pmap_cpu_data, save_area));
+       DECLARE("PMAP_COUNT", PMAP_COUNT);
+#endif /* XNU_MONITOR */
 
 
 #if defined(HAS_APPLE_PAC)
index f9162a819ba186cd98c92df289e033e3749069f7..660a59f1b9378d8ae4514eb22e85461202148d1b 100644 (file)
 #include <arm/pmap.h>
 #endif
 
+#if XNU_MONITOR
+/*
+ * CHECK_EXCEPTION_RETURN_DISPATCH_PPL
+ *
+ * Checks if an exception was taken from the PPL, and if so, trampolines back
+ * into the PPL.
+ *   x26 - 0 if the exception was taken while in the kernel, 1 if the
+ *         exception was taken while in the PPL.
+ */
+.macro CHECK_EXCEPTION_RETURN_DISPATCH_PPL
+       cmp             x26, xzr
+       b.eq            1f
+
+       /* Return to the PPL. */
+       mov             x15, #0
+       mov             w10, #PPL_STATE_EXCEPTION
+#if __APRR_SUPPORTED__
+       b               Ldisable_aif_and_enter_ppl
+#else
+#error "XPRR configuration error"
+#endif /* __APRR_SUPPORTED__ */
+1:
+.endmacro
+
+#if __APRR_SUPPORTED__
+/*
+ * EL1_SP0_VECTOR_PPL_CHECK
+ *
+ * Check to see if the exception was taken by the kernel or the PPL.  Falls
+ * through if kernel, hands off to the given label if PPL.  Expects to run on
+ * SP1.
+ *   arg0 - Label to go to if this was a PPL exception.
+ */
+.macro EL1_SP0_VECTOR_PPL_CHECK
+       sub             sp, sp, ARM_CONTEXT_SIZE
+       stp             x0, x1, [sp, SS64_X0]
+       mrs             x0, APRR_EL1
+       MOV64           x1, APRR_EL1_DEFAULT
+       cmp             x0, x1
+       b.ne            $0
+       ldp             x0, x1, [sp, SS64_X0]
+       add             sp, sp, ARM_CONTEXT_SIZE
+.endmacro
+
+#define STAY_ON_SP1 0
+#define SWITCH_TO_SP0 1
+
+#define INVOKE_PREFLIGHT 0
+#define NO_INVOKE_PREFLIGHT 1
+
+/*
+ * EL1_SP0_VECTOR_NOT_IN_KERNEL_MODE
+ *
+ * Verify whether an exception came from the PPL or from the kernel.  If it came
+ * from the PPL, save off the PPL state and transition out of the PPL.
+ *   arg0 - Label to go to if this was a kernel exception
+ *   arg1 - Label to go to (after leaving the PPL) if this was a PPL exception
+ *   arg2 - Indicates if this should switch back to SP0
+ *   x0   - xPRR_EL1_BR1 read by EL1_SP0_VECTOR_PPL_CHECK
+ */
+.macro EL1_SP0_VECTOR_NOT_IN_KERNEL_MODE
+       /* Spill some more registers. */
+       stp             x2, x3, [sp, SS64_X2]
+
+       /*
+        * Check if the PPL is locked down; if not, we can treat this as a
+        * kernel exception.
+        */
+       adrp    x1, EXT(pmap_ppl_locked_down)@page
+       ldr             w1, [x1, #EXT(pmap_ppl_locked_down)@pageoff]
+       cbz             x1, 2f
+
+       /* Ensure that APRR_EL1 is actually in PPL mode. */
+       MOV64           x1, APRR_EL1_PPL
+       cmp             x0, x1
+       b.ne            .
+
+       /*
+        * Check if the CPU is in the PPL; if not we can treat this as a
+        * kernel exception.
+        */
+       GET_PMAP_CPU_DATA       x3, x1, x2
+       ldr             w1, [x3, PMAP_CPU_DATA_PPL_STATE]
+       cmp             x1, #PPL_STATE_KERNEL
+       b.eq            2f
+
+       /* Ensure that the CPU is in the expected PPL state. */
+       cmp             x1, #PPL_STATE_DISPATCH
+       b.ne            .
+
+       /* Mark the CPU as dealing with an exception. */
+       mov             x1, #PPL_STATE_EXCEPTION
+       str             w1, [x3, PMAP_CPU_DATA_PPL_STATE]
+
+       /* Load the bounds of the PPL trampoline. */
+       adrp    x0, EXT(ppl_no_exception_start)@page
+       add             x0, x0, EXT(ppl_no_exception_start)@pageoff
+       adrp    x1, EXT(ppl_no_exception_end)@page
+       add             x1, x1, EXT(ppl_no_exception_end)@pageoff
+
+       /*
+        * Ensure that the exception did not occur in the trampoline.  If it
+        * did, we are either being attacked or our state machine is
+        * horrifically broken.
+        */
+       mrs             x2, ELR_EL1
+       cmp             x2, x0
+       b.lo            1f
+       cmp             x2, x1
+       b.hi            1f
+
+       /* We might be under attack; spin. */
+       b               .
+
+1:
+       /* Get the PPL save area. */
+       mov             x1, x3
+       ldr             x0, [x3, PMAP_CPU_DATA_SAVE_AREA]
+
+       /* Save our x0, x1 state. */
+       ldp             x2, x3, [sp, SS64_X0]
+       stp             x2, x3, [x0, SS64_X0]
+
+       /* Restore SP1 to its original state. */
+       mov             x3, sp
+       add             sp, sp, ARM_CONTEXT_SIZE
+
+       .if $2 == SWITCH_TO_SP0
+       /* Switch back to SP0. */
+       msr             SPSel, #0
+       mov             x2, sp
+       .else
+       /* Load the SP0 value. */
+       mrs             x2, SP_EL0
+       .endif
+
+       /* Save off the stack pointer. */
+       str             x2, [x0, SS64_SP]
+
+       INIT_SAVED_STATE_FLAVORS x0, w1, w2
+
+       /* Save the context that was interrupted. */
+       ldp             x2, x3, [x3, SS64_X2]
+       stp             fp, lr, [x0, SS64_FP]
+       SPILL_REGISTERS KERNEL_MODE
+
+       /*
+        * Stash the function we wish to be invoked to deal with the exception;
+        * usually this is some preflight function for the fleh_* handler.
+        */
+       adrp            x25, $1@page
+       add             x25, x25, $1@pageoff
+
+       /*
+        * Indicate that this is a PPL exception, and that we should return to
+        * the PPL.
+        */
+       mov             x26, #1
+
+       /* Transition back to kernel mode. */
+       mov             x15, #PPL_EXIT_EXCEPTION
+       b               ppl_return_to_kernel_mode
+2:
+       /* Restore SP1 state. */
+       ldp             x2, x3, [sp, SS64_X2]
+       ldp             x0, x1, [sp, SS64_X0]
+       add             sp, sp, ARM_CONTEXT_SIZE
+
+       /* Go to the specified label (usually the original exception vector). */
+       b               $0
+.endmacro
+#endif /* __APRR_SUPPORTED__ */
+
+#endif /* XNU_MONITOR */
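
The vector macros above gate on pmap_ppl_locked_down and the per-CPU ppl_state field. A compact C model of the classification they perform (the state names are from this diff; the numeric values and the helper itself are illustrative):

    #include <stdio.h>

    enum ppl_state {
        PPL_STATE_KERNEL    = 0,   /* CPU is running ordinary kernel code   */
        PPL_STATE_DISPATCH  = 1,   /* CPU is executing a PPL request        */
        PPL_STATE_EXCEPTION = 2,   /* PPL took an exception                 */
        PPL_STATE_PANIC     = 3,   /* PPL called panic                      */
    };

    /* Returns 1 if an exception taken with the CPU in `cur` should be handled
     * as a PPL exception (save PPL state, exit to the kernel), 0 if it can go
     * straight to the normal kernel vectors, and -1 if the state machine is
     * broken (the assembly spins with "b ." in that case). */
    static int classify_exception(enum ppl_state cur, int ppl_locked_down)
    {
        if (!ppl_locked_down || cur == PPL_STATE_KERNEL) {
            return 0;
        }
        if (cur == PPL_STATE_DISPATCH) {
            return 1;          /* mark CPU as PPL_STATE_EXCEPTION, leave the PPL */
        }
        return -1;             /* re-entry from EXCEPTION/PANIC: spin */
    }

    int main(void)
    {
        printf("%d\n", classify_exception(PPL_STATE_DISPATCH, 1));  /* 1 */
        printf("%d\n", classify_exception(PPL_STATE_KERNEL, 1));    /* 0 */
        return 0;
    }
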
 
 #define        CBF_DISABLE     0
 #define        CBF_ENABLE      1
@@ -239,6 +413,14 @@ Lel0_serror_vector_64:
 .endmacro
 
 el1_sp0_synchronous_vector_long:
+#if XNU_MONITOR && __APRR_SUPPORTED__
+       /*
+        * We do not have enough space for new instructions in this vector, so
+        * jump to outside code to check if this exception was taken in the PPL.
+        */
+       b               el1_sp0_synchronous_vector_ppl_check
+Lel1_sp0_synchronous_vector_kernel:
+#endif
        sub             sp, sp, ARM_CONTEXT_SIZE                        // Make space on the exception stack
        stp             x0, x1, [sp, SS64_X0]                           // Save x0, x1 to the stack
        mrs             x1, ESR_EL1                                                     // Get the exception syndrome
@@ -261,6 +443,10 @@ Lkernel_stack_valid:
        b               fleh_dispatch64
 
 el1_sp0_irq_vector_long:
+#if XNU_MONITOR && __APRR_SUPPORTED__
+       EL1_SP0_VECTOR_PPL_CHECK el1_sp0_irq_vector_not_in_kernel_mode
+Lel1_sp0_irq_vector_kernel:
+#endif
        EL1_SP0_VECTOR
        mrs             x1, TPIDR_EL1
        ldr             x1, [x1, ACT_CPUDATAP]
@@ -272,6 +458,10 @@ el1_sp0_irq_vector_long:
 
 el1_sp0_fiq_vector_long:
        // ARM64_TODO write optimized decrementer
+#if XNU_MONITOR && __APRR_SUPPORTED__
+       EL1_SP0_VECTOR_PPL_CHECK el1_sp0_fiq_vector_not_in_kernel_mode
+Lel1_sp0_fiq_vector_kernel:
+#endif
        EL1_SP0_VECTOR
        mrs             x1, TPIDR_EL1
        ldr             x1, [x1, ACT_CPUDATAP]
@@ -282,6 +472,10 @@ el1_sp0_fiq_vector_long:
        b               fleh_dispatch64
 
 el1_sp0_serror_vector_long:
+#if XNU_MONITOR && __APRR_SUPPORTED__
+       EL1_SP0_VECTOR_PPL_CHECK el1_sp0_serror_vector_not_in_kernel_mode
+Lel1_sp0_serror_vector_kernel:
+#endif
        EL1_SP0_VECTOR
        adrp    x1, EXT(fleh_serror)@page                               // Load address for fleh
        add             x1, x1, EXT(fleh_serror)@pageoff
@@ -417,6 +611,13 @@ el0_serror_vector_64_long:
        add             x1, x1, EXT(fleh_serror)@pageoff
        b               fleh_dispatch64
 
+#if XNU_MONITOR && __APRR_SUPPORTED__
+el1_sp0_synchronous_vector_ppl_check:
+       EL1_SP0_VECTOR_PPL_CHECK el1_sp0_synchronous_vector_not_in_kernel_mode
+
+       /* Jump back to the primary exception vector if we fell through. */
+       b               Lel1_sp0_synchronous_vector_kernel
+#endif
 
 /*
  * check_exception_stack
@@ -525,7 +726,7 @@ check_ktrr_sctlr_trap:
        b.ne    Lel1_sp1_synchronous_vector_continue
        msr             ELR_EL1, lr                                     // Return to caller
        eret
-#endif /* defined(KERNEL_INTEGRITY_KTRR)*/
+#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
 
 /* 64-bit first level exception handler dispatcher.
  * Completes register context saving and branches to FLEH.
@@ -571,7 +772,9 @@ fleh_dispatch64:
        mov             x23, #0
        mov             x24, #0
        mov             x25, #0
+#if !XNU_MONITOR
        mov             x26, #0
+#endif
        mov             x27, #0
        mov             x28, #0
        /* fp/lr already cleared by EL0_64_VECTOR */
@@ -580,6 +783,10 @@ fleh_dispatch64:
        mov             x21, x0                                                         // Copy arm_context_t pointer to x21
        mov             x22, x1                                                         // Copy handler routine to x22
 
+#if XNU_MONITOR
+       /* Zero x26 to indicate that this should not return to the PPL. */
+       mov             x26, #0
+#endif
 
 #if    !CONFIG_SKIP_PRECISE_USER_KERNEL_TIME
        tst             x23, PSR64_MODE_EL_MASK                         // If any EL MODE bits are set, we're coming from
@@ -620,6 +827,9 @@ Lvalid_link_register:
        bl              EXT(sleh_synchronous)
        POP_FRAME
 
+#if XNU_MONITOR
+       CHECK_EXCEPTION_RETURN_DISPATCH_PPL
+#endif
 
        b               exception_return_dispatch
 
@@ -691,6 +901,9 @@ LEXT(fleh_irq)
        POP_FRAME
        END_INTERRUPT_HANDLER
 
+#if XNU_MONITOR
+       CHECK_EXCEPTION_RETURN_DISPATCH_PPL
+#endif
 
        b               exception_return_dispatch
 
@@ -710,6 +923,9 @@ LEXT(fleh_fiq)
        POP_FRAME
        END_INTERRUPT_HANDLER
 
+#if XNU_MONITOR
+       CHECK_EXCEPTION_RETURN_DISPATCH_PPL
+#endif
 
        b               exception_return_dispatch
 
@@ -724,6 +940,9 @@ LEXT(fleh_serror)
        bl              EXT(sleh_serror)
        POP_FRAME
 
+#if XNU_MONITOR
+       CHECK_EXCEPTION_RETURN_DISPATCH_PPL
+#endif
 
        b               exception_return_dispatch
 
@@ -1048,6 +1267,18 @@ user_take_ast:
 
 user_set_debug_state_and_return:
 
+#if defined(APPLELIGHTNING)
+/* rdar://53177964 ([Cebu Errata SW WA][v8Debug] MDR NEX L3 clock turns OFF during restoreCheckpoint due to SWStep getting masked) */
+
+       ARM64_IS_PCORE x12                                  // if we're not a pCORE, also do nothing
+       cbz             x12, 1f
+
+       mrs             x12, ARM64_REG_HID1                         // if any debug session ever existed, set forceNexL3ClkOn
+       orr             x12, x12, ARM64_REG_HID1_forceNexL3ClkOn
+       msr             ARM64_REG_HID1, x12
+1:
+
+#endif
 
        ldr             x4, [x3, ACT_CPUDATAP]                          // Get current CPU data pointer
        isb                                                                                     // Synchronize context
@@ -1111,11 +1342,577 @@ L_preempt_count_notzero_str:
 LEXT(ExceptionVectorsEnd)
 #endif /* __ARM_KERNEL_PROTECT__ */
 
+#if XNU_MONITOR
+#if __APRR_SUPPORTED__
+       .text
+       .align 2
+el1_sp0_synchronous_vector_not_in_kernel_mode:
+       EL1_SP0_VECTOR_NOT_IN_KERNEL_MODE Lel1_sp0_synchronous_vector_kernel, fleh_synchronous_from_ppl, STAY_ON_SP1
+
+       .text
+       .align 2
+el1_sp0_fiq_vector_not_in_kernel_mode:
+       EL1_SP0_VECTOR_NOT_IN_KERNEL_MODE Lel1_sp0_fiq_vector_kernel, fleh_fiq_from_ppl, SWITCH_TO_SP0
+
+       .text
+       .align 2
+el1_sp0_irq_vector_not_in_kernel_mode:
+       EL1_SP0_VECTOR_NOT_IN_KERNEL_MODE Lel1_sp0_irq_vector_kernel, fleh_irq_from_ppl, SWITCH_TO_SP0
+
+       .text
+       .align 2
+el1_sp0_serror_vector_not_in_kernel_mode:
+       EL1_SP0_VECTOR_NOT_IN_KERNEL_MODE Lel1_sp0_serror_vector_kernel, fleh_serror_from_ppl, SWITCH_TO_SP0
+#endif /* __APRR_SUPPORTED__ */
+
+/*
+ * Functions to preflight the fleh handlers when the PPL has taken an exception;
+ * mostly concerned with setting up state for the normal fleh code.
+ */
+fleh_synchronous_from_ppl:
+       /* Save x0. */
+       mov             x15, x0
+
+       /* Grab the ESR. */
+       mrs             x1, ESR_EL1                                                     // Get the exception syndrome
+
+       /* If the stack pointer is corrupt, it will manifest either as a data abort
+        * (syndrome 0x25) or a misaligned pointer (syndrome 0x26). We can check
+        * these quickly by testing bit 5 of the exception class.
+        */
+       tbz             x1, #(5 + ESR_EC_SHIFT), Lvalid_ppl_stack
+       mrs             x0, SP_EL0                                                      // Get SP_EL0
+
+       /* Perform high level checks for stack corruption. */
+       and             x1, x1, #ESR_EC_MASK                            // Mask the exception class
+       mov             x2, #(ESR_EC_SP_ALIGN << ESR_EC_SHIFT)
+       cmp             x1, x2                                                          // If we have a stack alignment exception
+       b.eq    Lcorrupt_ppl_stack                                              // ...the stack is definitely corrupted
+       mov             x2, #(ESR_EC_DABORT_EL1 << ESR_EC_SHIFT)
+       cmp             x1, x2                                                          // If we have a data abort, we need to
+       b.ne    Lvalid_ppl_stack                                                // ...validate the stack pointer
+
+Ltest_pstack:
+       /* Bounds check the PPL stack. */
+       adrp    x10, EXT(pmap_stacks_start)@page
+       ldr             x10, [x10, #EXT(pmap_stacks_start)@pageoff]
+       adrp    x11, EXT(pmap_stacks_end)@page
+       ldr             x11, [x11, #EXT(pmap_stacks_end)@pageoff]
+       cmp             x0, x10
+       b.lo    Lcorrupt_ppl_stack
+       cmp             x0, x11
+       b.hi    Lcorrupt_ppl_stack
+
+Lvalid_ppl_stack:
+       /* Restore x0. */
+       mov             x0, x15
+
+       /* Switch back to the kernel stack. */
+       msr             SPSel, #0
+       GET_PMAP_CPU_DATA x5, x6, x7
+       ldr             x6, [x5, PMAP_CPU_DATA_KERN_SAVED_SP]
+       mov             sp, x6
+
+       /* Hand off to the synch handler. */
+       b               EXT(fleh_synchronous)
+
+Lcorrupt_ppl_stack:
+       /* Restore x0. */
+       mov             x0, x15
+
+       /* Hand off to the invalid stack handler. */
+       b               fleh_invalid_stack
+
+fleh_fiq_from_ppl:
+       mrs             x1, TPIDR_EL1
+       ldr             x1, [x1, ACT_CPUDATAP]
+       ldr             x1, [x1, CPU_ISTACKPTR]
+       mov             sp, x1
+       b               EXT(fleh_fiq)
+
+fleh_irq_from_ppl:
+       mrs             x1, TPIDR_EL1
+       ldr             x1, [x1, ACT_CPUDATAP]
+       ldr             x1, [x1, CPU_ISTACKPTR]
+       mov             sp, x1
+       b               EXT(fleh_irq)
+
+fleh_serror_from_ppl:
+       GET_PMAP_CPU_DATA x5, x6, x7
+       ldr             x6, [x5, PMAP_CPU_DATA_KERN_SAVED_SP]
+       mov             sp, x6
+       b               EXT(fleh_serror)
+
+/*
+ * REENABLE_DAIF
+ *
+ * Restores the DAIF bits to their original state (well, the AIF bits at least).
+ *   arg0 - DAIF bits (read from the DAIF interface) to restore
+ */
+.macro REENABLE_DAIF
+       /* AIF enable. */
+       tst             $0, #(DAIF_IRQF | DAIF_FIQF | DAIF_ASYNCF)
+       b.eq            3f
+
+       /* IF enable. */
+       tst             $0, #(DAIF_IRQF | DAIF_FIQF)
+       b.eq            2f
+
+       /* A enable. */
+       tst             $0, #(DAIF_ASYNCF)
+       b.eq            1f
+
+       /* Enable nothing. */
+       b               4f
+
+       /* A enable. */
+1:
+       msr             DAIFClr, #(DAIFSC_ASYNCF)
+       b               4f
+
+       /* IF enable. */
+2:
+       msr             DAIFClr, #(DAIFSC_IRQF | DAIFSC_FIQF)
+       b               4f
+
+       /* AIF enable. */
+3:
+       msr             DAIFClr, #(DAIFSC_IRQF | DAIFSC_FIQF | DAIFSC_ASYNCF)
+
+       /* Done! */
+4:
+.endmacro
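
REENABLE_DAIF re-enables only the exception classes that were unmasked when the PPL was entered, using one of three possible DAIFClr writes. The same decision ladder in C; the bit positions are placeholders for the proc_reg.h definitions:

    #include <stdint.h>
    #include <stdio.h>

    #define DAIF_FIQF   (1u << 6)
    #define DAIF_IRQF   (1u << 7)
    #define DAIF_ASYNCF (1u << 8)

    /* Returns the set of exception kinds to unmask again (i.e. the DAIFClr
     * argument the macro would use), given the DAIF bits saved at PPL entry. */
    static uint32_t reenable_daif(uint32_t saved_daif)
    {
        if ((saved_daif & (DAIF_IRQF | DAIF_FIQF | DAIF_ASYNCF)) == 0) {
            return DAIF_IRQF | DAIF_FIQF | DAIF_ASYNCF;  /* everything was on */
        }
        if ((saved_daif & (DAIF_IRQF | DAIF_FIQF)) == 0) {
            return DAIF_IRQF | DAIF_FIQF;                /* only async was masked */
        }
        if ((saved_daif & DAIF_ASYNCF) == 0) {
            return DAIF_ASYNCF;                          /* only IRQ/FIQ were masked */
        }
        return 0;                                        /* leave everything masked */
    }

    int main(void)
    {
        printf("0x%x\n", reenable_daif(0));              /* unmask A, I and F */
        printf("0x%x\n", reenable_daif(DAIF_ASYNCF));    /* unmask I and F only */
        return 0;
    }
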
+
+
+#if XNU_MONITOR && __APRR_SUPPORTED__
+/*
+ * aprr_ppl_enter
+ *
+ * Invokes the PPL
+ *   x15 - The index of the requested PPL function.
+ */
+       .text
+       .align 2
+       .globl EXT(aprr_ppl_enter)
+LEXT(aprr_ppl_enter)
+       /* Push a frame. */
+       ARM64_STACK_PROLOG
+       stp             x20, x21, [sp, #-0x20]!
+       stp             x29, x30, [sp, #0x10]
+       add             x29, sp, #0x10
+
+       /* Increase the preemption count. */
+       mrs             x10, TPIDR_EL1
+       ldr             w12, [x10, ACT_PREEMPT_CNT]
+       add             w12, w12, #1
+       str             w12, [x10, ACT_PREEMPT_CNT]
+
+       /* Is the PPL currently locked down? */
+       adrp            x13, EXT(pmap_ppl_locked_down)@page
+       add             x13, x13, EXT(pmap_ppl_locked_down)@pageoff
+       ldr             w14, [x13]
+       cmp             w14, wzr
+
+       /* If not, just perform the call in the current context. */
+       b.eq            EXT(ppl_bootstrap_dispatch)
+
+       mov             w10, #PPL_STATE_KERNEL
+       b               Ldisable_aif_and_enter_ppl
+
+       /* We align this to land the next few instructions on their own page. */
+       .section __PPLTRAMP,__text,regular,pure_instructions
+       .align 14
+       .space (16*1024)-(4*8) // 8 insns
+
+       /*
+        * This label is used by exception handlers that are trying to return
+        * to the PPL.
+        */
+Ldisable_aif_and_enter_ppl:
+       /* We must trampoline to the PPL context; disable AIF. */
+       mrs             x20, DAIF
+       msr             DAIFSet, #(DAIFSC_ASYNCF | DAIFSC_IRQF | DAIFSC_FIQF)
+
+       .globl EXT(ppl_no_exception_start)
+LEXT(ppl_no_exception_start)
+       /* Switch APRR_EL1 to PPL mode. */
+       MOV64   x14, APRR_EL1_PPL
+       msr             APRR_EL1, x14
+
+       /* This ISB should be the last instruction on a page. */
+       // TODO: can we static assert this?
+       isb
+#endif /* XNU_MONITOR && __APRR_SUPPORTED__ */
+
+
+       // x15: ppl call number
+       // w10: ppl_state
+       // x20: gxf_enter caller's DAIF
+       .globl EXT(ppl_trampoline_start)
+LEXT(ppl_trampoline_start)
+
+#if __APRR_SUPPORTED__
+       /* Squash AIF AGAIN, because someone may have attacked us. */
+       msr             DAIFSet, #(DAIFSC_ASYNCF | DAIFSC_IRQF | DAIFSC_FIQF)
+#endif /* __APRR_SUPPORTED__ */
+
+#if __APRR_SUPPORTED__
+       /* Verify the state of APRR_EL1. */
+       MOV64   x14, APRR_EL1_PPL
+       mrs             x21, APRR_EL1
+#else /* __APRR_SUPPORTED__ */
+#error "XPRR configuration error"
+#endif /* __APRR_SUPPORTED__ */
+       cmp             x14, x21
+       b.ne    Lppl_fail_dispatch
+
+       /* Verify the request ID. */
+       cmp             x15, PMAP_COUNT
+       b.hs    Lppl_fail_dispatch
+
+       /* Get the PPL CPU data structure. */
+       GET_PMAP_CPU_DATA       x12, x13, x14
+
+       /* Mark this CPU as being in the PPL. */
+       ldr             w9, [x12, PMAP_CPU_DATA_PPL_STATE]
+
+       cmp             w9, #PPL_STATE_KERNEL
+       b.eq            Lppl_mark_cpu_as_dispatching
+
+       /* Check to see if we are trying to trap from within the PPL. */
+       cmp             w9, #PPL_STATE_DISPATCH
+       b.eq            Lppl_fail_dispatch_ppl
+
+
+       /* Ensure that we are returning from an exception. */
+       cmp             w9, #PPL_STATE_EXCEPTION
+       b.ne            Lppl_fail_dispatch
+
+       // w10 holds the expected PPL state; on this path it is set to PPL_STATE_EXCEPTION
+       // by CHECK_EXCEPTION_RETURN_DISPATCH_PPL before it re-enters the PPL
+       cmp             w10, #PPL_STATE_EXCEPTION
+       b.ne            Lppl_fail_dispatch
+
+       /* This is an exception return; set the CPU to the dispatching state. */
+       mov             w9, #PPL_STATE_DISPATCH
+       str             w9, [x12, PMAP_CPU_DATA_PPL_STATE]
+
+       /* Find the save area, and return to the saved PPL context. */
+       ldr             x0, [x12, PMAP_CPU_DATA_SAVE_AREA]
+       mov             sp, x0
+#if __APRR_SUPPORTED__
+       b               Lexception_return_restore_registers
+#else
+       b               EXT(return_to_ppl)
+#endif /* __APRR_SUPPORTED__ */
+
+Lppl_mark_cpu_as_dispatching:
+       cmp             w10, #PPL_STATE_KERNEL
+       b.ne            Lppl_fail_dispatch
+
+       /* Mark the CPU as dispatching. */
+       mov             w13, #PPL_STATE_DISPATCH
+       str             w13, [x12, PMAP_CPU_DATA_PPL_STATE]
+
+       /* Get the handler for the request */
+       adrp    x9, EXT(ppl_handler_table)@page
+       add             x9, x9, EXT(ppl_handler_table)@pageoff
+       ldr             x10, [x9, x15, lsl #3]
+
+       /* Switch to the regular PPL stack. */
+       // TODO: switch to PPL_STACK earlier in gxf_ppl_entry_handler
+       ldr             x9, [x12, PMAP_CPU_DATA_PPL_STACK]
+
+       // SP0 is thread stack here
+       mov             x21, sp
+       // SP0 is now PPL stack
+       mov             sp, x9
+
+
+       /* Save the old stack pointer off in case we need it. */
+       str             x21, [x12, PMAP_CPU_DATA_KERN_SAVED_SP]
+
+       /* Branch to the code that will invoke the PPL request. */
+       b               EXT(ppl_dispatch)
+
+Lppl_fail_dispatch_ppl:
+       /* Switch back to the kernel stack. */
+       ldr             x10, [x12, PMAP_CPU_DATA_KERN_SAVED_SP]
+       mov             sp, x10
+
+Lppl_fail_dispatch:
+       /* Indicate that we failed. */
+       mov             x15, #PPL_EXIT_BAD_CALL
+
+       /* Move the DAIF bits into the expected register. */
+       mov             x10, x20
+
+       /* Return to kernel mode. */
+       b               ppl_return_to_kernel_mode
+
+Lppl_dispatch_exit:
+       /* Indicate that we are cleanly exiting the PPL. */
+       mov             x15, #PPL_EXIT_DISPATCH
+
+       /* Switch back to the original (kernel thread) stack. */
+       mov             sp, x21
+
+       /* Move the saved DAIF bits. */
+       mov             x10, x20
+
+       /* Clear the old stack pointer. */
+       str             xzr, [x12, PMAP_CPU_DATA_KERN_SAVED_SP]
+
+       /*
+        * Mark the CPU as no longer being in the PPL.  We spin if our state
+        * machine is broken.
+        */
+       ldr             w9, [x12, PMAP_CPU_DATA_PPL_STATE]
+       cmp             w9, #PPL_STATE_DISPATCH
+       b.ne            .
+       mov             w9, #PPL_STATE_KERNEL
+       str             w9, [x12, PMAP_CPU_DATA_PPL_STATE]
+
+       /* Return to the kernel. */
+       b ppl_return_to_kernel_mode
+
+#if __APRR_SUPPORTED__
+       /* We align this to land the next few instructions on their own page. */
+       .align 14
+       .space (16*1024)-(4*5) // 5 insns
+
+ppl_return_to_kernel_mode:
+       /* Switch APRR_EL1 back to the kernel mode. */
+       // must be 5 instructions
+       MOV64   x14, APRR_EL1_DEFAULT
+       msr             APRR_EL1, x14
+
+       .globl EXT(ppl_trampoline_end)
+LEXT(ppl_trampoline_end)
+
+       /* This should be the first instruction on a page. */
+       isb
+
+       .globl EXT(ppl_no_exception_end)
+LEXT(ppl_no_exception_end)
+       b ppl_exit
+#endif /* __APRR_SUPPORTED__ */
+
+
+       .text
+ppl_exit:
+       /*
+        * If we are dealing with an exception, hand off to the first level
+        * exception handler.
+        */
+       cmp             x15, #PPL_EXIT_EXCEPTION
+       b.eq    Ljump_to_fleh_handler
+
+       /* Restore the original AIF state. */
+       REENABLE_DAIF   x10
+
+       /* If this was a panic call from the PPL, reinvoke panic. */
+       cmp             x15, #PPL_EXIT_PANIC_CALL
+       b.eq    Ljump_to_panic_trap_to_debugger
+
+       /* Load the preemption count. */
+       mrs             x10, TPIDR_EL1
+       ldr             w12, [x10, ACT_PREEMPT_CNT]
+
+       /* Detect underflow */
+       cbnz    w12, Lno_preempt_underflow
+       b               preempt_underflow
+Lno_preempt_underflow:
+
+       /* Lower the preemption count. */
+       sub             w12, w12, #1
+       str             w12, [x10, ACT_PREEMPT_CNT]
+
+       /* Skip ASTs if the preemption count is not zero. */
+       cbnz    x12, Lppl_skip_ast_taken
+
+       /* Skip the AST check if interrupts are disabled. */
+       mrs             x1, DAIF
+       tst     x1, #DAIF_IRQF
+       b.ne    Lppl_skip_ast_taken
+
+       /* Disable interrupts. */
+       msr             DAIFSet, #(DAIFSC_IRQF | DAIFSC_FIQF)
+
+       /* If there is no urgent AST, skip the AST. */
+       ldr             x12, [x10, ACT_CPUDATAP]
+       ldr             x14, [x12, CPU_PENDING_AST]
+       tst             x14, AST_URGENT
+       b.eq    Lppl_defer_ast_taken
+
+       /* Stash our return value and return reason. */
+       mov             x20, x0
+       mov             x21, x15
+
+       /* Handle the AST. */
+       bl              EXT(ast_taken_kernel)
+
+       /* Restore the return value and the return reason. */
+       mov             x15, x21
+       mov             x0, x20
+
+Lppl_defer_ast_taken:
+       /* Reenable interrupts. */
+       msr             DAIFClr, #(DAIFSC_IRQF | DAIFSC_FIQF)
+
+Lppl_skip_ast_taken:
+       /* Pop the stack frame. */
+       ldp             x29, x30, [sp, #0x10]
+       ldp             x20, x21, [sp], #0x20
+
+       /* Check to see if this was a bad request. */
+       cmp             x15, #PPL_EXIT_BAD_CALL
+       b.eq    Lppl_bad_call
+
+       /* Return. */
+       ARM64_STACK_EPILOG
+
+       .align 2
+Ljump_to_fleh_handler:
+       br      x25
+
+       .align 2
+Ljump_to_panic_trap_to_debugger:
+       b               EXT(panic_trap_to_debugger)
+
+Lppl_bad_call:
+       /* Panic. */
+       adrp    x0, Lppl_bad_call_panic_str@page
+       add             x0, x0, Lppl_bad_call_panic_str@pageoff
+       b               EXT(panic)
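
ppl_exit's bookkeeping between REENABLE_DAIF and the final return amounts to a preemption-count drop plus an optional urgent-AST check. A hedged C sketch of that logic; the struct and its field names are stand-ins for the ACT_PREEMPT_CNT / CPU_PENDING_AST accesses in the assembly:

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct thread_model {
        unsigned preempt_count;
        bool     interrupts_masked;
        bool     ast_urgent_pending;
    };

    static void ast_taken_kernel(void) { printf("handled urgent AST\n"); }

    static void ppl_exit_epilogue(struct thread_model *t)
    {
        if (t->preempt_count == 0) {
            abort();                 /* preempt_underflow: the kernel panics */
        }
        t->preempt_count--;

        /* ASTs are only considered when this drop makes the count zero and
         * interrupts were enabled at the point of the PPL call. */
        if (t->preempt_count == 0 && !t->interrupts_masked && t->ast_urgent_pending) {
            ast_taken_kernel();
            t->ast_urgent_pending = false;
        }
    }

    int main(void)
    {
        struct thread_model t = { .preempt_count = 1, .ast_urgent_pending = true };
        ppl_exit_epilogue(&t);
        return 0;
    }
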
+
+       .text
+       .align 2
+       .globl EXT(ppl_dispatch)
+LEXT(ppl_dispatch)
+       /*
+        * Save a couple of important registers (implementation detail; x12 has
+        * the PPL per-CPU data address; x13 is not actually interesting).
+        */
+       stp             x12, x13, [sp, #-0x10]!
+
+       /* Restore the original AIF state. */
+       REENABLE_DAIF   x20
+
+       /*
+        * Note that if the method is NULL, we'll blow up with a prefetch abort,
+        * but the exception vectors will deal with this properly.
+        */
+
+       /* Invoke the PPL method. */
+#ifdef HAS_APPLE_PAC
+       blraaz          x10
+#else
+       blr             x10
+#endif
+
+       /* Disable AIF. */
+       msr             DAIFSet, #(DAIFSC_ASYNCF | DAIFSC_IRQF | DAIFSC_FIQF)
+
+       /* Restore those important registers. */
+       ldp             x12, x13, [sp], #0x10
+
+       /* Mark this as a regular return, and hand off to the return path. */
+       b               Lppl_dispatch_exit
+
+       .text
+       .align 2
+       .globl EXT(ppl_bootstrap_dispatch)
+LEXT(ppl_bootstrap_dispatch)
+       /* Verify the PPL request. */
+       cmp             x15, PMAP_COUNT
+       b.hs    Lppl_fail_bootstrap_dispatch
+
+       /* Get the requested PPL routine. */
+       adrp    x9, EXT(ppl_handler_table)@page
+       add             x9, x9, EXT(ppl_handler_table)@pageoff
+       ldr             x10, [x9, x15, lsl #3]
+
+       /* Invoke the requested PPL routine. */
+#ifdef HAS_APPLE_PAC
+       blraaz          x10
+#else
+       blr             x10
+#endif
+       /* Stash off the return value */
+       mov             x20, x0
+       /* Drop the preemption count */
+       bl              EXT(_enable_preemption)
+       mov             x0, x20
+
+       /* Pop the stack frame. */
+       ldp             x29, x30, [sp, #0x10]
+       ldp             x20, x21, [sp], #0x20
+#if __has_feature(ptrauth_returns)
+       retab
+#else
+       ret
+#endif
+
+Lppl_fail_bootstrap_dispatch:
+       /* Pop our stack frame and panic. */
+       ldp             x29, x30, [sp, #0x10]
+       ldp             x20, x21, [sp], #0x20
+#if __has_feature(ptrauth_returns)
+       autibsp
+#endif
+       adrp    x0, Lppl_bad_call_panic_str@page
+       add             x0, x0, Lppl_bad_call_panic_str@pageoff
+       b               EXT(panic)
+
+       .text
+       .align 2
+       .globl EXT(ml_panic_trap_to_debugger)
+LEXT(ml_panic_trap_to_debugger)
+#if 0
+       // TODO: why would we ever want to turn interrupts back on after going down panic path?
+       /* Grab the current AIF state, and disable AIF. */
+       mrs             x10, DAIF
+#endif
+       msr             DAIFSet, #(DAIFSC_ASYNCF | DAIFSC_IRQF | DAIFSC_FIQF)
+
+       // we want interrupts to stay masked after exiting PPL when calling into panic to halt system
+       // x10 is used in ppl_return_to_kernel_mode restore desired DAIF state after GEXIT
+       mrs             x10, DAIF
+
+       /* Indicate (for the PPL->kernel transition) that we are panicking. */
+       mov             x15, #PPL_EXIT_PANIC_CALL
+
+       /* Get the PPL per-CPU data. */
+       GET_PMAP_CPU_DATA       x11, x12, x13
+
+       /* Restore the old stack pointer as we can't push onto PPL stack after we exit PPL */
+       ldr             x12, [x11, PMAP_CPU_DATA_KERN_SAVED_SP]
+       mov             sp, x12
+
+       /*
+        * Mark this CPU as panicking within the PPL.  Halt and catch fire if our state
+        * machine appears to be broken.
+        */
+       ldr             w12, [x11, PMAP_CPU_DATA_PPL_STATE]
+       cmp             w12, #PPL_STATE_DISPATCH
+       b.ne            .
+       mov             w13, #PPL_STATE_PANIC
+       str             w13, [x11, PMAP_CPU_DATA_PPL_STATE]
+
+       /* Now we are ready to exit the PPL. */
+       b               ppl_return_to_kernel_mode
+
+       .data
+Lppl_bad_call_panic_str:
+       .asciz "ppl_dispatch: failed due to bad arguments/state"
+#else /* XNU_MONITOR */
        .text
        .align 2
        .globl EXT(ml_panic_trap_to_debugger)
 LEXT(ml_panic_trap_to_debugger)
        ret
+#endif /* XNU_MONITOR */
 
 /* ARM64_TODO Is globals_asm.h needed? */
 //#include     "globals_asm.h"
index 13aca14c10786b4251c8dde006b52e8b99a91e78..037f34c135fad0fccb42360b36a737a20397e199 100644 (file)
@@ -56,7 +56,7 @@
 
 #include <IOKit/IOPlatformExpert.h>
 
-#if defined(KERNEL_INTEGRITY_KTRR)
+#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
 #include <libkern/kernel_mach_header.h>
 #endif
 
@@ -66,6 +66,9 @@
 #include <kern/kpc.h>
 #endif
 
+#if HAS_CLUSTER
+static uint8_t cluster_initialized = 0;
+#endif
 
 
 static int max_cpus_initialized = 0;
@@ -90,6 +93,11 @@ extern vm_offset_t   segLOWESTTEXT;
 extern vm_offset_t   segLASTB;
 extern unsigned long segSizeLAST;
 
+#if defined(HAS_IPI)
+unsigned int gFastIPI = 1;
+#define kDeferredIPITimerDefault (64 * NSEC_PER_USEC) /* in nanoseconds */
+static uint64_t deferred_ipi_timer_ns = kDeferredIPITimerDefault;
+#endif /* defined(HAS_IPI) */
 
 void machine_conf(void);
 
@@ -113,40 +121,112 @@ void ml_lockdown_init(void);
 void ml_lockdown_run_handler(void);
 uint32_t get_arm_cpu_version(void);
 
+#if defined(HAS_IPI)
+static inline void
+ml_cpu_signal_type(unsigned int cpu_mpidr, uint32_t type)
+{
+#if HAS_CLUSTER
+       uint64_t local_mpidr;
+       /* NOTE: this logic expects that we are called in a non-preemptible
+        * context, or at least one in which the calling thread is bound
+        * to a single CPU.  Otherwise we may migrate between choosing which
+        * IPI mechanism to use and issuing the IPI. */
+       MRS(local_mpidr, "MPIDR_EL1");
+       if ((local_mpidr & MPIDR_AFF1_MASK) == (cpu_mpidr & MPIDR_AFF1_MASK)) {
+               uint64_t x = type | (cpu_mpidr & MPIDR_AFF0_MASK);
+               MSR(ARM64_REG_IPI_RR_LOCAL, x);
+       } else {
+               #define IPI_RR_TARGET_CLUSTER_SHIFT 16
+               uint64_t x = type | ((cpu_mpidr & MPIDR_AFF1_MASK) << (IPI_RR_TARGET_CLUSTER_SHIFT - MPIDR_AFF1_SHIFT)) | (cpu_mpidr & MPIDR_AFF0_MASK);
+               MSR(ARM64_REG_IPI_RR_GLOBAL, x);
+       }
+#else
+       uint64_t x = type | (cpu_mpidr & MPIDR_AFF0_MASK);
+       MSR(ARM64_REG_IPI_RR, x);
+#endif
+}
+#endif
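
ml_cpu_signal_type picks the local or global IPI request register by comparing the Aff1 (cluster) field of the caller's MPIDR with the target's. A userspace model of that choice with the register writes replaced by prints; only IPI_RR_TARGET_CLUSTER_SHIFT comes from the diff, the rest is illustrative:

    #include <stdint.h>
    #include <stdio.h>

    #define MPIDR_AFF0_MASK  0xffULL
    #define MPIDR_AFF1_SHIFT 8
    #define MPIDR_AFF1_MASK  (0xffULL << MPIDR_AFF1_SHIFT)
    #define IPI_RR_TARGET_CLUSTER_SHIFT 16   /* from the diff */

    /* Same-cluster targets use the local IPI register; cross-cluster targets
     * use the global one with the cluster number shifted into the request. */
    static void cpu_signal_model(uint64_t local_mpidr, uint64_t cpu_mpidr, uint32_t type)
    {
        if ((local_mpidr & MPIDR_AFF1_MASK) == (cpu_mpidr & MPIDR_AFF1_MASK)) {
            uint64_t x = type | (cpu_mpidr & MPIDR_AFF0_MASK);
            printf("IPI_RR_LOCAL  <- 0x%llx\n", (unsigned long long)x);
        } else {
            uint64_t x = type
                | ((cpu_mpidr & MPIDR_AFF1_MASK) << (IPI_RR_TARGET_CLUSTER_SHIFT - MPIDR_AFF1_SHIFT))
                | (cpu_mpidr & MPIDR_AFF0_MASK);
            printf("IPI_RR_GLOBAL <- 0x%llx\n", (unsigned long long)x);
        }
    }

    int main(void)
    {
        cpu_signal_model(0x000, 0x001, 0);   /* same cluster  -> local  */
        cpu_signal_model(0x000, 0x101, 0);   /* other cluster -> global */
        return 0;
    }
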
 
+#if !defined(HAS_IPI)
 __dead2
+#endif
 void
 ml_cpu_signal(unsigned int cpu_mpidr __unused)
 {
+#if defined(HAS_IPI)
+       ml_cpu_signal_type(cpu_mpidr, ARM64_REG_IPI_RR_TYPE_IMMEDIATE);
+#else
        panic("Platform does not support ACC Fast IPI");
+#endif
 }
 
+#if !defined(HAS_IPI)
 __dead2
+#endif
 void
 ml_cpu_signal_deferred_adjust_timer(uint64_t nanosecs)
 {
+#if defined(HAS_IPI)
+       /* adjust IPI_CR timer countdown value for deferred IPI
+        * accepts input in nanosecs, convert to absolutetime (REFCLK ticks),
+        * clamp maximum REFCLK ticks to 0xFFFF (16 bit field)
+        *
+        * global register, should only require a single write to update all
+        * CPU cores: from Skye ACC user spec section 5.7.3.3
+        *
+        * IPICR is a global register but there are two copies in ACC: one at pBLK and one at eBLK.
+        * IPICR write SPR token also traverses both pCPM and eCPM rings and updates both copies.
+        */
+       uint64_t abstime;
+
+       nanoseconds_to_absolutetime(nanosecs, &abstime);
+
+       abstime = MIN(abstime, 0xFFFF);
+
+       /* update deferred_ipi_timer_ns with the new clamped value */
+       absolutetime_to_nanoseconds(abstime, &deferred_ipi_timer_ns);
+
+       MSR(ARM64_REG_IPI_CR, abstime);
+#else
        (void)nanosecs;
        panic("Platform does not support ACC Fast IPI");
+#endif
 }
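
The deferred-IPI timer update converts nanoseconds to timebase ticks and clamps to the 16-bit IPI_CR field, then converts back so the stored value reflects the clamp. A standalone sketch assuming a 24 MHz reference timebase (the kernel uses nanoseconds_to_absolutetime/absolutetime_to_nanoseconds rather than this arithmetic):

    #include <stdint.h>
    #include <stdio.h>

    #define TIMEBASE_HZ  24000000ULL   /* assumed for illustration */
    #define NSEC_PER_SEC 1000000000ULL

    int main(void)
    {
        uint64_t nanosecs = 64000;     /* the 64 usec default from the diff */

        uint64_t abstime = (nanosecs * TIMEBASE_HZ) / NSEC_PER_SEC;
        if (abstime > 0xFFFF) {        /* IPI_CR countdown is a 16-bit field */
            abstime = 0xFFFF;
        }
        uint64_t clamped_ns = (abstime * NSEC_PER_SEC) / TIMEBASE_HZ;

        printf("IPI_CR <- %llu ticks (%llu ns after clamping)\n",
            (unsigned long long)abstime, (unsigned long long)clamped_ns);
        return 0;
    }
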
 
 uint64_t
 ml_cpu_signal_deferred_get_timer()
 {
+#if defined(HAS_IPI)
+       return deferred_ipi_timer_ns;
+#else
        return 0;
+#endif
 }
 
+#if !defined(HAS_IPI)
 __dead2
+#endif
 void
 ml_cpu_signal_deferred(unsigned int cpu_mpidr __unused)
 {
+#if defined(HAS_IPI)
+       ml_cpu_signal_type(cpu_mpidr, ARM64_REG_IPI_RR_TYPE_DEFERRED);
+#else
        panic("Platform does not support ACC Fast IPI deferral");
+#endif
 }
 
+#if !defined(HAS_IPI)
 __dead2
+#endif
 void
 ml_cpu_signal_retract(unsigned int cpu_mpidr __unused)
 {
+#if defined(HAS_IPI)
+       ml_cpu_signal_type(cpu_mpidr, ARM64_REG_IPI_RR_TYPE_RETRACT);
+#else
        panic("Platform does not support ACC Fast IPI retraction");
+#endif
 }
 
 void
@@ -241,7 +321,11 @@ get_arm_cpu_version(void)
 boolean_t
 user_cont_hwclock_allowed(void)
 {
+#if HAS_CONTINUOUS_HWCLOCK
+       return TRUE;
+#else
        return FALSE;
+#endif
 }
 
 
@@ -257,7 +341,7 @@ arm64_wfe_allowed(void)
        return TRUE;
 }
 
-#if defined(KERNEL_INTEGRITY_KTRR)
+#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
 
 uint64_t rorgn_begin __attribute__((section("__DATA, __const"))) = 0;
 uint64_t rorgn_end   __attribute__((section("__DATA, __const"))) = 0;
@@ -307,6 +391,11 @@ rorgn_stash_range(void)
        rc = DTGetProperty(entryP, "reg", (void **)&reg_prop, &prop_size);
        assert(rc == kSuccess);
        amcc_base = ml_io_map(soc_base + *reg_prop, *(reg_prop + 1));
+#elif defined(KERNEL_INTEGRITY_CTRR)
+       /* TODO: t8020 mcc entry not in device tree yet; we'll do it LIVE */
+#define TEMP_AMCC_BASE_PA 0x200000000ULL
+#define TEMP_AMCC_SZ      0x100000
+       amcc_base = ml_io_map(TEMP_AMCC_BASE_PA, TEMP_AMCC_SZ);
 #else
 #error "KERNEL_INTEGRITY config error"
 #endif
@@ -315,6 +404,27 @@ rorgn_stash_range(void)
        assert(rRORGNENDADDR > rRORGNBASEADDR);
        rorgn_begin = (rRORGNBASEADDR << AMCC_PGSHIFT) + dram_base;
        rorgn_end   = (rRORGNENDADDR << AMCC_PGSHIFT) + dram_base;
+#elif defined(KERNEL_INTEGRITY_CTRR)
+       rorgn_begin = rCTRR_AMCC_PLANE_REG(0, CTRR_A_BASEADDR);
+       rorgn_end   = rCTRR_AMCC_PLANE_REG(0, CTRR_A_ENDADDR);
+       assert(rorgn_end > rorgn_begin);
+
+       for (int i = 0; i < CTRR_AMCC_MAX_PLANES; ++i) {
+               uint32_t begin = rCTRR_AMCC_PLANE_REG(i, CTRR_A_BASEADDR);
+               uint32_t end = rCTRR_AMCC_PLANE_REG(i, CTRR_A_ENDADDR);
+               if (!(begin == rorgn_begin && end == rorgn_end)) {
+#if DEVELOPMENT || DEBUG
+                       panic("iboot programmed CTRR bounds are inconsistent");
+#else
+                       panic("Inconsistent memory configuration");
+#endif
+               }
+       }
+
+       // convert the page numbers (relative to the DRAM base) into PAs
+       rorgn_begin = (rorgn_begin << AMCC_PGSHIFT) + dram_base;
+       rorgn_end   = (rorgn_end << AMCC_PGSHIFT) + dram_base;
+
 #else
 #error KERNEL_INTEGRITY config error
 #endif /* defined (KERNEL_INTEGRITY_KTRR) */
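
The CTRR branch above reads per-plane base/end page numbers, insists that every plane agrees, then shifts by AMCC_PGSHIFT and adds the DRAM base to get physical addresses. A small model of that check and conversion with made-up constants:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define AMCC_PGSHIFT 14                     /* illustrative: 16K AMCC pages */
    #define CTRR_AMCC_MAX_PLANES 4

    int main(void)
    {
        uint64_t dram_base = 0x800000000ULL;    /* illustrative */

        /* Pretend every plane reports the same iBoot-programmed bounds,
         * expressed in AMCC pages from the DRAM base. */
        uint32_t base_pages[CTRR_AMCC_MAX_PLANES] = { 0x100, 0x100, 0x100, 0x100 };
        uint32_t end_pages[CTRR_AMCC_MAX_PLANES]  = { 0x900, 0x900, 0x900, 0x900 };

        for (int i = 1; i < CTRR_AMCC_MAX_PLANES; ++i) {
            /* All planes must agree, or the kernel panics. */
            assert(base_pages[i] == base_pages[0] && end_pages[i] == end_pages[0]);
        }

        uint64_t rorgn_begin = ((uint64_t)base_pages[0] << AMCC_PGSHIFT) + dram_base;
        uint64_t rorgn_end   = ((uint64_t)end_pages[0]  << AMCC_PGSHIFT) + dram_base;

        printf("RO region: 0x%llx..0x%llx\n",
            (unsigned long long)rorgn_begin, (unsigned long long)rorgn_end);
        return 0;
    }
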
@@ -330,6 +440,11 @@ assert_unlocked()
 #if defined(KERNEL_INTEGRITY_KTRR)
        rorgn_lock = rRORGNLOCK;
        ktrr_lock = __builtin_arm_rsr64(ARM64_REG_KTRR_LOCK_EL1);
+#elif defined(KERNEL_INTEGRITY_CTRR)
+       for (int i = 0; i < CTRR_AMCC_MAX_PLANES; ++i) {
+               rorgn_lock |= rCTRR_AMCC_PLANE_REG(i, CTRR_A_LOCK);
+       }
+       ktrr_lock = __builtin_arm_rsr64(ARM64_REG_CTRR_LOCK_EL1);
 #else
 #error KERNEL_INTEGRITY config error
 #endif /* defined(KERNEL_INTEGRITY_KTRR) */
@@ -344,6 +459,13 @@ lock_amcc()
 #if defined(KERNEL_INTEGRITY_KTRR)
        rRORGNLOCK = 1;
        __builtin_arm_isb(ISB_SY);
+#elif defined(KERNEL_INTEGRITY_CTRR)
+       /* lockdown planes in reverse order as plane 0 should be locked last */
+       for (int i = 0; i < CTRR_AMCC_MAX_PLANES; ++i) {
+               rCTRR_AMCC_PLANE_REG(CTRR_AMCC_MAX_PLANES - i - 1, CTRR_A_ENABLE) = 1;
+               rCTRR_AMCC_PLANE_REG(CTRR_AMCC_MAX_PLANES - i - 1, CTRR_A_LOCK) = 1;
+               __builtin_arm_isb(ISB_SY);
+       }
 #else
 #error KERNEL_INTEGRITY config error
 #endif
@@ -363,6 +485,37 @@ lock_mmu(uint64_t begin, uint64_t end)
        __builtin_arm_isb(ISB_SY);
        flush_mmu_tlb();
 
+#elif defined (KERNEL_INTEGRITY_CTRR)
+       /* This will lock the entire bootstrap cluster. Non-bootstrap clusters
+        * will be locked by their respective cluster masters in start.s. */
+
+       __builtin_arm_wsr64(ARM64_REG_CTRR_A_LWR_EL1, begin);
+       __builtin_arm_wsr64(ARM64_REG_CTRR_A_UPR_EL1, end);
+
+#if !defined(APPLEVORTEX)
+       /* H12 changed sequence, must invalidate TLB immediately after setting CTRR bounds */
+       __builtin_arm_isb(ISB_SY); /* ensure all prior MSRs are complete */
+       flush_mmu_tlb();
+#endif /* !defined(APPLEVORTEX) */
+
+       __builtin_arm_wsr64(ARM64_REG_CTRR_CTL_EL1, CTRR_CTL_EL1_A_PXN | CTRR_CTL_EL1_A_MMUON_WRPROTECT);
+       __builtin_arm_wsr64(ARM64_REG_CTRR_LOCK_EL1, 1ULL);
+
+       uint64_t current_el = __builtin_arm_rsr64("CurrentEL");
+       if (current_el == PSR64_MODE_EL2) {
+               // CTRR v2 has explicit registers for cluster config. they can only be written in EL2
+
+               __builtin_arm_wsr64(ACC_CTRR_A_LWR_EL2, begin);
+               __builtin_arm_wsr64(ACC_CTRR_A_UPR_EL2, end);
+               __builtin_arm_wsr64(ACC_CTRR_CTL_EL2, CTRR_CTL_EL1_A_PXN | CTRR_CTL_EL1_A_MMUON_WRPROTECT);
+               __builtin_arm_wsr64(ACC_CTRR_LOCK_EL2, 1ULL);
+       }
+
+       __builtin_arm_isb(ISB_SY); /* ensure all prior MSRs are complete */
+#if defined(APPLEVORTEX)
+       flush_mmu_tlb();
+#endif /* defined(APPLEVORTEX) */
+
 #else /* defined(KERNEL_INTEGRITY_KTRR) */
 #error KERNEL_INTEGRITY config error
 #endif /* defined(KERNEL_INTEGRITY_KTRR) */
@@ -373,6 +526,17 @@ assert_amcc_cache_disabled()
 {
 #if defined(KERNEL_INTEGRITY_KTRR)
        assert((rMCCGEN & 1) == 0); /* assert M$ disabled or LLC clean will be unreliable */
+#elif defined(KERNEL_INTEGRITY_CTRR) && (defined(ARM64_BOARD_CONFIG_T8006))
+       /*
+        * T8006 differentiates between data and tag ways being powered up, so
+        * make sure to check that both are zero on its single memory plane.
+        */
+       assert((rCTRR_AMCC_PLANE_REG(0, CTRR_AMCC_PWRONWAYCNTSTATUS) &
+           (AMCC_CURTAGWAYCNT_MASK | AMCC_CURDATWAYCNT_MASK)) == 0);
+#elif defined (KERNEL_INTEGRITY_CTRR)
+       for (int i = 0; i < CTRR_AMCC_MAX_PLANES; ++i) {
+               assert(rCTRR_AMCC_PLANE_REG(i, CTRR_AMCC_WAYONCNT) == 0);
+       }
 #else
 #error KERNEL_INTEGRITY config error
 #endif
@@ -423,6 +587,11 @@ rorgn_lockdown(void)
        assert(rorgn_begin == ktrr_begin && rorgn_end == (ktrr_end + last_segsz));
        /* assert that __LAST segment containing privileged insns is only a single page */
        assert(last_segsz == PAGE_SIZE);
+#elif defined(KERNEL_INTEGRITY_CTRR)
+       ktrr_end = (ktrr_end + last_segsz - 1) & ~AMCC_PGMASK;
+       /* __LAST is part of MMU CTRR region. Can't use the KTRR style method of making
+        * __pinst no execute because PXN applies with MMU off in CTRR. */
+       assert(rorgn_begin == ktrr_begin && rorgn_end == ktrr_end);
 #endif
 
 
@@ -445,17 +614,38 @@ rorgn_lockdown(void)
 out:
 #endif
 
+#if defined(KERNEL_INTEGRITY_CTRR)
+       {
+               /* wake any threads blocked on cluster master lockdown */
+               cpu_data_t *cdp;
+               uint64_t mpidr_el1_value;
+
+               cdp = getCpuDatap();
+               MRS(mpidr_el1_value, "MPIDR_EL1");
+               cdp->cpu_cluster_id = (mpidr_el1_value & MPIDR_AFF1_MASK) >> MPIDR_AFF1_SHIFT;
+               assert(cdp->cpu_cluster_id < __ARM_CLUSTER_COUNT__);
+               ctrr_cluster_locked[cdp->cpu_cluster_id] = 1;
+               thread_wakeup(&ctrr_cluster_locked[cdp->cpu_cluster_id]);
+       }
+#endif
        /* now we can run lockdown handler */
        ml_lockdown_run_handler();
 }
 
-#endif /* defined(KERNEL_INTEGRITY_KTRR)*/
+#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
 
 void
 machine_startup(__unused boot_args * args)
 {
        int boot_arg;
 
+#if defined(HAS_IPI) && (DEVELOPMENT || DEBUG)
+       if (!PE_parse_boot_argn("fastipi", &gFastIPI, sizeof(gFastIPI))) {
+               gFastIPI = 1;
+       }
+
+       PE_parse_boot_argn("fastipitimeout", &deferred_ipi_timer_ns, sizeof(deferred_ipi_timer_ns));
+#endif /* defined(HAS_IPI) && (DEVELOPMENT || DEBUG)*/
 
 #if CONFIG_NONFATAL_ASSERTS
        PE_parse_boot_argn("assert", &mach_assert, sizeof(mach_assert));
@@ -484,7 +674,7 @@ machine_lockdown_preflight(void)
 {
 #if CONFIG_KERNEL_INTEGRITY
 
-#if defined(KERNEL_INTEGRITY_KTRR)
+#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
        rorgn_stash_range();
 #endif
 
@@ -508,8 +698,11 @@ machine_lockdown(void)
 #endif
 #endif /* KERNEL_INTEGRITY_WT */
 
+#if XNU_MONITOR
+       pmap_lockdown_ppl();
+#endif
 
-#if defined(KERNEL_INTEGRITY_KTRR)
+#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
        /* KTRR
         *
         * Lock physical KTRR region. KTRR region is read-only. Memory outside
@@ -517,7 +710,7 @@ machine_lockdown(void)
         */
 
        rorgn_lockdown();
-#endif /* defined(KERNEL_INTEGRITY_KTRR)*/
+#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
 
 
 #endif /* CONFIG_KERNEL_INTEGRITY */
@@ -819,6 +1012,16 @@ ml_install_interrupt_handler(
 void
 ml_init_interrupt(void)
 {
+#if defined(HAS_IPI)
+       /*
+        * ml_init_interrupt will get called once for each CPU, but this is redundant
+        * because there is only one global copy of the register for Skye; do it only
+        * on the bootstrap CPU.
+        */
+       if (getCpuDatap()->cluster_master) {
+               ml_cpu_signal_deferred_adjust_timer(deferred_ipi_timer_ns);
+       }
+#endif
 }
 
 /*
@@ -960,6 +1163,9 @@ ml_lockdown_init()
 
        lck_mtx_init(&lockdown_handler_lck, lockdown_handler_grp, NULL);
 
+#if defined(KERNEL_INTEGRITY_CTRR)
+       init_ctrr_cpu_start_lock();
+#endif
 }
 
 kern_return_t
@@ -973,7 +1179,7 @@ ml_lockdown_handler_register(lockdown_handler_t f, void *this)
        lockdown_handler = f;
        lockdown_this = this;
 
-#if !(defined(KERNEL_INTEGRITY_KTRR))
+#if !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR))
        lockdown_done = 1;
        lockdown_handler(this);
 #else
@@ -1063,7 +1269,11 @@ ml_processor_register(ml_processor_info_t *in_processor_info,
        this_cpu_datap->cpu_l3_id = in_processor_info->l3_cache_id;
        this_cpu_datap->cpu_l3_size = in_processor_info->l3_cache_size;
 
+#if HAS_CLUSTER
+       this_cpu_datap->cluster_master = !OSTestAndSet(this_cpu_datap->cpu_cluster_id, &cluster_initialized);
+#else /* HAS_CLUSTER */
        this_cpu_datap->cluster_master = is_boot_cpu;
+#endif /* HAS_CLUSTER */
 
        pset = pset_find(in_processor_info->cluster_id, processor_pset(master_processor));
        assert(pset != NULL);
@@ -1288,6 +1498,10 @@ ml_static_protect(
                        pt_entry_t      *pte_p;
                        pt_entry_t      ptmp;
 
+#if XNU_MONITOR
+                       assert(!TEST_PAGE_RATIO_4);
+                       assert(!pmap_is_monitor(ppn));
+#endif
 
                        tte2 = arm_kva_to_tte(vaddr_cur);
 
@@ -1668,6 +1882,13 @@ boolean_t
 ml_wants_panic_trap_to_debugger(void)
 {
        boolean_t result = FALSE;
+#if XNU_MONITOR
+       /*
+        * This looks racy, but if we are in the PPL, preemption will be
+        * disabled.
+        */
+       result = ((pmap_get_cpu_data()->ppl_state == PPL_STATE_DISPATCH) && pmap_ppl_locked_down);
+#endif
        return result;
 }
 
index 5dc6cde731106afb85392ade5d0c675874c826f0..9d41431fe6b7da750a185a7b44ac88fbe027cf9a 100644 (file)
@@ -58,7 +58,62 @@ LEXT(ml_set_kernelkey_enabled)
 
 #endif /* defined(HAS_APPLE_PAC) */
 
+#if HAS_BP_RET
 
+/*
+ * void set_bp_ret(void)
+ * Helper function to enable branch predictor state retention
+ * across ACC sleep
+ */
+
+       .align 2
+       .globl EXT(set_bp_ret)
+LEXT(set_bp_ret)
+       // Load bpret boot-arg
+       adrp            x14, EXT(bp_ret)@page
+       add             x14, x14, EXT(bp_ret)@pageoff
+       ldr             w14, [x14]
+
+       mrs             x13, ARM64_REG_ACC_CFG
+       and             x13, x13, (~(ARM64_REG_ACC_CFG_bpSlp_mask << ARM64_REG_ACC_CFG_bpSlp_shift))
+       and             x14, x14, #(ARM64_REG_ACC_CFG_bpSlp_mask)
+       orr             x13, x13, x14, lsl #(ARM64_REG_ACC_CFG_bpSlp_shift)
+       msr             ARM64_REG_ACC_CFG, x13
+
+       ret
+#endif // HAS_BP_RET
+
+#if HAS_NEX_PG
+       .align 2
+       .globl EXT(set_nex_pg)
+LEXT(set_nex_pg)
+       mrs             x14, MPIDR_EL1
+       // Skip if this isn't a p-core; NEX powergating isn't available for e-cores
+       and             x14, x14, #(MPIDR_PNE)
+       cbz             x14, Lnex_pg_done
+
+       // Set the SEG-recommended value of 12 additional reset cycles
+       mrs             x14, ARM64_REG_HID13
+       and             x14, x14, (~ARM64_REG_HID13_RstCyc_mask)
+       orr             x14, x14, ARM64_REG_HID13_RstCyc_val
+       msr             ARM64_REG_HID13, x14
+
+       // Load nexpg boot-arg
+       adrp            x14, EXT(nex_pg)@page
+       add             x14, x14, EXT(nex_pg)@pageoff
+       ldr             w14, [x14]
+
+       mrs             x13, ARM64_REG_HID14
+       and             x13, x13, (~ARM64_REG_HID14_NexPwgEn)
+       cbz             w14, Lset_nex_pg
+       orr             x13, x13, ARM64_REG_HID14_NexPwgEn
+Lset_nex_pg:
+       msr             ARM64_REG_HID14, x13
+
+Lnex_pg_done:
+       ret
+
+#endif // HAS_NEX_PG
 
 /*     uint32_t get_fpscr(void):
  *             Returns (FPSR | FPCR).
@@ -168,12 +223,21 @@ LEXT(set_mmu_ttb_alternate)
        bl              EXT(pinst_set_ttbr1)
        mov             lr, x1
 #else
+#if defined(HAS_VMSA_LOCK)
+       mrs             x1, ARM64_REG_VMSA_LOCK_EL1
+       and             x1, x1, #(VMSA_LOCK_TTBR1_EL1)
+       cbnz            x1, L_set_locked_reg_panic
+#endif /* defined(HAS_VMSA_LOCK) */
        msr             TTBR1_EL1, x0
 #endif /* defined(KERNEL_INTEGRITY_KTRR) */
        isb             sy
        ret
 
+#if XNU_MONITOR
+       .section __PPLTEXT,__text,regular,pure_instructions
+#else
        .text
+#endif
        .align 2
        .globl EXT(set_mmu_ttb)
 LEXT(set_mmu_ttb)
@@ -211,6 +275,19 @@ LEXT(set_vbar_el1)
 #endif
 #endif /* __ARM_KERNEL_PROTECT__ */
 
+#if defined(HAS_VMSA_LOCK)
+       .text
+       .align 2
+       .globl EXT(vmsa_lock)
+LEXT(vmsa_lock)
+       isb sy
+       mov x1, #(VMSA_LOCK_SCTLR_M_BIT)
+       mov x0, #(VMSA_LOCK_TTBR1_EL1 | VMSA_LOCK_TCR_EL1 | VMSA_LOCK_VBAR_EL1)
+       orr x0, x0, x1
+       msr ARM64_REG_VMSA_LOCK_EL1, x0
+       isb sy
+       ret
+#endif /* defined(HAS_VMSA_LOCK) */
 
 /*
  *     set translation control register
@@ -229,6 +306,12 @@ LEXT(set_tcr)
        bl              EXT(pinst_set_tcr)
        mov             lr, x1
 #else
+#if defined(HAS_VMSA_LOCK)
+       // assert TCR unlocked
+       mrs             x1, ARM64_REG_VMSA_LOCK_EL1
+       and             x1, x1, #(VMSA_LOCK_TCR_EL1)
+       cbnz            x1, L_set_locked_reg_panic
+#endif /* defined(HAS_VMSA_LOCK) */
        msr             TCR_EL1, x0
 #endif /* defined(KERNEL_INTEGRITY_KTRR) */
        isb             sy
@@ -256,7 +339,7 @@ L_set_tcr_panic_str:
 L_set_locked_reg_panic_str:
        .asciz  "attempt to set locked register: (%llx)\n"
 #else
-#if defined(KERNEL_INTEGRITY_KTRR)
+#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
        mov             x1, lr
        bl              EXT(pinst_set_tcr)
        mov             lr, x1
@@ -690,6 +773,9 @@ LEXT(arm64_prepare_for_sleep)
        orr             x1, x1, #(  ARM64_REG_ACC_OVRD_ok2TrDnLnk_deepsleep)
        and             x1, x1, #(~(ARM64_REG_ACC_OVRD_ok2PwrDnCPM_mask))
        orr             x1, x1, #(  ARM64_REG_ACC_OVRD_ok2PwrDnCPM_deepsleep)
+#if HAS_RETENTION_STATE
+       orr             x1, x1, #(ARM64_REG_ACC_OVRD_disPioOnWfiCpu)
+#endif
        msr             ARM64_REG_ACC_OVRD, x1
 
 
@@ -701,9 +787,12 @@ LEXT(arm64_prepare_for_sleep)
        // Set "OK to power down" (<rdar://problem/12390433>)
        mrs             x0, ARM64_REG_CYC_OVRD
        orr             x0, x0, #(ARM64_REG_CYC_OVRD_ok2pwrdn_force_down)
+#if HAS_RETENTION_STATE
+       orr             x0, x0, #(ARM64_REG_CYC_OVRD_disWfiRetn)
+#endif
        msr             ARM64_REG_CYC_OVRD, x0
 
-#if defined(APPLEMONSOON)
+#if defined(APPLEMONSOON) || defined(APPLEVORTEX)
        ARM64_IS_PCORE x0
        cbz             x0, Lwfi_inst // skip if not p-core 
 
@@ -717,6 +806,12 @@ LEXT(arm64_prepare_for_sleep)
         * and re-enabling GUPS, which forces the prefetch queue to
         * drain.  This should be done as close to wfi as possible, i.e.
         * at the very end of arm64_prepare_for_sleep(). */
+#if defined(APPLEVORTEX)
+       /* <rdar://problem/32821461>: Cyprus A0/A1 parts have a similar
+        * bug in the HSP prefetcher that can be worked around through
+        * the same method mentioned above for Skye. */
+       SKIP_IF_CPU_VERSION_GREATER_OR_EQUAL x0, VORTEX_CPU_VERSION_B0, Lwfi_inst
+#endif
        mrs             x0, ARM64_REG_HID10
        orr             x0, x0, #(ARM64_REG_HID10_DisHwpGups)
        msr             ARM64_REG_HID10, x0
@@ -750,6 +845,21 @@ LEXT(arm64_force_wfi_clock_gate)
        ARM64_STACK_EPILOG
 
 
+#if HAS_RETENTION_STATE
+       .text
+       .align 2
+       .globl EXT(arm64_retention_wfi)
+LEXT(arm64_retention_wfi)
+       wfi
+       cbz             lr, Lwfi_retention      // If lr is 0, we entered retention state and lost all GPRs except sp and pc
+       ret                                     // Otherwise just return to cpu_idle()
+Lwfi_retention:
+       mov             x0, #1
+       bl              EXT(ClearIdlePop)
+       mov             x0, #0 
+       bl              EXT(cpu_idle_exit)      // cpu_idle_exit(from_reset = FALSE)
+       b               .                       // cpu_idle_exit() should never return
+#endif
 
 #if defined(APPLETYPHOON)
 
@@ -931,7 +1041,7 @@ LEXT(arm64_replace_bootstack)
        mrs             x4, DAIF                                        // Load current DAIF; use x4 as pinst may trash x1-x3
        msr             DAIFSet, #(DAIFSC_IRQF | DAIFSC_FIQF | DAIFSC_ASYNCF)           // Disable IRQ/FIQ/serror
        // Set SP_EL1 to exception stack
-#if defined(KERNEL_INTEGRITY_KTRR)
+#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
        mov             x1, lr
        bl              EXT(pinst_spsel_1)
        mov             lr, x1
index 992d501db7663c8432c5eda12427a95493598c41..cd62e333adbd8e3fe2175bc45d2bcd6d5393ce1c 100644 (file)
@@ -44,7 +44,11 @@ __BEGIN_DECLS
 
 #include <stdint.h>
 
+#if HAS_UNCORE_CTRS
+#define MT_NDEVS 2
+#else /* HAS_UNCORE_CTRS */
 #define MT_NDEVS 1
+#endif /* !HAS_UNCORE_CTRS */
 
 #define MT_CORE_CYCLES 0
 #define MT_CORE_INSTRS 1
@@ -67,6 +71,12 @@ __BEGIN_DECLS
 #define PMCR0_PMAI (UINT64_C(1) << 11)
 #define PMCR0_PMI(REG) ((REG) & PMCR0_PMAI)
 
+#if HAS_UNCORE_CTRS
+
+#define UPMSR "s3_7_c15_c6_4"
+#define UPMSR_PMI(REG) ((REG) & 0x1)
+
+#endif /* HAS_UNCORE_CTRS */
 
 static inline bool
 mt_pmi_pending(uint64_t * restrict pmcr0_out,
@@ -82,7 +92,18 @@ mt_pmi_pending(uint64_t * restrict pmcr0_out,
        }
        *pmcr0_out = pmcr0;
 
+#if HAS_UNCORE_CTRS
+       extern bool mt_uncore_enabled;
+       if (mt_uncore_enabled) {
+               uint64_t upmsr = __builtin_arm_rsr64(UPMSR);
+               if (UPMSR_PMI(upmsr)) {
+                       pmi = true;
+               }
+               *upmsr_out = upmsr;
+       }
+#else /* HAS_UNCORE_CTRS */
 #pragma unused(upmsr_out)
+#endif /* !HAS_UNCORE_CTRS */
 
        return pmi;
 }
index 25895247fc7b1bf92ef21fc53cefe1f9f22323a5..51361f693583131d2a7d10c93841d2fa943418cd 100644 (file)
@@ -281,12 +281,942 @@ core_idle(__unused cpu_data_t *cpu)
 
 #pragma mark uncore performance monitor
 
+#if HAS_UNCORE_CTRS
+
+static bool mt_uncore_initted = false;
+
+/*
+ * Uncore Performance Monitor
+ *
+ * Uncore performance monitors provide event-counting for the last-level caches
+ * (LLCs).  Each LLC has its own uncore performance monitor, which can only be
+ * accessed by cores that use that LLC.  Like the core performance monitoring
+ * unit, uncore counters are configured globally.  If there is more than one
+ * LLC on the system, PIO reads must be used to satisfy uncore requests (using
+ * the `_r` remote variants of the access functions).  Otherwise, local MSRs
+ * suffice (using the `_l` local variants of the access functions).
+ */
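As a rough illustration of the `_l`/`_r` split described above, a dispatch helper might look like the sketch below; the function name is invented for exposition, and it leans on uncmon_get_curid() and the locked accessors defined further down in this file.

static uint64_t
uncmon_read_counter_locked_any(unsigned int monid, unsigned int ctr)
{
	if (monid == uncmon_get_curid()) {
		/* This CPU shares the monitor's LLC: a local MSR read suffices. */
		return uncmon_read_counter_locked_l(monid, ctr);
	}
#if UNCORE_PER_CLUSTER
	/* Another cluster's LLC: satisfy the request through its PIO mapping. */
	return uncmon_read_counter_locked_r(monid, ctr);
#else
	/* A single-LLC system has only one monitor, so the remote path never runs. */
	return 0;
#endif
}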
+
+#if UNCORE_PER_CLUSTER
+static vm_size_t cpm_impl_size = 0;
+static uintptr_t cpm_impl[__ARM_CLUSTER_COUNT__] = {};
+static uintptr_t cpm_impl_phys[__ARM_CLUSTER_COUNT__] = {};
+#endif /* UNCORE_PER_CLUSTER */
+
+#if UNCORE_VERSION >= 2
+/*
+ * V2 uncore monitors feature a CTI mechanism -- the second bit of UPMSR is
+ * used to track if a CTI has been triggered due to an overflow.
+ */
+#define UPMSR_OVF_POS 2
+#else /* UNCORE_VERSION >= 2 */
+#define UPMSR_OVF_POS 1
+#endif /* UNCORE_VERSION < 2 */
+#define UPMSR_OVF(R, CTR) ((R) >> ((CTR) + UPMSR_OVF_POS) & 0x1)
+#define UPMSR_OVF_MASK    (((UINT64_C(1) << UNCORE_NCTRS) - 1) << UPMSR_OVF_POS)
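A worked decode of an invented UPMSR value, assuming the V2 layout (UPMSR_OVF_POS == 2), makes the bit positions concrete; UPMSR_PMI is the bit-0 test from monotonic.h above.

/*
 * upmsr = 0x15 = 0b1_0101 (value invented for illustration)
 *   bit 0: UPMSR_PMI(upmsr) != 0     -> an uncore PMI is pending
 *   bit 2: UPMSR_OVF(upmsr, 0) == 1  -> counter 0 overflowed
 *   bit 4: UPMSR_OVF(upmsr, 2) == 1  -> counter 2 overflowed
 *
 * (upmsr & UPMSR_OVF_MASK) >> UPMSR_OVF_POS == 0b101 is the per-counter
 * overflow bitmap that uncore_fiq() below uses to pick counters to disable.
 */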
+
+#define UPMPCM "s3_7_c15_c5_4"
+#define UPMPCM_CORE(ID) (UINT64_C(1) << (ID))
+
+/*
+ * The uncore_pmi_mask is a bitmask of CPUs that receive uncore PMIs.  It's
+ * initialized by uncore_init and controllable by the uncore_pmi_mask boot-arg.
+ */
+static int32_t uncore_pmi_mask = 0;
+
+/*
+ * The uncore_active_ctrs is a bitmask of uncore counters that are currently
+ * requested.
+ */
+static uint16_t uncore_active_ctrs = 0;
+static_assert(sizeof(uncore_active_ctrs) * CHAR_BIT >= UNCORE_NCTRS,
+    "counter mask should fit the full range of counters");
+
+/*
+ * mt_uncore_enabled is true when any uncore counters are active.
+ */
+bool mt_uncore_enabled = false;
+
+/*
+ * Each uncore unit has its own monitor, corresponding to the memory hierarchy
+ * of the LLCs.
+ */
+#if UNCORE_PER_CLUSTER
+#define UNCORE_NMONITORS (__ARM_CLUSTER_COUNT__)
+#else /* UNCORE_PER_CLUSTER */
+#define UNCORE_NMONITORS (1)
+#endif /* !UNCORE_PER_CLUSTER */
+
+/*
+ * The uncore_events are the event configurations for each uncore counter -- as
+ * a union to make it easy to program the hardware registers.
+ */
+static struct uncore_config {
+       union {
+               uint8_t uce_ctrs[UNCORE_NCTRS];
+               uint64_t uce_regs[UNCORE_NCTRS / 8];
+       } uc_events;
+       union {
+               uint16_t uccm_masks[UNCORE_NCTRS];
+               uint64_t uccm_regs[UNCORE_NCTRS / 4];
+       } uc_cpu_masks[UNCORE_NMONITORS];
+} uncore_config;
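The unions pack the per-counter fields into whole registers; assuming UNCORE_NCTRS is 16 (the real count is SoC-specific), the aliasing works out as sketched here.

/*
 * Layout sketch, assuming UNCORE_NCTRS == 16:
 *
 *   uc_events.uce_ctrs[0..7]            alias uce_regs[0]  (8 x 8-bit event selectors)
 *   uc_events.uce_ctrs[8..15]           alias uce_regs[1]
 *   uc_cpu_masks[m].uccm_masks[0..3]    alias uccm_regs[0] (4 x 16-bit core masks)
 *   ...
 *   uc_cpu_masks[m].uccm_masks[12..15]  alias uccm_regs[3]
 *
 * uncmon_program_events_locked_l() below writes uce_regs[0]/[1] into
 * UPMESR0/UPMESR1 and uccm_regs[0..3] into UPMECM0..UPMECM3 unchanged.
 */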
+
+static struct uncore_monitor {
+       /*
+        * The last snapshot of each of the hardware counter values.
+        */
+       uint64_t um_snaps[UNCORE_NCTRS];
+
+       /*
+        * The accumulated counts for each counter.
+        */
+       uint64_t um_counts[UNCORE_NCTRS];
+
+       /*
+        * Protects accessing the hardware registers and fields in this structure.
+        */
+       lck_spin_t um_lock;
+
+       /*
+        * Whether this monitor needs its registers restored after wake.
+        */
+       bool um_sleeping;
+} uncore_monitors[UNCORE_NMONITORS];
+
+static unsigned int
+uncmon_get_curid(void)
+{
+#if UNCORE_PER_CLUSTER
+       return cpu_cluster_id();
+#else /* UNCORE_PER_CLUSTER */
+       return 0;
+#endif /* !UNCORE_PER_CLUSTER */
+}
+
+/*
+ * Per-monitor locks are required to prevent races with the PMI handlers, not
+ * with other CPUs doing configuration (those are already serialized by
+ * monotonic's per-device lock).
+ */
+
+static int
+uncmon_lock(struct uncore_monitor *mon)
+{
+       int intrs_en = ml_set_interrupts_enabled(FALSE);
+       lck_spin_lock(&mon->um_lock);
+       return intrs_en;
+}
+
+static void
+uncmon_unlock(struct uncore_monitor *mon, int intrs_en)
+{
+       lck_spin_unlock(&mon->um_lock);
+       (void)ml_set_interrupts_enabled(intrs_en);
+}
+
+/*
+ * Helper functions for accessing the hardware -- these require the monitor be
+ * locked to prevent other CPUs' PMI handlers from making local modifications
+ * or updating the counts.
+ */
+
+#if UNCORE_VERSION >= 2
+#define UPMCR0_INTEN_POS 20
+#define UPMCR0_INTGEN_POS 16
+#else /* UNCORE_VERSION >= 2 */
+#define UPMCR0_INTEN_POS 12
+#define UPMCR0_INTGEN_POS 8
+#endif /* UNCORE_VERSION < 2 */
+enum {
+       UPMCR0_INTGEN_OFF = 0,
+       /* fast PMIs are only supported on core CPMU */
+       UPMCR0_INTGEN_AIC = 2,
+       UPMCR0_INTGEN_HALT = 3,
+       UPMCR0_INTGEN_FIQ = 4,
+};
+/* always enable interrupts for all counters */
+#define UPMCR0_INTEN (((1ULL << UNCORE_NCTRS) - 1) << UPMCR0_INTEN_POS)
+/* route uncore PMIs through the FIQ path */
+#define UPMCR0_INIT (UPMCR0_INTEN | (UPMCR0_INTGEN_FIQ << UPMCR0_INTGEN_POS))
+
+/*
+ * Turn counting on for the counters set in `enctrmask`, and off for the rest.
+ */
+static inline void
+uncmon_set_counting_locked_l(__unused unsigned int monid, uint64_t enctrmask)
+{
+       /*
+        * UPMCR0 controls which counters are enabled and how interrupts are generated
+        * for overflows.
+        */
+#define UPMCR0 "s3_7_c15_c0_4"
+       __builtin_arm_wsr64(UPMCR0, UPMCR0_INIT | enctrmask);
+}
+
+#if UNCORE_PER_CLUSTER
+
+/*
+ * Turn counting on for the counters set in `enctrmask`, and off for the rest.
+ */
+static inline void
+uncmon_set_counting_locked_r(unsigned int monid, uint64_t enctrmask)
+{
+       const uintptr_t upmcr0_offset = 0x4180;
+       *(uint64_t *)(cpm_impl[monid] + upmcr0_offset) = UPMCR0_INIT | enctrmask;
+}
+
+#endif /* UNCORE_PER_CLUSTER */
+
+/*
+ * The uncore performance monitoring counters (UPMCs) are 48 bits wide.  The
+ * high bit is an overflow bit that triggers a PMI, leaving 47 usable bits.
+ */
+
+#define UPMC_MAX ((UINT64_C(1) << 48) - 1)
+
+/*
+ * The `__builtin_arm_{r,w}sr` functions require constant strings, since the
+ * MSR/MRS instructions encode the registers as immediates.  Otherwise, this
+ * would be indexing into an array of strings.
+ */
+
+#define UPMC0 "s3_7_c15_c7_4"
+#define UPMC1 "s3_7_c15_c8_4"
+#define UPMC2 "s3_7_c15_c9_4"
+#define UPMC3 "s3_7_c15_c10_4"
+#define UPMC4 "s3_7_c15_c11_4"
+#define UPMC5 "s3_7_c15_c12_4"
+#define UPMC6 "s3_7_c15_c13_4"
+#define UPMC7 "s3_7_c15_c14_4"
+#if UNCORE_NCTRS > 8
+#define UPMC8  "s3_7_c15_c0_5"
+#define UPMC9  "s3_7_c15_c1_5"
+#define UPMC10 "s3_7_c15_c2_5"
+#define UPMC11 "s3_7_c15_c3_5"
+#define UPMC12 "s3_7_c15_c4_5"
+#define UPMC13 "s3_7_c15_c5_5"
+#define UPMC14 "s3_7_c15_c6_5"
+#define UPMC15 "s3_7_c15_c7_5"
+#endif /* UNCORE_NCTRS > 8 */
+
+#define UPMC_0_7(X, A) X(0, A); X(1, A); X(2, A); X(3, A); X(4, A); X(5, A); \
+               X(6, A); X(7, A)
+#if UNCORE_NCTRS <= 8
+#define UPMC_ALL(X, A) UPMC_0_7(X, A)
+#else /* UNCORE_NCTRS <= 8 */
+#define UPMC_8_15(X, A) X(8, A); X(9, A); X(10, A); X(11, A); X(12, A); \
+               X(13, A); X(14, A); X(15, A)
+#define UPMC_ALL(X, A) UPMC_0_7(X, A); UPMC_8_15(X, A)
+#endif /* UNCORE_NCTRS > 8 */
+
+static inline uint64_t
+uncmon_read_counter_locked_l(__unused unsigned int monid, unsigned int ctr)
+{
+       assert(ctr < UNCORE_NCTRS);
+       switch (ctr) {
+#define UPMC_RD(CTR, UNUSED) case (CTR): return __builtin_arm_rsr64(UPMC ## CTR)
+               UPMC_ALL(UPMC_RD, 0);
+#undef UPMC_RD
+       default:
+               panic("monotonic: invalid counter read %u", ctr);
+               __builtin_unreachable();
+       }
+}
+
+static inline void
+uncmon_write_counter_locked_l(__unused unsigned int monid, unsigned int ctr,
+    uint64_t count)
+{
+       assert(count < UPMC_MAX);
+       assert(ctr < UNCORE_NCTRS);
+       switch (ctr) {
+#define UPMC_WR(CTR, COUNT) case (CTR): \
+               return __builtin_arm_wsr64(UPMC ## CTR, (COUNT))
+               UPMC_ALL(UPMC_WR, count);
+#undef UPMC_WR
+       default:
+               panic("monotonic: invalid counter write %u", ctr);
+       }
+}
+
+#if UNCORE_PER_CLUSTER
+
+static const uint8_t clust_offs[__ARM_CLUSTER_COUNT__] = CPU_CLUSTER_OFFSETS;
+
+uintptr_t upmc_offs[UNCORE_NCTRS] = {
+       [0] = 0x4100, [1] = 0x4248, [2] = 0x4110, [3] = 0x4250, [4] = 0x4120,
+       [5] = 0x4258, [6] = 0x4130, [7] = 0x4260, [8] = 0x4140, [9] = 0x4268,
+       [10] = 0x4150, [11] = 0x4270, [12] = 0x4160, [13] = 0x4278,
+       [14] = 0x4170, [15] = 0x4280,
+};
+
+static inline uint64_t
+uncmon_read_counter_locked_r(unsigned int mon_id, unsigned int ctr)
+{
+       assert(mon_id < __ARM_CLUSTER_COUNT__);
+       assert(ctr < UNCORE_NCTRS);
+       return *(uint64_t *)(cpm_impl[mon_id] + upmc_offs[ctr]);
+}
+
+static inline void
+uncmon_write_counter_locked_r(unsigned int mon_id, unsigned int ctr,
+    uint64_t count)
+{
+       assert(count < UPMC_MAX);
+       assert(ctr < UNCORE_NCTRS);
+       assert(mon_id < __ARM_CLUSTER_COUNT__);
+       *(uint64_t *)(cpm_impl[mon_id] + upmc_offs[ctr]) = count;
+}
+
+#endif /* UNCORE_PER_CLUSTER */
+
+static inline void
+uncmon_update_locked(unsigned int monid, unsigned int curid, unsigned int ctr)
+{
+       struct uncore_monitor *mon = &uncore_monitors[monid];
+       uint64_t snap = 0;
+       if (curid == monid) {
+               snap = uncmon_read_counter_locked_l(monid, ctr);
+       } else {
+#if UNCORE_PER_CLUSTER
+               snap = uncmon_read_counter_locked_r(monid, ctr);
+#endif /* UNCORE_PER_CLUSTER */
+       }
+       /* counters should increase monotonically */
+       assert(snap >= mon->um_snaps[ctr]);
+       mon->um_counts[ctr] += snap - mon->um_snaps[ctr];
+       mon->um_snaps[ctr] = snap;
+}
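An invented trace of a single counter through this snapshot/accumulate step:

/*
 * Invented trace for one counter c:
 *
 *   initially:               um_snaps[c] = 0       um_counts[c] = 0
 *   update sees snap  1000:  um_counts[c] += 1000  um_snaps[c] = 1000
 *   update sees snap 64000:  um_counts[c] += 63000 um_snaps[c] = 64000
 *
 * uncore_fiq() zeroes both the hardware counter and um_snaps[c] after an
 * overflow, so the monotonicity assertion also holds across PMIs.
 */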
+
+static inline void
+uncmon_program_events_locked_l(unsigned int monid)
+{
+       /*
+        * UPMESR[01] is the event selection register that determines which event a
+        * counter will count.
+        */
+#define UPMESR0 "s3_7_c15_c1_4"
+       CTRL_REG_SET(UPMESR0, uncore_config.uc_events.uce_regs[0]);
+
+#if UNCORE_NCTRS > 8
+#define UPMESR1 "s3_7_c15_c11_5"
+       CTRL_REG_SET(UPMESR1, uncore_config.uc_events.uce_regs[1]);
+#endif /* UNCORE_NCTRS > 8 */
+
+       /*
+        * UPMECM[0123] are the event core masks for each counter -- whether or not
+        * that counter counts events generated by an agent.  These are set to all
+        * ones so the uncore counters count events from all cores.
+        *
+        * The bits are based off the start of the cluster -- e.g. even if a core
+        * has a CPU ID of 4, it might be the first CPU in a cluster.  Shift the
+        * registers right by the ID of the first CPU in the cluster.
+        */
+#define UPMECM0 "s3_7_c15_c3_4"
+#define UPMECM1 "s3_7_c15_c4_4"
+
+       CTRL_REG_SET(UPMECM0,
+           uncore_config.uc_cpu_masks[monid].uccm_regs[0]);
+       CTRL_REG_SET(UPMECM1,
+           uncore_config.uc_cpu_masks[monid].uccm_regs[1]);
+
+#if UNCORE_NCTRS > 8
+#define UPMECM2 "s3_7_c15_c8_5"
+#define UPMECM3 "s3_7_c15_c9_5"
+
+       CTRL_REG_SET(UPMECM2,
+           uncore_config.uc_cpu_masks[monid].uccm_regs[2]);
+       CTRL_REG_SET(UPMECM3,
+           uncore_config.uc_cpu_masks[monid].uccm_regs[3]);
+#endif /* UNCORE_NCTRS > 8 */
+}
+
+#if UNCORE_PER_CLUSTER
+
+static inline void
+uncmon_program_events_locked_r(unsigned int monid)
+{
+       const uintptr_t upmesr_offs[2] = {[0] = 0x41b0, [1] = 0x41b8, };
+
+       for (unsigned int i = 0; i < sizeof(upmesr_offs) / sizeof(upmesr_offs[0]);
+           i++) {
+               *(uint64_t *)(cpm_impl[monid] + upmesr_offs[i]) =
+                   uncore_config.uc_events.uce_regs[i];
+       }
+
+       const uintptr_t upmecm_offs[4] = {
+               [0] = 0x4190, [1] = 0x4198, [2] = 0x41a0, [3] = 0x41a8,
+       };
+
+       for (unsigned int i = 0; i < sizeof(upmecm_offs) / sizeof(upmecm_offs[0]);
+           i++) {
+               *(uint64_t *)(cpm_impl[monid] + upmecm_offs[i]) =
+                   uncore_config.uc_cpu_masks[monid].uccm_regs[i];
+       }
+}
+
+#endif /* UNCORE_PER_CLUSTER */
+
+static void
+uncmon_clear_int_locked_l(__unused unsigned int monid)
+{
+       __builtin_arm_wsr64(UPMSR, 0);
+}
+
+#if UNCORE_PER_CLUSTER
+
+static void
+uncmon_clear_int_locked_r(unsigned int monid)
+{
+       const uintptr_t upmsr_off = 0x41c0;
+       *(uint64_t *)(cpm_impl[monid] + upmsr_off) = 0;
+}
+
+#endif /* UNCORE_PER_CLUSTER */
+
+/*
+ * Get the PMI mask for the provided `monid` -- that is, the bitmap of CPUs
+ * that should be sent PMIs for a particular monitor.
+ */
+static uint64_t
+uncmon_get_pmi_mask(unsigned int monid)
+{
+       uint64_t pmi_mask = uncore_pmi_mask;
+
+#if UNCORE_PER_CLUSTER
+       /*
+        * Set up the mask for the high bits.
+        */
+       uint64_t clust_cpumask;
+       if (monid == __ARM_CLUSTER_COUNT__ - 1) {
+               clust_cpumask = UINT64_MAX;
+       } else {
+               clust_cpumask = ((1ULL << clust_offs[monid + 1]) - 1);
+       }
+
+       /*
+        * Mask off the low bits, if necessary.
+        */
+       if (clust_offs[monid] != 0) {
+               clust_cpumask &= ~((1ULL << clust_offs[monid]) - 1);
+       }
+
+       pmi_mask &= clust_cpumask;
+#else /* UNCORE_PER_CLUSTER */
+#pragma unused(monid)
+#endif /* !UNCORE_PER_CLUSTER */
+
+       return pmi_mask;
+}
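For a hypothetical two-cluster part with CPU_CLUSTER_OFFSETS of { 0, 4 } and an invented uncore_pmi_mask of 0x11 (CPUs 0 and 4), the masking above works out as follows:

/*
 * Hypothetical example: clust_offs = { 0, 4 }, uncore_pmi_mask = 0x11.
 *
 *   monid 0: clust_cpumask = (1 << 4) - 1 = 0x0f  -> pmi_mask = 0x01
 *   monid 1: clust_cpumask = ~0x0f (last cluster) -> pmi_mask = 0x10
 *
 * Each monitor ends up interrupting exactly one CPU in its own cluster, which
 * is what the popcount checks in uncore_init() insist on.
 */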
+
+/*
+ * Initialization routines for the uncore counters.
+ */
+
+static void
+uncmon_init_locked_l(unsigned int monid)
+{
+       /*
+        * UPMPCM defines the PMI core mask for the UPMCs -- which cores should
+        * receive interrupts on overflow.
+        */
+       CTRL_REG_SET(UPMPCM, uncmon_get_pmi_mask(monid));
+       uncmon_set_counting_locked_l(monid,
+           mt_uncore_enabled ? uncore_active_ctrs : 0);
+}
+
+#if UNCORE_PER_CLUSTER
+
+static vm_size_t acc_impl_size = 0;
+static uintptr_t acc_impl[__ARM_CLUSTER_COUNT__] = {};
+static uintptr_t acc_impl_phys[__ARM_CLUSTER_COUNT__] = {};
+
+static void
+uncmon_init_locked_r(unsigned int monid)
+{
+       const uintptr_t upmpcm_off = 0x1010;
+
+       *(uint64_t *)(acc_impl[monid] + upmpcm_off) = uncmon_get_pmi_mask(monid);
+       uncmon_set_counting_locked_r(monid,
+           mt_uncore_enabled ? uncore_active_ctrs : 0);
+}
+
+#endif /* UNCORE_PER_CLUSTER */
+
+/*
+ * Initialize the uncore device for monotonic.
+ */
+static int
+uncore_init(__unused mt_device_t dev)
+{
+#if DEVELOPMENT || DEBUG
+       /*
+        * Development and debug kernels observe the `uncore_pmi_mask` boot-arg,
+        * allowing PMIs to be routed to the CPUs present in the supplied bitmap.
+        * Do some sanity checks on the value provided.
+        */
+       bool parsed_arg = PE_parse_boot_argn("uncore_pmi_mask", &uncore_pmi_mask,
+           sizeof(uncore_pmi_mask));
+       if (parsed_arg) {
+#if UNCORE_PER_CLUSTER
+               if (__builtin_popcount(uncore_pmi_mask) != __ARM_CLUSTER_COUNT__) {
+                       panic("monotonic: invalid uncore PMI mask 0x%x", uncore_pmi_mask);
+               }
+               for (unsigned int i = 0; i < __ARM_CLUSTER_COUNT__; i++) {
+                       if (__builtin_popcountll(uncmon_get_pmi_mask(i)) != 1) {
+                               panic("monotonic: invalid uncore PMI CPU for cluster %d in mask 0x%x",
+                                   i, uncore_pmi_mask);
+                       }
+               }
+#else /* UNCORE_PER_CLUSTER */
+               if (__builtin_popcount(uncore_pmi_mask) != 1) {
+                       panic("monotonic: invalid uncore PMI mask 0x%x", uncore_pmi_mask);
+               }
+#endif /* !UNCORE_PER_CLUSTER */
+       } else
+#endif /* DEVELOPMENT || DEBUG */
+       {
+#if UNCORE_PER_CLUSTER
+               for (int i = 0; i < __ARM_CLUSTER_COUNT__; i++) {
+                       /* route to the first CPU in each cluster */
+                       uncore_pmi_mask |= (1ULL << clust_offs[i]);
+               }
+#else /* UNCORE_PER_CLUSTER */
+               /* arbitrarily route to core 0 */
+               uncore_pmi_mask |= 1;
+#endif /* !UNCORE_PER_CLUSTER */
+       }
+       assert(uncore_pmi_mask != 0);
+
+       unsigned int curmonid = uncmon_get_curid();
+
+       for (unsigned int monid = 0; monid < UNCORE_NMONITORS; monid++) {
+#if UNCORE_PER_CLUSTER
+               cpm_impl[monid] = (uintptr_t)ml_io_map(cpm_impl_phys[monid],
+                   cpm_impl_size);
+               assert(cpm_impl[monid] != 0);
+
+               acc_impl[monid] = (uintptr_t)ml_io_map(acc_impl_phys[monid],
+                   acc_impl_size);
+               assert(acc_impl[monid] != 0);
+#endif /* UNCORE_PER_CLUSTER */
+
+               struct uncore_monitor *mon = &uncore_monitors[monid];
+               lck_spin_init(&mon->um_lock, mt_lock_grp, NULL);
+
+               int intrs_en = uncmon_lock(mon);
+               if (monid != curmonid) {
+#if UNCORE_PER_CLUSTER
+                       uncmon_init_locked_r(monid);
+#endif /* UNCORE_PER_CLUSTER */
+               } else {
+                       uncmon_init_locked_l(monid);
+               }
+               uncmon_unlock(mon, intrs_en);
+       }
+
+       mt_uncore_initted = true;
+
+       return 0;
+}
+
+/*
+ * Support for monotonic's mtd_read function.
+ */
+
+static void
+uncmon_read_all_counters(unsigned int monid, unsigned int curmonid,
+    uint64_t ctr_mask, uint64_t *counts)
+{
+       struct uncore_monitor *mon = &uncore_monitors[monid];
+
+       int intrs_en = uncmon_lock(mon);
+
+       for (unsigned int ctr = 0; ctr < UNCORE_NCTRS; ctr++) {
+               if (ctr_mask & (1ULL << ctr)) {
+                       uncmon_update_locked(monid, curmonid, ctr);
+                       counts[ctr] = mon->um_counts[ctr];
+               }
+       }
+
+       uncmon_unlock(mon, intrs_en);
+}
+
+/*
+ * Read all monitor's counters.
+ */
+static int
+uncore_read(uint64_t ctr_mask, uint64_t *counts_out)
+{
+       assert(ctr_mask != 0);
+       assert(counts_out != NULL);
+
+       if (!uncore_active_ctrs) {
+               return EPWROFF;
+       }
+       if (ctr_mask & ~uncore_active_ctrs) {
+               return EINVAL;
+       }
+
+       unsigned int curmonid = uncmon_get_curid();
+       for (unsigned int monid = 0; monid < UNCORE_NMONITORS; monid++) {
+               /*
+                * Find this monitor's starting offset into the `counts_out` array.
+                */
+               uint64_t *counts = counts_out + (UNCORE_NCTRS * monid);
+
+               uncmon_read_all_counters(monid, curmonid, ctr_mask, counts);
+       }
+
+       return 0;
+}
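A minimal caller-side sketch (the wrapper name is invented): the buffer must provide UNCORE_NCTRS slots per monitor, and only the slots selected by the mask are written.

static int
uncore_read_all_example(uint64_t counts[UNCORE_NMONITORS * UNCORE_NCTRS])
{
	/*
	 * On success, counts[monid * UNCORE_NCTRS + ctr] holds the accumulated
	 * value for monitor `monid`, counter `ctr`.  EPWROFF means no counters
	 * are active; EINVAL means the mask named an inactive counter.
	 */
	return uncore_read(uncore_active_ctrs, counts);
}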
+
+/*
+ * Support for monotonic's mtd_add function.
+ */
+
+/*
+ * Add an event to the current uncore configuration.  This doesn't take effect
+ * until the counters are enabled again, so there's no need to involve the
+ * monitors.
+ */
+static int
+uncore_add(struct monotonic_config *config, uint32_t *ctr_out)
+{
+       if (mt_uncore_enabled) {
+               return EBUSY;
+       }
+
+       uint32_t available = ~uncore_active_ctrs & config->allowed_ctr_mask;
+
+       if (available == 0) {
+               return ENOSPC;
+       }
+
+       uint32_t valid_ctrs = (UINT32_C(1) << UNCORE_NCTRS) - 1;
+       if ((available & valid_ctrs) == 0) {
+               return E2BIG;
+       }
+
+       uint32_t ctr = __builtin_ffsll(available) - 1;
+
+       uncore_active_ctrs |= UINT64_C(1) << ctr;
+       uncore_config.uc_events.uce_ctrs[ctr] = config->event;
+       uint64_t cpu_mask = UINT64_MAX;
+       if (config->cpu_mask != 0) {
+               cpu_mask = config->cpu_mask;
+       }
+       for (int i = 0; i < UNCORE_NMONITORS; i++) {
+#if UNCORE_PER_CLUSTER
+               const unsigned int shift = clust_offs[i];
+#else /* UNCORE_PER_CLUSTER */
+               const unsigned int shift = 0;
+#endif /* !UNCORE_PER_CLUSTER */
+               uncore_config.uc_cpu_masks[i].uccm_masks[ctr] = cpu_mask >> shift;
+       }
+
+       *ctr_out = ctr;
+       return 0;
+}
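An invented allocation example, showing how the lowest free counter permitted by the request is chosen:

/*
 * Invented example:
 *
 *   uncore_active_ctrs       = 0b0011  (counters 0 and 1 already claimed)
 *   config->allowed_ctr_mask = 0b1111
 *   available = ~0b0011 & 0b1111 = 0b1100
 *   ctr = __builtin_ffsll(0b1100) - 1 = 2
 *
 * Counter 2 is claimed, config->event lands in uce_ctrs[2], and the requested
 * cpu_mask (all-ones when zero) is recorded per monitor, shifted down to each
 * cluster's first CPU.
 */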
+
+/*
+ * Support for monotonic's mtd_reset function.
+ */
+
+/*
+ * Reset all configuration and disable the counters if they're currently
+ * counting.
+ */
+static void
+uncore_reset(void)
+{
+       mt_uncore_enabled = false;
+
+       unsigned int curmonid = uncmon_get_curid();
+
+       for (unsigned int monid = 0; monid < UNCORE_NMONITORS; monid++) {
+               struct uncore_monitor *mon = &uncore_monitors[monid];
+               bool remote = monid != curmonid;
+
+               int intrs_en = uncmon_lock(mon);
+               if (remote) {
+#if UNCORE_PER_CLUSTER
+                       uncmon_set_counting_locked_r(monid, 0);
+#endif /* UNCORE_PER_CLUSTER */
+               } else {
+                       uncmon_set_counting_locked_l(monid, 0);
+               }
+
+               for (int ctr = 0; ctr < UNCORE_NCTRS; ctr++) {
+                       if (uncore_active_ctrs & (1U << ctr)) {
+                               if (remote) {
+#if UNCORE_PER_CLUSTER
+                                       uncmon_write_counter_locked_r(monid, ctr, 0);
+#endif /* UNCORE_PER_CLUSTER */
+                               } else {
+                                       uncmon_write_counter_locked_l(monid, ctr, 0);
+                               }
+                       }
+               }
+
+               memset(&mon->um_snaps, 0, sizeof(mon->um_snaps));
+               memset(&mon->um_counts, 0, sizeof(mon->um_counts));
+               if (remote) {
+#if UNCORE_PER_CLUSTER
+                       uncmon_clear_int_locked_r(monid);
+#endif /* UNCORE_PER_CLUSTER */
+               } else {
+                       uncmon_clear_int_locked_l(monid);
+               }
+
+               uncmon_unlock(mon, intrs_en);
+       }
+
+       uncore_active_ctrs = 0;
+       memset(&uncore_config, 0, sizeof(uncore_config));
+
+       for (unsigned int monid = 0; monid < UNCORE_NMONITORS; monid++) {
+               struct uncore_monitor *mon = &uncore_monitors[monid];
+               bool remote = monid != curmonid;
+
+               int intrs_en = uncmon_lock(mon);
+               if (remote) {
+#if UNCORE_PER_CLUSTER
+                       uncmon_program_events_locked_r(monid);
+#endif /* UNCORE_PER_CLUSTER */
+               } else {
+                       uncmon_program_events_locked_l(monid);
+               }
+               uncmon_unlock(mon, intrs_en);
+       }
+}
+
+/*
+ * Support for monotonic's mtd_enable function.
+ */
+
+static void
+uncmon_set_enabled_l(unsigned int monid, bool enable)
+{
+       struct uncore_monitor *mon = &uncore_monitors[monid];
+       int intrs_en = uncmon_lock(mon);
+
+       if (enable) {
+               uncmon_program_events_locked_l(monid);
+               uncmon_set_counting_locked_l(monid, uncore_active_ctrs);
+       } else {
+               uncmon_set_counting_locked_l(monid, 0);
+       }
+
+       uncmon_unlock(mon, intrs_en);
+}
+
+#if UNCORE_PER_CLUSTER
+
+static void
+uncmon_set_enabled_r(unsigned int monid, bool enable)
+{
+       struct uncore_monitor *mon = &uncore_monitors[monid];
+       int intrs_en = uncmon_lock(mon);
+
+       if (enable) {
+               uncmon_program_events_locked_r(monid);
+               uncmon_set_counting_locked_r(monid, uncore_active_ctrs);
+       } else {
+               uncmon_set_counting_locked_r(monid, 0);
+       }
+
+       uncmon_unlock(mon, intrs_en);
+}
+
+#endif /* UNCORE_PER_CLUSTER */
+
+static void
+uncore_set_enabled(bool enable)
+{
+       mt_uncore_enabled = enable;
+
+       unsigned int curmonid = uncmon_get_curid();
+       for (unsigned int monid = 0; monid < UNCORE_NMONITORS; monid++) {
+               if (monid != curmonid) {
+#if UNCORE_PER_CLUSTER
+                       uncmon_set_enabled_r(monid, enable);
+#endif /* UNCORE_PER_CLUSTER */
+               } else {
+                       uncmon_set_enabled_l(monid, enable);
+               }
+       }
+}
+
+/*
+ * Hooks in the machine layer.
+ */
+
+static void
+uncore_fiq(uint64_t upmsr)
+{
+       /*
+        * Determine which counters overflowed.
+        */
+       uint64_t disable_ctr_mask = (upmsr & UPMSR_OVF_MASK) >> UPMSR_OVF_POS;
+       /* should not receive interrupts from inactive counters */
+       assert(!(disable_ctr_mask & ~uncore_active_ctrs));
+
+       unsigned int monid = uncmon_get_curid();
+       struct uncore_monitor *mon = &uncore_monitors[monid];
+
+       int intrs_en = uncmon_lock(mon);
+
+       /*
+        * Disable any counters that overflowed.
+        */
+       uncmon_set_counting_locked_l(monid,
+           uncore_active_ctrs & ~disable_ctr_mask);
+
+       /*
+        * With the overflowing counters disabled, capture their counts and reset
+        * the UPMCs and their snapshots to 0.
+        */
+       for (unsigned int ctr = 0; ctr < UNCORE_NCTRS; ctr++) {
+               if (UPMSR_OVF(upmsr, ctr)) {
+                       uncmon_update_locked(monid, monid, ctr);
+                       mon->um_snaps[ctr] = 0;
+                       uncmon_write_counter_locked_l(monid, ctr, 0);
+               }
+       }
+
+       /*
+        * Acknowledge the interrupt, now that any overflowed PMCs have been reset.
+        */
+       uncmon_clear_int_locked_l(monid);
+
+       /*
+        * Re-enable all active counters.
+        */
+       uncmon_set_counting_locked_l(monid, uncore_active_ctrs);
+
+       uncmon_unlock(mon, intrs_en);
+}
+
+static void
+uncore_save(void)
+{
+       if (!uncore_active_ctrs) {
+               return;
+       }
+
+       unsigned int curmonid = uncmon_get_curid();
+
+       for (unsigned int monid = 0; monid < UNCORE_NMONITORS; monid++) {
+               struct uncore_monitor *mon = &uncore_monitors[monid];
+               int intrs_en = uncmon_lock(mon);
+
+               if (mt_uncore_enabled) {
+                       if (monid != curmonid) {
+#if UNCORE_PER_CLUSTER
+                               uncmon_set_counting_locked_r(monid, 0);
+#endif /* UNCORE_PER_CLUSTER */
+                       } else {
+                               uncmon_set_counting_locked_l(monid, 0);
+                       }
+               }
+
+               for (unsigned int ctr = 0; ctr < UNCORE_NCTRS; ctr++) {
+                       if (uncore_active_ctrs & (1U << ctr)) {
+                               uncmon_update_locked(monid, curmonid, ctr);
+                       }
+               }
+
+               mon->um_sleeping = true;
+               uncmon_unlock(mon, intrs_en);
+       }
+}
+
+static void
+uncore_restore(void)
+{
+       if (!uncore_active_ctrs) {
+               return;
+       }
+       unsigned int curmonid = uncmon_get_curid();
+
+       struct uncore_monitor *mon = &uncore_monitors[curmonid];
+       int intrs_en = uncmon_lock(mon);
+       if (!mon->um_sleeping) {
+               goto out;
+       }
+
+       for (unsigned int ctr = 0; ctr < UNCORE_NCTRS; ctr++) {
+               if (uncore_active_ctrs & (1U << ctr)) {
+                       uncmon_write_counter_locked_l(curmonid, ctr, mon->um_snaps[ctr]);
+               }
+       }
+       uncmon_program_events_locked_l(curmonid);
+       uncmon_init_locked_l(curmonid);
+       mon->um_sleeping = false;
+
+out:
+       uncmon_unlock(mon, intrs_en);
+}
+
+static void
+uncore_early_init(void)
+{
+#if UNCORE_PER_CLUSTER
+       /*
+        * Initialize the necessary PIO physical regions from the device tree.
+        */
+       DTEntry armio_entry = NULL;
+       if ((DTFindEntry("name", "arm-io", &armio_entry) != kSuccess)) {
+               panic("unable to find arm-io DT entry");
+       }
+
+       uint64_t *regs;
+       unsigned int regs_size = 0;
+       if (DTGetProperty(armio_entry, "acc-impl", (void **)&regs, &regs_size) !=
+           kSuccess) {
+               panic("unable to find acc-impl DT property");
+       }
+       /*
+        * Two 8-byte values are expected for each cluster -- the physical address
+        * of the region and its size.
+        */
+       const unsigned int expected_size =
+           (typeof(expected_size))sizeof(uint64_t) * __ARM_CLUSTER_COUNT__ * 2;
+       if (regs_size != expected_size) {
+               panic("invalid size for acc-impl DT property");
+       }
+       for (int i = 0; i < __ARM_CLUSTER_COUNT__; i++) {
+               acc_impl_phys[i] = regs[i * 2];
+       }
+       acc_impl_size = regs[1];
+
+       regs_size = 0;
+       if (DTGetProperty(armio_entry, "cpm-impl", (void **)&regs, &regs_size) !=
+           kSuccess) {
+               panic("unable to find cpm-impl property");
+       }
+       if (regs_size != expected_size) {
+               panic("invalid size for cpm-impl DT property");
+       }
+       for (int i = 0; i < __ARM_CLUSTER_COUNT__; i++) {
+               cpm_impl_phys[i] = regs[i * 2];
+       }
+       cpm_impl_size = regs[1];
+#endif /* UNCORE_PER_CLUSTER */
+}
+
+#endif /* HAS_UNCORE_CTRS */
 
 #pragma mark common hooks
 
 void
 mt_early_init(void)
 {
+#if HAS_UNCORE_CTRS
+       uncore_early_init();
+#endif /* HAS_UNCORE_CTRS */
 }
 
 void
@@ -330,11 +1260,19 @@ mt_cpu_up(cpu_data_t *cpu)
 void
 mt_sleep(void)
 {
+#if HAS_UNCORE_CTRS
+       uncore_save();
+#endif /* HAS_UNCORE_CTRS */
 }
 
 void
 mt_wake_per_core(void)
 {
+#if HAS_UNCORE_CTRS
+       if (mt_uncore_initted) {
+               uncore_restore();
+       }
+#endif /* HAS_UNCORE_CTRS */
 }
 
 uint64_t
@@ -439,7 +1377,11 @@ mt_fiq(void *cpu, uint64_t pmcr0, uint64_t upmsr)
        mt_cpu_pmi(cpu, pmcr0);
 #endif /* !CPMU_AIC_PMI */
 
+#if HAS_UNCORE_CTRS
+       uncore_fiq(upmsr);
+#else /* HAS_UNCORE_CTRS */
 #pragma unused(upmsr)
+#endif /* !HAS_UNCORE_CTRS */
 }
 
 static uint32_t mt_xc_sync;
@@ -487,6 +1429,19 @@ struct mt_device mt_devices[] = {
                .mtd_name = "core",
                .mtd_init = core_init,
        },
+#if HAS_UNCORE_CTRS
+       [1] = {
+               .mtd_name = "uncore",
+               .mtd_init = uncore_init,
+               .mtd_add = uncore_add,
+               .mtd_reset = uncore_reset,
+               .mtd_enable = uncore_set_enabled,
+               .mtd_read = uncore_read,
+
+               .mtd_nmonitors = UNCORE_NMONITORS,
+               .mtd_ncounters = UNCORE_NCTRS,
+       }
+#endif /* HAS_UNCORE_CTRS */
 };
 
 static_assert(
index 740a6391528c153b0b44875e2c31becfb184f592..71c1230f8450fe7ee21fee4342d70c4a1b279909 100644 (file)
@@ -104,7 +104,7 @@ _pinst_set_sctlr:
 
 #endif /* defined(KERNEL_INTEGRITY_KTRR) */
 
-#if defined(KERNEL_INTEGRITY_KTRR)
+#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
 
        .text
        .section        __LAST,__pinst
@@ -123,5 +123,48 @@ _pinst_spsel_1:
        check_instruction x2, x3, __pinst_spsel_1, 0xd65f03c0d50041bf
        b __pinst_spsel_1
 
-#endif /* defined(KERNEL_INTEGRITY_KTRR)*/
+#if __APRR_SUPPORTED__
+
+/*
+ * APRR registers aren't covered by VMSA lockdown, so we'll keep these
+ * gadgets in pinst for protection against undesired execution. 
+ */
+
+       .text
+       .section        __LAST,__pinst
+       .align 2
+
+__pinst_set_aprr_el0:
+       msr             APRR_EL0, x0
+       ret
+
+__pinst_set_aprr_el1:
+       msr             APRR_EL1, x0
+       ret
+
+__pinst_set_aprr_shadow_mask_en_el1:
+       msr             APRR_SHADOW_MASK_EN_EL1, x0
+       ret
+
+       .text
+       .section        __TEXT_EXEC,__text
+       .align 2
+
+       .globl _pinst_set_aprr_el0
+_pinst_set_aprr_el0:
+       check_instruction x2, x3, __pinst_set_aprr_el0, 0xd65f03c0d51cf200
+       b __pinst_set_aprr_el0
+
+       .globl _pinst_set_aprr_el1
+_pinst_set_aprr_el1:
+       check_instruction x2, x3, __pinst_set_aprr_el1, 0xd65f03c0d51cf220
+       b __pinst_set_aprr_el1
+
+       .globl _pinst_set_aprr_shadow_mask_en_el1
+_pinst_set_aprr_shadow_mask_en_el1:
+       check_instruction x2, x3, __pinst_set_aprr_shadow_mask_en_el1, 0xd65f03c0d51cf2c0
+       b __pinst_set_aprr_shadow_mask_en_el1
+#endif /* __APRR_SUPPORTED__ */
+#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
 
index 8523c57ab2c2eda1ef81e7f13a82e656577fc579..2fd98c9f7545d67fb6bb72402abf47dde86ff833 100644 (file)
@@ -89,6 +89,10 @@ kern_return_t arm64_late_pan_test(void);
 #include <ptrauth.h>
 kern_return_t arm64_ropjop_test(void);
 #endif
+#if defined(KERNEL_INTEGRITY_CTRR)
+kern_return_t ctrr_test(void);
+kern_return_t ctrr_test_cpu(void);
+#endif
 #if HAS_TWO_STAGE_SPR_LOCK
 kern_return_t arm64_spr_lock_test(void);
 extern void arm64_msr_lock_test(uint64_t);
@@ -270,6 +274,46 @@ lt_upgrade_downgrade_rw()
        lck_rw_done(&lt_rwlock);
 }
 
+#if __AMP__
+const int limit = 1000000;
+static int lt_stress_local_counters[MAX_CPUS];
+
+lck_ticket_t lt_ticket_lock;
+
+static void
+lt_stress_ticket_lock()
+{
+       int local_counter = 0;
+
+       uint cpuid = current_processor()->cpu_id;
+
+       kprintf("%s>cpu %d starting\n", __FUNCTION__, cpuid);
+
+       lck_ticket_lock(&lt_ticket_lock);
+       lt_counter++;
+       local_counter++;
+       lck_ticket_unlock(&lt_ticket_lock);
+
+       while (lt_counter < lt_target_done_threads) {
+               ;
+       }
+
+       kprintf("%s>cpu %d started\n", __FUNCTION__, cpuid);
+
+       while (lt_counter < limit) {
+               lck_ticket_lock(&lt_ticket_lock);
+               if (lt_counter < limit) {
+                       lt_counter++;
+                       local_counter++;
+               }
+               lck_ticket_unlock(&lt_ticket_lock);
+       }
+
+       lt_stress_local_counters[cpuid] = local_counter;
+
+       kprintf("%s>final counter %d cpu %d incremented the counter %d times\n", __FUNCTION__, lt_counter, cpuid, local_counter);
+}
+#endif
 
 static void
 lt_grab_hw_lock()
@@ -571,6 +615,106 @@ lt_start_lock_thread(thread_continue_t func)
        thread_deallocate(thread);
 }
 
+#if __AMP__
+static void
+lt_bound_thread(void *arg, wait_result_t wres __unused)
+{
+       void (*func)(void) = (void (*)(void))arg;
+
+       int cpuid = OSIncrementAtomic((volatile SInt32 *)&lt_cpu_bind_id);
+
+       processor_t processor = processor_list;
+       while ((processor != NULL) && (processor->cpu_id != cpuid)) {
+               processor = processor->processor_list;
+       }
+
+       if (processor != NULL) {
+               thread_bind(processor);
+       }
+
+       thread_block(THREAD_CONTINUE_NULL);
+
+       func();
+
+       OSIncrementAtomic((volatile SInt32*) &lt_done_threads);
+}
+
+static void
+lt_e_thread(void *arg, wait_result_t wres __unused)
+{
+       void (*func)(void) = (void (*)(void))arg;
+
+       thread_t thread = current_thread();
+
+       spl_t s = splsched();
+       thread_lock(thread);
+       thread->sched_flags |= TH_SFLAG_ECORE_ONLY;
+       thread_unlock(thread);
+       splx(s);
+
+       thread_block(THREAD_CONTINUE_NULL);
+
+       func();
+
+       OSIncrementAtomic((volatile SInt32*) &lt_done_threads);
+}
+
+static void
+lt_p_thread(void *arg, wait_result_t wres __unused)
+{
+       void (*func)(void) = (void (*)(void))arg;
+
+       thread_t thread = current_thread();
+
+       spl_t s = splsched();
+       thread_lock(thread);
+       thread->sched_flags |= TH_SFLAG_PCORE_ONLY;
+       thread_unlock(thread);
+       splx(s);
+
+       thread_block(THREAD_CONTINUE_NULL);
+
+       func();
+
+       OSIncrementAtomic((volatile SInt32*) &lt_done_threads);
+}
+
+static void
+lt_start_lock_thread_e(thread_continue_t func)
+{
+       thread_t thread;
+       kern_return_t kr;
+
+       kr = kernel_thread_start(lt_e_thread, func, &thread);
+       assert(kr == KERN_SUCCESS);
+
+       thread_deallocate(thread);
+}
+
+static void
+lt_start_lock_thread_p(thread_continue_t func)
+{
+       thread_t thread;
+       kern_return_t kr;
+
+       kr = kernel_thread_start(lt_p_thread, func, &thread);
+       assert(kr == KERN_SUCCESS);
+
+       thread_deallocate(thread);
+}
+
+static void
+lt_start_lock_thread_bound(thread_continue_t func)
+{
+       thread_t thread;
+       kern_return_t kr;
+
+       kr = kernel_thread_start(lt_bound_thread, func, &thread);
+       assert(kr == KERN_SUCCESS);
+
+       thread_deallocate(thread);
+}
+#endif
 
 static kern_return_t
 lt_test_locks()
@@ -762,6 +906,47 @@ lt_test_locks()
        lt_wait_for_lock_test_threads();
        T_EXPECT_EQ_UINT(lt_counter, LOCK_TEST_ITERATIONS * lt_target_done_threads, NULL);
 
+#if __AMP__
+       /* Ticket locks stress test */
+       T_LOG("Running Ticket locks stress test with lck_ticket_lock()");
+       extern unsigned int real_ncpus;
+       lck_ticket_init(&lt_ticket_lock);
+       lt_reset();
+       lt_target_done_threads = real_ncpus;
+       for (processor_t processor = processor_list; processor != NULL; processor = processor->processor_list) {
+               lt_start_lock_thread_bound(lt_stress_ticket_lock);
+       }
+       lt_wait_for_lock_test_threads();
+       bool starvation = false;
+       uint total_local_count = 0;
+       for (processor_t processor = processor_list; processor != NULL; processor = processor->processor_list) {
+               starvation = starvation || (lt_stress_local_counters[processor->cpu_id] < 10);
+               total_local_count += lt_stress_local_counters[processor->cpu_id];
+       }
+       if (total_local_count != lt_counter) {
+               T_FAIL("Lock failure\n");
+       } else if (starvation) {
+               T_FAIL("Lock starvation found\n");
+       } else {
+               T_PASS("Ticket locks stress test with lck_ticket_lock()");
+       }
+
+       /* AMP ticket locks stress test */
+       T_LOG("Running AMP Ticket locks stress test bound to clusters with lck_ticket_lock()");
+       lt_reset();
+       lt_target_done_threads = real_ncpus;
+       for (processor_t processor = processor_list; processor != NULL; processor = processor->processor_list) {
+               processor_set_t pset = processor->processor_set;
+               if (pset->pset_cluster_type == PSET_AMP_P) {
+                       lt_start_lock_thread_p(lt_stress_ticket_lock);
+               } else if (pset->pset_cluster_type == PSET_AMP_E) {
+                       lt_start_lock_thread_e(lt_stress_ticket_lock);
+               } else {
+                       lt_start_lock_thread(lt_stress_ticket_lock);
+               }
+       }
+       lt_wait_for_lock_test_threads();
+#endif
 
        /* HW locks: trylocks */
        T_LOG("Running test with hw_lock_try()");
@@ -1198,6 +1383,136 @@ arm64_munger_test()
        return 0;
 }
 
+#if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
+SECURITY_READ_ONLY_LATE(uint64_t) ctrr_ro_test;
+uint64_t ctrr_nx_test = 0xd65f03c0; /* RET */
+volatile uint64_t ctrr_exception_esr;
+vm_offset_t ctrr_test_va;
+vm_offset_t ctrr_test_page;
+
+kern_return_t
+ctrr_test(void)
+{
+       processor_t p;
+       boolean_t ctrr_disable = FALSE;
+
+       PE_parse_boot_argn("-unsafe_kernel_text", &ctrr_disable, sizeof(ctrr_disable));
+
+       if (ctrr_disable) {
+               T_LOG("Skipping CTRR test when -unsafe_kernel_text boot-arg present");
+               return KERN_SUCCESS;
+       }
+
+       T_LOG("Running CTRR test.");
+
+       for (p = processor_list; p != NULL; p = p->processor_list) {
+               thread_bind(p);
+               thread_block(THREAD_CONTINUE_NULL);
+               T_LOG("Running CTRR test on cpu %d\n", p->cpu_id);
+               ctrr_test_cpu();
+       }
+
+       /* unbind thread from specific cpu */
+       thread_bind(PROCESSOR_NULL);
+       thread_block(THREAD_CONTINUE_NULL);
+
+       return KERN_SUCCESS;
+}
+
+/* test CTRR on a cpu, caller to bind thread to desired cpu */
+/* ctrr_test_page was reserved during bootstrap process */
+kern_return_t
+ctrr_test_cpu(void)
+{
+       ppnum_t ro_pn, nx_pn;
+       uint64_t *ctrr_ro_test_ptr;
+       void (*ctrr_nx_test_ptr)(void);
+       kern_return_t kr;
+       uint64_t prot = 0;
+       extern uint64_t rorgn_begin, rorgn_end;
+       extern vm_offset_t virtual_space_start;
+
+       /* rorgn = [rorgn_begin_va, rorgn_end_va) */
+
+       vm_offset_t rorgn_begin_va = phystokv(rorgn_begin);
+       vm_offset_t rorgn_end_va = phystokv(rorgn_end) + PAGE_SIZE;
+       vm_offset_t ro_test_va = (vm_offset_t)&ctrr_ro_test;
+       vm_offset_t nx_test_va = (vm_offset_t)&ctrr_nx_test;
+
+       T_EXPECT(rorgn_begin_va <= ro_test_va && ro_test_va < rorgn_end_va, "Expect ro_test_va to be inside the CTRR region");
+       T_EXPECT((nx_test_va < rorgn_begin_va) ^ (nx_test_va >= rorgn_end_va), "Expect nx_test_va to be outside the CTRR region");
+
+       ro_pn = pmap_find_phys(kernel_pmap, ro_test_va);
+       nx_pn = pmap_find_phys(kernel_pmap, nx_test_va);
+       T_EXPECT(ro_pn && nx_pn, "Expect ro page number and nx page number to be non zero");
+
+       T_LOG("test virtual page: %p, ctrr_ro_test: %p, ctrr_nx_test: %p, ro_pn: %x, nx_pn: %x ",
+           (void *)ctrr_test_page, &ctrr_ro_test, &ctrr_nx_test, ro_pn, nx_pn);
+
+       prot = pmap_get_arm64_prot(kernel_pmap, ctrr_test_page);
+       T_EXPECT(~prot & ARM_TTE_VALID, "Expect ctrr_test_page to be unmapped");
+
+       T_LOG("Read only region test mapping virtual page %p to CTRR RO page number %d", ctrr_test_page, ro_pn);
+       kr = pmap_enter(kernel_pmap, ctrr_test_page, ro_pn,
+           VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, FALSE);
+       T_EXPECT(kr == KERN_SUCCESS, "Expect pmap_enter of RW mapping to succeed");
+
+       // assert entire mmu prot path (Hierarchical protection model) is NOT RO
+       // fetch effective block level protections from table/block entries
+       prot = pmap_get_arm64_prot(kernel_pmap, ctrr_test_page);
+       T_EXPECT(ARM_PTE_EXTRACT_AP(prot) == AP_RWNA && (prot & ARM_PTE_PNX), "Mapping is EL1 RWNX");
+
+       ctrr_test_va = ctrr_test_page + (ro_test_va & PAGE_MASK);
+       ctrr_ro_test_ptr = (void *)ctrr_test_va;
+
+       T_LOG("Read only region test writing to %p to provoke data abort", ctrr_ro_test_ptr);
+
+       // should cause data abort
+       *ctrr_ro_test_ptr = 1;
+
+       // ensure write permission fault at expected level
+       // data abort handler will set ctrr_exception_esr when ctrr_test_va takes a permission fault
+
+       T_EXPECT(ESR_EC(ctrr_exception_esr) == ESR_EC_DABORT_EL1, "Data Abort from EL1 expected");
+       T_EXPECT(ISS_DA_FSC(ESR_ISS(ctrr_exception_esr)) == FSC_PERMISSION_FAULT_L3, "Permission Fault Expected");
+       T_EXPECT(ESR_ISS(ctrr_exception_esr) & ISS_DA_WNR, "Write Fault Expected");
+
+       ctrr_test_va = 0;
+       ctrr_exception_esr = 0;
+       pmap_remove(kernel_pmap, ctrr_test_page, ctrr_test_page + PAGE_SIZE);
+
+       T_LOG("No execute test mapping virtual page %p to CTRR PXN page number %d", ctrr_test_page, nx_pn);
+
+       kr = pmap_enter(kernel_pmap, ctrr_test_page, nx_pn,
+           VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, FALSE);
+       T_EXPECT(kr == KERN_SUCCESS, "Expect pmap_enter of RX mapping to succeed");
+
+       // assert entire mmu prot path (Hierarchical protection model) is NOT XN
+       prot = pmap_get_arm64_prot(kernel_pmap, ctrr_test_page);
+       T_EXPECT(ARM_PTE_EXTRACT_AP(prot) == AP_RONA && (~prot & ARM_PTE_PNX), "Mapping is EL1 ROX");
+
+       ctrr_test_va = ctrr_test_page + (nx_test_va & PAGE_MASK);
+       ctrr_nx_test_ptr = (void *)ctrr_test_va;
+
+       T_LOG("No execute test calling ctrr_nx_test_ptr(): %p to provoke instruction abort", ctrr_nx_test_ptr);
+
+#if __has_feature(ptrauth_calls)
+       // must sign before calling if we're creating function pointers out of thin air
+       ctrr_nx_test_ptr = ptrauth_sign_unauthenticated(ctrr_nx_test_ptr, ptrauth_key_function_pointer, 0);
+#endif
+       // should cause prefetch abort
+       ctrr_nx_test_ptr();
+
+       // TODO: ensure execute permission fault at expected level
+       T_EXPECT(ESR_EC(ctrr_exception_esr) == ESR_EC_IABORT_EL1, "Instruction abort from EL1 Expected");
+       T_EXPECT(ISS_DA_FSC(ESR_ISS(ctrr_exception_esr)) == FSC_PERMISSION_FAULT_L3, "Permission Fault Expected");
+
+       ctrr_test_va = 0;
+       ctrr_exception_esr = 0;
+       pmap_remove(kernel_pmap, ctrr_test_page, ctrr_test_page + PAGE_SIZE);
+       return KERN_SUCCESS;
+}
+#endif /* defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST) */
 
 #if HAS_TWO_STAGE_SPR_LOCK
 
index 66a551ffed01e8d1a37d0bb5e7540d22c47c4680..f4d967d1461bf13bb0575c9bd709b28fe43bd538 100644 (file)
  * global mappings would be visible to userspace unless we invalidate them on
  * eret.
  */
+#if XNU_MONITOR
+/*
+ * Please note that because we indirect through the thread register in order to
+ * locate the kernel, and because we unmap most of the kernel, the security
+ * model of the PPL is undermined by __ARM_KERNEL_PROTECT__, as we rely on
+ * kernel-controlled data to direct code flow in the exception vectors.
+ *
+ * If we want to ship XNU_MONITOR paired with __ARM_KERNEL_PROTECT__, we will
+ * need to find a performant solution to this problem.
+ */
+#endif
 #endif /* __ARM_KERNEL_PROTECT__ */
 
 /*
@@ -1552,13 +1563,223 @@ typedef enum {
 #define CORESIGHT_REGIONS   4
 #define CORESIGHT_SIZE      0x1000
 
+#if __APRR_SUPPORTED__
+/*
+ * APRR_EL0/APRR_EL1
+ *
+ *  63                 0
+ * +--------------------+
+ * | Attr[15:0]RWX[3:0] |
+ * +--------------------+
+ *
+ * These registers consist of 16 4-bit fields.
+ *
+ * The attribute index consists of the access protection
+ * and execution protections on a mapping.  The index
+ * for a given mapping type is constructed as follows.
+ *
+ * Attribute Index
+ *
+ *     3       2      1     0
+ * +-------+-------+-----+----+
+ * | AP[1] | AP[0] | PXN | XN |
+ * +-------+-------+-----+----+
+ *
+ * The attribute for a given index determines what
+ * protections are disabled for that mappings type
+ * (protections beyond the scope of the standard ARM
+ * protections for a mapping cannot be granted via
+ * APRR).
+ *
+ * Attribute
+ *
+ *       3      2   1   0
+ * +----------+---+---+---+
+ * | Reserved | R | W | X |
+ * +----------+---+---+---+
+ *
+ * Where:
+ *   R: Read is allowed.
+ *   W: Write is allowed.
+ *   X: Execute is allowed.
+ */
+
+#define APRR_IDX_XN  (1ULL)
+#define APRR_IDX_PXN (2ULL)
+
+
+#define APRR_IDX_XN_SHIFT (0ULL)
+#define APRR_IDX_PXN_SHIFT  (1ULL)
+#define APRR_IDX_APSHIFT   (2ULL)
+
+#endif /* __APRR_SUPPORTED__ */
+
+
+#if __APRR_SUPPORTED__
+
+#define APRR_ATTR_X (1ULL)
+#define APRR_ATTR_W (2ULL)
+#define APRR_ATTR_R (4ULL)
+
+#define APRR_ATTR_WX  (APRR_ATTR_W | APRR_ATTR_X)
+#define APRR_ATTR_RX  (APRR_ATTR_R | APRR_ATTR_X)
+#define APRR_ATTR_RWX (APRR_ATTR_R | APRR_ATTR_W | APRR_ATTR_X)
+
+#define APRR_ATTR_NONE (0ULL)
+#define APRR_ATTR_MASK (APRR_ATTR_RWX)
+
+#define APRR_RESERVED_MASK (0x8888888888888888ULL)
+#endif /* __APRR_SUPPORTED__ */
+
+#if __APRR_SUPPORTED__
+#define XPRR_FIRM_RX_PERM  (0ULL)
+#define XPRR_PPL_RW_PERM   (1ULL)
+#define XPRR_FIRM_RO_PERM  (2ULL)
+#define XPRR_KERN_RW_PERM  (3ULL)
+#define XPRR_FIRM_RW_PERM  (4ULL)
+#define XPRR_USER_JIT_PERM (5ULL)
+#define XPRR_KERN0_RW_PERM (6ULL)
+#define XPRR_USER_RW_PERM  (7ULL)
+#define XPRR_PPL_RX_PERM   (8ULL)
+#define XPRR_PPL_RO_PERM   (9ULL)
+#define XPRR_KERN_RX_PERM  (10ULL)
+#define XPRR_KERN_RO_PERM  (11ULL)
+#define XPRR_KERN0_RX_PERM (12ULL)
+#define XPRR_USER_RX_PERM  (13ULL)
+#define XPRR_KERN0_RO_PERM (14ULL)
+#define XPRR_USER_RO_PERM  (15ULL)
+#define XPRR_MAX_PERM      (15ULL)
+
+#define XPRR_VERSION_NONE    (0ULL)
+#define XPRR_VERSION_APRR    (1ULL)
+
+
+#endif /* __APRR_SUPPORTED__ */
+
+#if __APRR_SUPPORTED__
+/* Indices for attributes, named based on how we intend to use them. */
+#define APRR_FIRM_RX_INDEX  (0ULL)  /* AP_RWNA, PX, X */
+#define APRR_FIRM_RO_INDEX  (1ULL)  /* AP_RWNA, PX, XN */
+#define APRR_PPL_RW_INDEX   (2ULL)  /* AP_RWNA, PXN, X */
+#define APRR_KERN_RW_INDEX  (3ULL)  /* AP_RWNA, PXN, XN */
+#define APRR_FIRM_RW_INDEX  (4ULL)  /* AP_RWRW, PX, X */
+#define APRR_KERN0_RW_INDEX (5ULL)  /* AP_RWRW, PX, XN */
+#define APRR_USER_JIT_INDEX (6ULL)  /* AP_RWRW, PXN, X */
+#define APRR_USER_RW_INDEX  (7ULL)  /* AP_RWRW, PXN, XN */
+#define APRR_PPL_RX_INDEX   (8ULL)  /* AP_RONA, PX, X */
+#define APRR_KERN_RX_INDEX  (9ULL)  /* AP_RONA, PX, XN */
+#define APRR_PPL_RO_INDEX   (10ULL) /* AP_RONA, PXN, X */
+#define APRR_KERN_RO_INDEX  (11ULL) /* AP_RONA, PXN, XN */
+#define APRR_KERN0_RX_INDEX (12ULL) /* AP_RORO, PX, X */
+#define APRR_KERN0_RO_INDEX (13ULL) /* AP_RORO, PX, XN */
+#define APRR_USER_RX_INDEX  (14ULL) /* AP_RORO, PXN, X */
+#define APRR_USER_RO_INDEX  (15ULL) /* AP_RORO, PXN, XN */
+#define APRR_MAX_INDEX      (15ULL) /* For sanity checking index values */
+#endif /* __APRR_SUPPORTED__ */
+
+
+#if __APRR_SUPPORTED__
+#define APRR_SHIFT_FOR_IDX(x) \
+       ((x) << 2ULL)
+
+/* Shifts for attributes, named based on how we intend to use them. */
+#define APRR_FIRM_RX_SHIFT  (0ULL)  /* AP_RWNA, PX, X */
+#define APRR_FIRM_RO_SHIFT  (4ULL)  /* AP_RWNA, PX, XN */
+#define APRR_PPL_RW_SHIFT   (8ULL)  /* AP_RWNA, PXN, X */
+#define APRR_KERN_RW_SHIFT  (12ULL) /* AP_RWNA, PXN, XN */
+#define APRR_FIRM_RW_SHIFT  (16ULL) /* AP_RWRW, PX, X */
+#define APRR_KERN0_RW_SHIFT (20ULL) /* AP_RWRW, PX, XN */
+#define APRR_USER_JIT_SHIFT (24ULL) /* AP_RWRW, PXN, X */
+#define APRR_USER_RW_SHIFT  (28ULL) /* AP_RWRW, PXN, XN */
+#define APRR_PPL_RX_SHIFT   (32ULL) /* AP_RONA, PX, X */
+#define APRR_KERN_RX_SHIFT  (36ULL) /* AP_RONA, PX, XN */
+#define APRR_PPL_RO_SHIFT   (40ULL) /* AP_RONA, PXN, X */
+#define APRR_KERN_RO_SHIFT  (44ULL) /* AP_RONA, PXN, XN */
+#define APRR_KERN0_RX_SHIFT (48ULL) /* AP_RORO, PX, X */
+#define APRR_KERN0_RO_SHIFT (52ULL) /* AP_RORO, PX, XN */
+#define APRR_USER_RX_SHIFT  (56ULL) /* AP_RORO, PXN, X */
+#define APRR_USER_RO_SHIFT  (60ULL) /* AP_RORO, PXN, XN */
+
+#define ARM_PTE_APRR_MASK \
+       (ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)
+
+#define ARM_PTE_XPRR_MASK ARM_PTE_APRR_MASK
+
+#define APRR_INDEX_TO_PTE(x) \
+       ((pt_entry_t) \
+        (((x) & 0x8) ? ARM_PTE_AP(0x2) : 0) | \
+        (((x) & 0x4) ? ARM_PTE_AP(0x1) : 0) | \
+        (((x) & 0x2) ? ARM_PTE_PNX : 0) | \
+        (((x) & 0x1) ? ARM_PTE_NX : 0))
+
+#define PTE_TO_APRR_INDEX(x) \
+       ((ARM_PTE_EXTRACT_AP(x) << APRR_IDX_APSHIFT) | \
+       (((x) & ARM_PTE_PNXMASK) ? APRR_IDX_PXN : 0) | \
+       (((x) & ARM_PTE_NXMASK) ? APRR_IDX_XN : 0))
+
+#endif /* __APRR_SUPPORTED__ */
+
+#if __APRR_SUPPORTED__
+
+#define APRR_EXTRACT_IDX_ATTR(_aprr_value, _idx) \
+       (((_aprr_value) >> APRR_SHIFT_FOR_IDX(_idx)) & APRR_ATTR_MASK)
+
+#define APRR_REMOVE(x) (~(x))
+
+#define APRR_EL1_UNRESTRICTED (0x4455445566666677ULL)
+
+#define APRR_EL1_RESET \
+       APRR_EL1_UNRESTRICTED
+
+#define APRR_EL1_BASE \
+       APRR_EL1_UNRESTRICTED
+
+#if XNU_MONITOR
+#define APRR_EL1_DEFAULT \
+       (APRR_EL1_BASE & \
+        (APRR_REMOVE((APRR_ATTR_WX << APRR_PPL_RW_SHIFT) | \
+        (APRR_ATTR_WX << APRR_PPL_RO_SHIFT) | \
+        (APRR_ATTR_WX << APRR_PPL_RX_SHIFT))))
+
+#define APRR_EL1_PPL \
+       (APRR_EL1_BASE & \
+        (APRR_REMOVE((APRR_ATTR_X << APRR_PPL_RW_SHIFT) | \
+        (APRR_ATTR_WX << APRR_PPL_RO_SHIFT) | \
+        (APRR_ATTR_W << APRR_PPL_RX_SHIFT))))
+#else
+#define APRR_EL1_DEFAULT \
+       APRR_EL1_BASE
+#endif
 
+#define APRR_EL0_UNRESTRICTED (0x4545010167670101ULL)
 
+#define APRR_EL0_RESET \
+       APRR_EL0_UNRESTRICTED
 
+#if XNU_MONITOR
+#define APRR_EL0_BASE \
+       (APRR_EL0_UNRESTRICTED & \
+        (APRR_REMOVE((APRR_ATTR_RWX << APRR_PPL_RW_SHIFT) | \
+        (APRR_ATTR_RWX << APRR_PPL_RX_SHIFT) | \
+        (APRR_ATTR_RWX << APRR_PPL_RO_SHIFT))))
+#else
+#define APRR_EL0_BASE \
+       APRR_EL0_UNRESTRICTED
+#endif
 
+#define APRR_EL0_JIT_RW \
+       (APRR_EL0_BASE & APRR_REMOVE(APRR_ATTR_X << APRR_USER_JIT_SHIFT))
 
+#define APRR_EL0_JIT_RX \
+       (APRR_EL0_BASE & APRR_REMOVE(APRR_ATTR_W << APRR_USER_JIT_SHIFT))
 
+#define APRR_EL0_JIT_RWX \
+       APRR_EL0_BASE
 
+#define APRR_EL0_DEFAULT \
+       APRR_EL0_BASE
+
+#endif /* __APRR_SUPPORTED__ */
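Editor's worked example (not part of the diff) of how these masks compose, assuming the macros above are visible where it is compiled. It checks that APRR_EL1_DEFAULT strips write and execute from the PPL slots, so PPL memory reads as RO/XN from the kernel's side, which is the point of the XNU_MONITOR carve-out.

/* Plain kernel mappings keep R|W in the unrestricted configuration:
 * (0x4455445566666677 >> 12) & 0x7 == 0x6. */
_Static_assert(APRR_EXTRACT_IDX_ATTR(APRR_EL1_UNRESTRICTED, APRR_KERN_RW_INDEX)
    == (APRR_ATTR_R | APRR_ATTR_W), "kernel RW stays RW");

#if XNU_MONITOR
/* After APRR_REMOVE() of WX from the PPL slots, the kernel's view of PPL
 * pages is read-only and non-executable. */
_Static_assert(APRR_EXTRACT_IDX_ATTR(APRR_EL1_DEFAULT, APRR_PPL_RW_INDEX)
    == APRR_ATTR_R, "PPL RW pages are RO to the kernel");
_Static_assert(APRR_EXTRACT_IDX_ATTR(APRR_EL1_DEFAULT, APRR_PPL_RX_INDEX)
    == APRR_ATTR_R, "PPL RX pages are RO and XN to the kernel");
#endif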
 
 
 /*
@@ -1694,5 +1915,12 @@ b.mi $2                         // Unsigned "strictly less than"
 #define MSR(reg, src)  __asm__ volatile ("msr " reg ", %0" :: "r" (src))
 #define MRS(dest, reg) __asm__ volatile ("mrs %0, " reg : "=r" (dest))
 
+#if XNU_MONITOR
+#define __ARM_PTE_PHYSMAP__ 1
+#define PPL_STATE_KERNEL    0
+#define PPL_STATE_DISPATCH  1
+#define PPL_STATE_PANIC     2
+#define PPL_STATE_EXCEPTION 3
+#endif
 
 #endif /* _ARM64_PROC_REG_H_ */
index 705e31444adf02a3d31bfea04c397b22725f2141..b6a1f10aede0d5e6e717795114e706020b76e95b 100644 (file)
@@ -203,6 +203,8 @@ extern volatile uint32_t spr_lock_exception_esr;
 #define CPU_NAME "Twister"
 #elif defined(APPLEHURRICANE)
 #define CPU_NAME "Hurricane"
+#elif defined(APPLELIGHTNING)
+#define CPU_NAME "Lightning"
 #else
 #define CPU_NAME "Unknown"
 #endif
@@ -222,6 +224,10 @@ extern volatile uint32_t spr_lock_exception_esr;
 #define WT_REASON_REG_VIOLATION  8
 #endif
 
+#if defined(HAS_IPI)
+void cpu_signal_handler(void);
+extern unsigned int gFastIPI;
+#endif /* defined(HAS_IPI) */
 
 extern vm_offset_t static_memory_end;
 
@@ -502,6 +508,18 @@ sleh_synchronous(arm_context_t *context, uint32_t esr, vm_offset_t far)
                thread_exception_return();
 
        case ESR_EC_IABORT_EL1:
+#if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
+               {
+                       extern volatile vm_offset_t ctrr_test_va;
+                       if (ctrr_test_va && far == ctrr_test_va) {
+                               extern volatile uint64_t ctrr_exception_esr;
+                               ctrr_exception_esr = esr;
+                               /* return to the instruction immediately after the call to NX page */
+                               set_saved_state_pc(state, get_saved_state_lr(state));
+                               break;
+                       }
+               }
+#endif
 
                panic_with_thread_kernel_state("Kernel instruction fetch abort", state);
 
@@ -944,7 +962,7 @@ is_translation_fault(fault_status_t status)
        }
 }
 
-#if __ARM_PAN_AVAILABLE__
+#if __ARM_PAN_AVAILABLE__ || defined(KERNEL_INTEGRITY_CTRR)
 static int
 is_permission_fault(fault_status_t status)
 {
@@ -1189,6 +1207,15 @@ handle_kernel_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_ad
                 * when running with KTRR.
                 */
 
+#if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
+               extern volatile vm_offset_t ctrr_test_va;
+               if (ctrr_test_va && fault_addr == ctrr_test_va && is_permission_fault(fault_code)) {
+                       extern volatile uint64_t ctrr_exception_esr;
+                       ctrr_exception_esr = esr;
+                       add_saved_state_pc(state, 4);
+                       return;
+               }
+#endif
 
 #if __ARM_PAN_AVAILABLE__ && defined(CONFIG_XNUPOST)
                if (is_permission_fault(fault_code) && !(get_saved_state_cpsr(state) & PSR64_PAN) &&
@@ -1497,6 +1524,22 @@ sleh_fiq(arm_saved_state_t *state)
        uint64_t pmcr0 = 0, upmsr = 0;
 #endif /* MONOTONIC_FIQ */
 
+#if defined(HAS_IPI)
+       boolean_t    is_ipi = FALSE;
+       uint64_t     ipi_sr = 0;
+
+       if (gFastIPI) {
+               MRS(ipi_sr, ARM64_REG_IPI_SR);
+
+               if (ipi_sr & 1) {
+                       is_ipi = TRUE;
+               }
+       }
+
+       if (is_ipi) {
+               type = DBG_INTR_TYPE_IPI;
+       } else
+#endif /* defined(HAS_IPI) */
 #if MONOTONIC_FIQ
        if (mt_pmi_pending(&pmcr0, &upmsr)) {
                type = DBG_INTR_TYPE_PMI;
@@ -1508,6 +1551,21 @@ sleh_fiq(arm_saved_state_t *state)
 
        sleh_interrupt_handler_prologue(state, type);
 
+#if defined(HAS_IPI)
+       if (is_ipi) {
+               /*
+                * Order is important here: we must ack the IPI by writing IPI_SR
+                * before we call cpu_signal_handler().  Otherwise, there will be
+                * a window between the completion of pending-signal processing in
+                * cpu_signal_handler() and the ack during which a newly-issued
+                * IPI to this CPU may be lost.  ISB is required to ensure the msr
+                * is retired before execution of cpu_signal_handler().
+                */
+               MSR(ARM64_REG_IPI_SR, ipi_sr);
+               __builtin_arm_isb(ISB_SY);
+               cpu_signal_handler();
+       } else
+#endif /* defined(HAS_IPI) */
 #if MONOTONIC_FIQ
        if (type == DBG_INTR_TYPE_PMI) {
                mt_fiq(getCpuDatap(), pmcr0, upmsr);
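Editor's illustration of the ordering constraint documented in the IPI hunk above: if the pending bit were acknowledged only after the handler drained its work, an IPI raised in between would be lost. A platform-neutral sketch with hypothetical names:

#include <stdatomic.h>
#include <stdbool.h>

static _Atomic bool ipi_pending;      /* stand-in for the IPI_SR bit            */
static _Atomic int  pending_signals;  /* work cpu_signal_handler() would drain  */

static void
drain_signals(void)                   /* stand-in for cpu_signal_handler()      */
{
        while (atomic_exchange(&pending_signals, 0) != 0) {
                /* ... process pending cross-CPU signals ... */
        }
}

/* Correct order, mirroring sleh_fiq(): ack first, then drain.  A signal that
 * arrives after the ack re-raises ipi_pending, so the next FIQ still sees it. */
static void
fiq_ipi_path(void)
{
        atomic_store(&ipi_pending, false);  /* MSR ARM64_REG_IPI_SR, x; isb */
        drain_signals();
}

/* Buggy order: draining first leaves a window in which a new signal is queued,
 * and the late ack then clears the only notification for it. */
static void
fiq_ipi_path_racy(void)
{
        drain_signals();
        atomic_store(&ipi_pending, false);  /* may clear a just-raised IPI */
}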
index 4e964ca8bce3b519c6b6c7b36bab23721d55e8cc..a5d29d6c6a3d0a261077ebb11f24de83002194da 100644 (file)
 #endif /* __ARM_KERNEL_PROTECT__ */
 
 
+#if __APRR_SUPPORTED__
+
+.macro MSR_APRR_EL1_X0
+#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
+       bl              EXT(pinst_set_aprr_el1)
+#else
+       msr             APRR_EL1, x0
+#endif
+.endmacro
+
+.macro MSR_APRR_EL0_X0
+#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
+       bl              EXT(pinst_set_aprr_el0)
+#else
+       msr             APRR_EL0, x0
+#endif
+.endmacro
+
+.macro MSR_APRR_SHADOW_MASK_EN_EL1_X0
+#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
+       bl              EXT(pinst_set_aprr_shadow_mask_en_el1)
+#else
+       msr             APRR_SHADOW_MASK_EN_EL1, x0
+#endif
+.endmacro
+
+#endif /* __APRR_SUPPORTED__ */
 
 .macro MSR_VBAR_EL1_X0
 #if defined(KERNEL_INTEGRITY_KTRR)
@@ -128,13 +155,32 @@ LEXT(reset_vector)
        msr             OSLAR_EL1, xzr
        msr             DAIFSet, #(DAIFSC_ALL)                          // Disable all interrupts
 
-#if !(defined(KERNEL_INTEGRITY_KTRR))
+#if !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR))
        // Set low reset vector before attempting any loads
        adrp    x0, EXT(LowExceptionVectorBase)@page
        add     x0, x0, EXT(LowExceptionVectorBase)@pageoff
        msr     VBAR_EL1, x0
 #endif
 
+#if __APRR_SUPPORTED__
+       MOV64   x0, APRR_EL1_DEFAULT
+#if XNU_MONITOR
+       adrp    x4, EXT(pmap_ppl_locked_down)@page
+       ldrb    w5, [x4, #EXT(pmap_ppl_locked_down)@pageoff]
+       cmp             w5, #0
+       b.ne    1f
+
+       // If the PPL is not locked down, we start in PPL mode.
+       MOV64   x0, APRR_EL1_PPL
+1:
+#endif /* XNU_MONITOR */
+
+       MSR_APRR_EL1_X0
+
+       // Load up the default APRR_EL0 value.
+       MOV64   x0, APRR_EL0_DEFAULT
+       MSR_APRR_EL0_X0
+#endif /* __APRR_SUPPORTED__ */
 
 #if defined(KERNEL_INTEGRITY_KTRR)
        /*
@@ -179,7 +225,11 @@ Lskip_ktrr:
        adrp    x19, EXT(ResetHandlerData)@page                 // Get address of the reset handler data
        add             x19, x19, EXT(ResetHandlerData)@pageoff
        mrs             x15, MPIDR_EL1                                          // Load MPIDR to get CPU number
+#if HAS_CLUSTER
+       and             x0, x15, #0xFFFF                                        // CPU number in Affinity0, cluster ID in Affinity1
+#else
        and             x0, x15, #0xFF                                          // CPU number is in MPIDR Affinity Level 0
+#endif
        ldr             x1, [x19, CPU_DATA_ENTRIES]                     // Load start of data entries
        add             x3, x1, MAX_CPUS * 16                           // end addr of data entries = start + (16 * MAX_CPUS)  
 Lcheck_cpu_data_entry:
@@ -194,6 +244,57 @@ Lnext_cpu_data_entry:
        b.eq    Lskip_cpu_reset_handler                         // Not found
        b               Lcheck_cpu_data_entry   // loop
 Lfound_cpu_data_entry:
+#if defined(KERNEL_INTEGRITY_CTRR)
+       /*
+        * Program and lock CTRR if this CPU is a non-boot cluster master. The boot cluster will be
+        * locked in machine_lockdown. pinst insns are protected by VMSA_LOCK.
+        * The A_PXN and A_MMUON_WRPROTECT options provide something close to KTRR behavior.
+        */
+
+       /* spin until bootstrap core has completed machine lockdown */
+       adrp    x17, EXT(lockdown_done)@page
+1:
+       ldr     x18, [x17, EXT(lockdown_done)@pageoff]
+       cbz     x18, 1b
+
+       // load stashed rorgn_begin
+       adrp    x17, EXT(rorgn_begin)@page
+       add             x17, x17, EXT(rorgn_begin)@pageoff
+       ldr             x17, [x17]
+       // if rorgn_begin is zero, we're debugging. skip enabling ctrr
+       cbz             x17, Lskip_ctrr
+
+       // load stashed rorgn_end
+       adrp    x19, EXT(rorgn_end)@page
+       add             x19, x19, EXT(rorgn_end)@pageoff
+       ldr             x19, [x19]
+       cbz             x19, Lskip_ctrr
+
+       mrs             x18, ARM64_REG_CTRR_LOCK_EL1
+       cbnz    x18, Lskip_ctrr  /* don't touch if already locked */
+       ldr             w18, [x21, CLUSTER_MASTER] /* cluster master is unsigned int (32bit) */
+       cbz             w18, Lspin_ctrr_unlocked /* non-cluster master spins if CTRR unlocked (unexpected) */
+       msr             ARM64_REG_CTRR_A_LWR_EL1, x17
+       msr             ARM64_REG_CTRR_A_UPR_EL1, x19
+       mov             x18, #(CTRR_CTL_EL1_A_PXN | CTRR_CTL_EL1_A_MMUON_WRPROTECT)
+       msr             ARM64_REG_CTRR_CTL_EL1, x18
+       mov             x18, #1
+       msr             ARM64_REG_CTRR_LOCK_EL1, x18
+
+
+       isb
+       tlbi    vmalle1
+       dsb     ish
+       isb
+Lspin_ctrr_unlocked:
+       /* we shouldn't ever be here as cpu start is serialized by cluster in cpu_start(),
+        * and first core started in cluster is designated cluster master and locks
+        * both core and cluster. subsequent cores in same cluster will run locked from
+        * the reset vector */
+       mrs             x18, ARM64_REG_CTRR_LOCK_EL1
+       cbz             x18, Lspin_ctrr_unlocked
+Lskip_ctrr:
+#endif
        adrp    x20, EXT(const_boot_args)@page
        add             x20, x20, EXT(const_boot_args)@pageoff
        ldr             x0, [x21, CPU_RESET_HANDLER]            // Call CPU reset handler
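Editor's rough C rendering of the CTRR bring-up sequence above, for readers who do not read A64. Register and bit names follow the assembly; the __mrs()/__msr() wrappers, the spin-loop shape, and the cluster_master field access (taken from the CLUSTER_MASTER offset used above) are assumptions, not kernel API.

/* Sketch only, not a drop-in implementation. */
extern volatile uint64_t lockdown_done;
extern uint64_t rorgn_begin, rorgn_end;

static void
ctrr_secondary_cluster_lockdown(cpu_data_t *cdp)
{
        while (lockdown_done == 0) {
                /* spin until the bootstrap core finishes machine lockdown */
        }
        if (rorgn_begin == 0 || rorgn_end == 0) {
                return;                       /* debugging: leave CTRR disabled */
        }
        if (__mrs(ARM64_REG_CTRR_LOCK_EL1) != 0) {
                return;                       /* already locked, don't touch    */
        }
        if (!cdp->cluster_master) {
                for (;;) { /* unexpected: non-masters must come up locked */ }
        }
        __msr(ARM64_REG_CTRR_A_LWR_EL1, rorgn_begin);
        __msr(ARM64_REG_CTRR_A_UPR_EL1, rorgn_end);
        __msr(ARM64_REG_CTRR_CTL_EL1,
            CTRR_CTL_EL1_A_PXN | CTRR_CTL_EL1_A_MMUON_WRPROTECT);
        __msr(ARM64_REG_CTRR_LOCK_EL1, 1);
        __builtin_arm_isb(ISB_SY);
        /* tlbi vmalle1; dsb ish; isb  -- flush any stale translations */
}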
@@ -210,7 +311,13 @@ Lfound_cpu_data_entry:
        bne             Lskip_cpu_reset_handler
 1:
 
+#if HAS_NEX_PG
+       bl              EXT(set_nex_pg)
+#endif
 
+#if HAS_BP_RET
+       bl              EXT(set_bp_ret)
+#endif
 
 #if __ARM_KERNEL_PROTECT__ && defined(KERNEL_INTEGRITY_KTRR)
        /*
@@ -299,7 +406,7 @@ LEXT(LowExceptionVectorBase)
        b               .
        .align 12, 0
 
-#if defined(KERNEL_INTEGRITY_KTRR)
+#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
 /*
  * Provide a global symbol so that we can narrow the V=P mapping to cover
  * this page during arm_vm_init.
@@ -308,7 +415,7 @@ LEXT(LowExceptionVectorBase)
 .globl EXT(bootstrap_instructions)
 LEXT(bootstrap_instructions)
 
-#endif /* defined(KERNEL_INTEGRITY_KTRR)*/
+#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
        .align 2
        .globl EXT(resume_idle_cpu)
 LEXT(resume_idle_cpu)
@@ -325,13 +432,13 @@ LEXT(start_cpu)
 
        .align 2
 start_cpu:
-#if defined(KERNEL_INTEGRITY_KTRR)
+#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
        // This is done right away in reset vector for pre-KTRR devices
        // Set low reset vector now that we are in the KTRR-free zone
        adrp    x0, EXT(LowExceptionVectorBase)@page
        add             x0, x0, EXT(LowExceptionVectorBase)@pageoff
        MSR_VBAR_EL1_X0
-#endif /* defined(KERNEL_INTEGRITY_KTRR)*/
+#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
 
        // x20 set to BootArgs phys address
        // x21 set to cpu data phys address
@@ -353,7 +460,7 @@ start_cpu:
 
 
        // Set SP_EL1 to exception stack
-#if defined(KERNEL_INTEGRITY_KTRR)
+#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
        mov             x1, lr
        bl              EXT(pinst_spsel_1)
        mov             lr, x1
@@ -494,6 +601,29 @@ LEXT(start_first_cpu)
        add             x0, x0, EXT(LowExceptionVectorBase)@pageoff
        MSR_VBAR_EL1_X0
 
+#if __APRR_SUPPORTED__
+       // Save the LR
+       mov             x1, lr
+
+#if XNU_MONITOR
+       // If the PPL is supported, we start out in PPL mode.
+       MOV64   x0, APRR_EL1_PPL
+#else
+       // Otherwise, we start out in default mode.
+       MOV64   x0, APRR_EL1_DEFAULT
+#endif
+
+       // Set the APRR state for EL1.
+       MSR_APRR_EL1_X0
+
+       // Set the APRR state for EL0.
+       MOV64   x0, APRR_EL0_DEFAULT
+       MSR_APRR_EL0_X0
+
+
+       // Restore the LR.
+       mov     lr, x1
+#endif /* __APRR_SUPPORTED__ */
 
        // Get the kernel memory parameters from the boot args
        ldr             x22, [x20, BA_VIRT_BASE]                        // Get the kernel virt base
@@ -514,7 +644,7 @@ LEXT(start_first_cpu)
        sub             x0, x0, x23
 
        // Set SP_EL1 to exception stack
-#if defined(KERNEL_INTEGRITY_KTRR)
+#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
        bl              EXT(pinst_spsel_1)
 #else
        msr             SPSel, #1
@@ -657,7 +787,7 @@ common_start:
         *      TTBR0 - V=P table @ top of kernel
         *      TTBR1 - KVA table @ top of kernel + 1 page
         */
-#if defined(KERNEL_INTEGRITY_KTRR)
+#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
        /* Note that for KTRR configurations, the V=P map will be modified by
         * arm_vm_init.c.
         */
@@ -1075,9 +1205,186 @@ Lskip_skye_post_a1_workarounds:
 
 #endif /* defined(APPLEMONSOON) */
 
+#if defined(APPLEVORTEX)
 
+       ARM64_IS_PCORE x15
 
+       // Skip if not P-core
+       cbz             x15, Lskip_cyprus_pcore_only
 
+       mrs             x12, ARM64_REG_HID1
+
+       mrs             x13, MIDR_EL1
+       ubfx            x14, x13, #MIDR_EL1_PNUM_SHIFT, #12
+       // Should be applied to all Aruba variants, but only Cyprus variants B0 and later
+       cmp             x14, #0xb       // Part number 11 => Cyprus, 16 => Aruba
+       bne             Lbr_kill
+       ubfx            x14, x13, #MIDR_EL1_VAR_SHIFT, #4
+       cbz             x14, Lskip_br_kill              // variant 0 => Cyprus AX, 1 => Cyprus BX
+
+Lbr_kill:
+
+       // rdar://problem/36716477: data corruption due to incorrect branch predictor resolution
+       orr             x12, x12, ARM64_REG_HID1_enaBrKillLimit
+
+Lskip_br_kill:
+
+       // rdar://problem/34435356: segfaults due to IEX clock-gating
+       orr             x12, x12, ARM64_REG_HID1_rccForceAllIexL3ClksOn
+       msr             ARM64_REG_HID1, x12
+
+#if ARM64_BOARD_CONFIG_T8027
+       // rdar://problem/40695685: Enable BIF fill buffer stall logic to prevent skid buffer overflow (Aruba A1 only)
+       mrs             x12, ARM64_REG_HID5
+       orr             x12, x12, ARM64_REG_HID5_EnableDnFIFORdStall
+       msr             ARM64_REG_HID5, x12
+
+#endif /* ARM64_BOARD_CONFIG_T8027 */
+
+       // Prevent ordered loads from being dispatched from LSU until all prior loads have completed. 
+       // rdar://problem/34095873: AF2 ordering rules allow ARM device ordering violations
+       mrs             x12, ARM64_REG_HID4
+       orr             x12, x12, ARM64_REG_HID4_ForceNsOrdLdReqNoOlderLd
+       msr             ARM64_REG_HID4, x12
+
+       // rdar://problem/38482968: [Cyprus Tunable] Poisoned cache line crossing younger load is not redirected by older load-barrier
+       mrs             x12, ARM64_REG_HID3
+       orr             x12, x12, ARM64_REG_HID3_DisColorOpt
+       msr             ARM64_REG_HID3, x12
+
+       // rdar://problem/41056604: disable faster launches of uncacheable unaligned stores to workaround load/load ordering violation
+       mrs             x12, ARM64_REG_HID11
+       orr             x12, x12, ARM64_REG_HID11_DisX64NTLnchOpt
+       msr             ARM64_REG_HID11, x12
+
+       b               Lskip_cyprus_ecore_only
+
+Lskip_cyprus_pcore_only:
+
+       // Prevent ordered loads from being dispatched from LSU until all prior loads have completed. 
+       // rdar://problem/34095873: AF2 ordering rules allow ARM device ordering violations
+       mrs             x12, ARM64_REG_EHID4
+       orr             x12, x12, ARM64_REG_HID4_ForceNsOrdLdReqNoOlderLd
+       msr             ARM64_REG_EHID4, x12
+
+       // rdar://problem/36595004: Poisoned younger load is not redirected by older load-acquire
+       mrs             x12, ARM64_REG_EHID3
+       orr             x12, x12, ARM64_REG_EHID3_DisColorOpt
+       msr             ARM64_REG_EHID3, x12
+
+       // rdar://problem/37949166: Disable the extension of prefetcher training pipe clock gating, revert to default gating
+       mrs             x12, ARM64_REG_EHID10
+       orr             x12, x12, ARM64_REG_EHID10_rccDisPwrSavePrfClkOff
+       msr             ARM64_REG_EHID10, x12
+
+Lskip_cyprus_ecore_only:
+
+#endif /* defined (APPLEVORTEX) */
+
+#if defined(ARM64_BOARD_CONFIG_T8030)
+       // Cebu <B0 is deprecated and unsupported (see rdar://problem/42835678)
+       SKIP_IF_CPU_VERSION_LESS_THAN x12, LIGHTNING_CPU_VERSION_B0, .
+
+       ARM64_IS_PCORE x15
+
+       // Skip if not P-core
+       cbz             x15, Lskip_cebu_pcore_only
+
+       // rdar://problem/50664291: [Cebu B0/B1 Tunables][PerfVerif][LSU] Post-silicon tuning of STNT widget contiguous counter threshold
+       mrs             x12, ARM64_REG_HID4
+       and             x12, x12, ~ARM64_REG_HID4_CnfCntrThresh_mask
+       orr             x12, x12, 3 << ARM64_REG_HID4_CnfCntrThresh_shift
+       msr             ARM64_REG_HID4, x12
+
+       mrs             x12, ARM64_REG_HID9
+       // rdar://problem/47744434: Barrier Load Ordering property is not satisfied for x64-loads
+       orr             x12, x12, ARM64_REG_HID9_EnableFixBug47221499
+       // rdar://problem/50664291: [Cebu B0/B1 Tunables][PerfVerif][LSU] Post-silicon tuning of STNT widget contiguous counter threshold
+       orr             x12, x12, ARM64_REG_HID9_DisSTNTWidgetForUnalign
+       msr             ARM64_REG_HID9, x12
+
+       // rdar://problem/47865629: RF bank and Multipass conflict forward progress widget does not handle 3+ cycle livelock
+       mrs             x12, ARM64_REG_HID16
+       orr             x12, x12, ARM64_REG_HID16_EnRs4Sec
+       and             x12, x12, ~ARM64_REG_HID16_DisxPickRs45
+       orr             x12, x12, ARM64_REG_HID16_EnMPxPick45
+       orr             x12, x12, ARM64_REG_HID16_EnMPCyc7
+       msr             ARM64_REG_HID16, x12
+
+       mrs             x12, ARM64_REG_HID4
+       // Prevent ordered loads from being dispatched from LSU until all prior loads have completed.
+       // rdar://problem/34095873: AF2 ordering rules allow ARM device ordering violations
+       orr             x12, x12, ARM64_REG_HID4_ForceNsOrdLdReqNoOlderLd
+       // rdar://problem/51690962: Disable Store-Non-Temporal downgrade widget
+       orr             x12, x12, ARM64_REG_HID4_DisSTNTWidget
+       msr             ARM64_REG_HID4, x12
+
+       // rdar://problem/41056604: disable faster launches of uncacheable unaligned stores to workaround load/load ordering violation
+       mrs             x12, ARM64_REG_HID11
+       orr             x12, x12, ARM64_REG_HID11_DisX64NTLnchOpt
+       msr             ARM64_REG_HID11, x12
+
+       // rdar://problem/41029832: configure dummy cycles to work around incorrect temp sensor readings on NEX power gating
+       mrs             x12, ARM64_REG_HID13
+       and             x12, x12, ~ARM64_REG_HID13_PreCyc_mask
+       orr             x12, x12, 4 << ARM64_REG_HID13_PreCyc_shift
+       msr             ARM64_REG_HID13, x12
+
+       // rdar://problem/45024523: enable aggressive LEQ throttling to work around LEQ credit leak
+       mrs             x12, ARM64_REG_HID16
+       orr             x12, x12, ARM64_REG_HID16_leqThrottleAggr
+       msr             ARM64_REG_HID16, x12
+
+       b               Lskip_cebu_ecore_only
+
+Lskip_cebu_pcore_only:
+
+       // Prevent ordered loads from being dispatched from LSU until all prior loads have completed.
+       // rdar://problem/34095873: AF2 ordering rules allow ARM device ordering violations
+       mrs             x12, ARM64_REG_EHID4
+       orr             x12, x12, ARM64_REG_HID4_ForceNsOrdLdReqNoOlderLd
+       msr             ARM64_REG_EHID4, x12
+
+       // rdar://problem/37949166: Disable the extension of prefetcher training pipe clock gating, revert to default gating
+       mrs             x12, ARM64_REG_EHID10
+       orr             x12, x12, ARM64_REG_EHID10_rccDisPwrSavePrfClkOff
+       msr             ARM64_REG_EHID10, x12
+
+Lskip_cebu_ecore_only:
+#endif /* defined(ARM64_BOARD_CONFIG_T8030) */
+
+#if defined(APPLELIGHTNING)
+       // rdar://54225210 (Incorrect fusing of a direct branch with AMX/EAS instruction at cross-beat location)
+       ARM64_IS_PCORE x15
+       cbz             x15, not_cebu_pcore
+
+       mrs             x12, ARM64_REG_HID0
+       orr             x12, x12, ARM64_REG_HID0_CacheFusionDisable
+       msr             ARM64_REG_HID0, x12
+
+not_cebu_pcore:
+#endif /* defined(APPLELIGHTNING) */
+
+#if defined(APPLELIGHTNING)
+
+       // rdar://53907283 ([Cebu ACC Errata] Sibling Merge in LLC can cause UC load to violate ARM Memory Ordering Rules.)
+       mrs             x12, ARM64_REG_HID5
+       orr             x12, x12, ARM64_REG_HID5_DisFill2cMerge
+       msr             ARM64_REG_HID5, x12
+
+       // Skip if not E-core or not a two-cluster CPU
+#if defined(CPU_CLUSTER_OFFSETS)
+       ARM64_IS_PCORE x15
+       cbnz    x15, Lskip_h12_h13_ecore_only
+
+       // rdar://problem/48476033: Prevent store-to-load forwarding for UC memory to avoid barrier ordering violation
+       mrs             x12, ARM64_REG_EHID10
+       orr             x12, x12, ARM64_REG_EHID10_ForceWStDrainUc
+       msr             ARM64_REG_EHID10, x12
+
+Lskip_h12_h13_ecore_only:
+#endif /* defined(CPU_CLUSTER_OFFSETS) */
+#endif /* defined(APPLELIGHTNING)*/
 
 
 
@@ -1152,6 +1459,9 @@ arm_init_tramp:
 
 
        mov             x19, lr
+#if defined(HAS_VMSA_LOCK)
+       bl              EXT(vmsa_lock)
+#endif
        // Convert CPU data PA to VA and set as first argument
        mov             x0, x21
        bl              EXT(phystokv)
index 4c6f803a06a924f0f5a5e99c6bec6e0935a14e9a..31d2cbbbd9494f9ec667c31d8e296348c156c500 100644 (file)
@@ -155,6 +155,10 @@ osfmk/kern/processor.c             standard
 osfmk/kern/processor_data.c            standard
 osfmk/kern/restartable.c               standard
 osfmk/kern/sched_average.c             standard
+#ifdef __AMP__
+osfmk/kern/sched_amp.c         optional config_sched_multiq
+osfmk/kern/sched_amp_common.c  optional config_sched_multiq
+#endif
 osfmk/kern/sched_dualq.c       optional config_sched_multiq
 osfmk/kern/sched_clutch.c      optional config_clutch
 osfmk/kern/sched_prim.c                standard
index a324da1721e7771b2879c2459162286cbff71d9c..25da7d7069238e8a8ff71bf9d9c1c4260f4242f7 100644 (file)
@@ -706,6 +706,16 @@ machine_trace_thread64(thread_t thread,
 #endif
                                }
 
+#if XNU_MONITOR
+                               vm_offset_t cpu_base = (vm_offset_t)pmap_stacks_start;
+                               vm_offset_t cpu_top = (vm_offset_t)pmap_stacks_end;
+
+                               if (((prevfp >= cpu_base) && (prevfp < cpu_top)) !=
+                                   ((fp >= cpu_base) && (fp < cpu_top))) {
+                                       switched_stacks = TRUE;
+                                       break;
+                               }
+#endif
                        }
 
                        if (!switched_stacks) {
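The new XNU_MONITOR check above flags an unwind step that crosses into or out of the PPL stack range. A self-contained illustration of that predicate (editor's sketch):

#include <stdbool.h>
#include <stdint.h>

/* True when exactly one of the two frame pointers lies inside [base, top),
 * i.e. the frame-pointer walk stepped across the PPL stack boundary. */
static bool
crossed_ppl_stacks(uintptr_t prevfp, uintptr_t fp, uintptr_t base, uintptr_t top)
{
        bool prev_in = (prevfp >= base) && (prevfp < top);
        bool cur_in  = (fp >= base) && (fp < top);
        return prev_in != cur_in;
}

/* e.g. crossed_ppl_stacks(0x9000, 0x5000, 0x8000, 0xA000) == true */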
index 6801e0f310c2646b582c8be2a0ca3bd3db359464..e885cee9b8e526cce3aa9c005df98e0403b0667a 100644 (file)
@@ -87,7 +87,9 @@
 #include <os/log.h>
 
 uint32_t        hz_tick_interval = 1;
+#if !HAS_CONTINUOUS_HWCLOCK
 static uint64_t has_monotonic_clock = 0;
+#endif
 
 decl_simple_lock_data(, clock_lock);
 lck_grp_attr_t * settime_lock_grp_attr;
@@ -234,6 +236,7 @@ bintime2nsclock(const struct bintime *_bt, clock_sec_t *secs, clock_usec_t *nano
        *nanosecs = ((uint64_t)NSEC_PER_SEC * (uint32_t)(_bt->frac >> 32)) >> 32;
 }
 
+#if !defined(HAS_CONTINUOUS_HWCLOCK)
 static __inline void
 bintime2absolutetime(const struct bintime *_bt, uint64_t *abs)
 {
@@ -250,6 +253,7 @@ struct latched_time {
 extern int
 kernel_sysctlbyname(const char *name, void *oldp, size_t *oldlenp, void *newp, size_t newlen);
 
+#endif
 /*
  *     Time of day (calendar) variables.
  *
@@ -270,7 +274,9 @@ static struct clock_calend {
        struct bintime          offset; /* cumulative offset expressed in (sec, 64 bits frac of a second) */
        struct bintime          bintime; /* cumulative offset (it includes bootime) expressed in (sec, 64 bits frac of a second) */
        struct bintime          boottime; /* boot time expressed in (sec, 64 bits frac of a second) */
+#if !HAS_CONTINUOUS_HWCLOCK
        struct bintime          basesleep;
+#endif
 } clock_calend;
 
 static uint64_t ticks_per_sec; /* ticks in a second (expressed in abs time) */
@@ -957,6 +963,7 @@ print_all_clock_variables_internal(const char* func, struct clock_calend* clock_
            func, clock_calend_cp->boottime.sec, clock_calend_cp->boottime.frac,
            (unsigned long)bootime_secs, bootime_microsecs);
 
+#if !HAS_CONTINUOUS_HWCLOCK
        clock_sec_t     basesleep_secs;
        clock_usec_t    basesleep_microsecs;
 
@@ -964,6 +971,7 @@ print_all_clock_variables_internal(const char* func, struct clock_calend* clock_
        os_log(OS_LOG_DEFAULT, "%s basesleep.sec %ld basesleep.frac %llu basesleep_secs %lu basesleep_microsecs %d\n",
            func, clock_calend_cp->basesleep.sec, clock_calend_cp->basesleep.frac,
            (unsigned long)basesleep_secs, basesleep_microsecs);
+#endif
 }
 
 
@@ -1023,6 +1031,7 @@ clock_initialize_calendar(void)
        clock_usec_t            utc_offset_microsecs;
        spl_t                   s;
        struct bintime          bt;
+#if !HAS_CONTINUOUS_HWCLOCK
        struct bintime          monotonic_bt;
        struct latched_time     monotonic_time;
        uint64_t                monotonic_usec_total;
@@ -1030,10 +1039,12 @@ clock_initialize_calendar(void)
        clock_usec_t            microsys2, monotonic_usec;
        size_t                  size;
 
+#endif
        //Get the UTC time and corresponding sys time
        PEGetUTCTimeOfDay(&secs, &microsecs);
        clock_get_system_microtime(&sys, &microsys);
 
+#if !HAS_CONTINUOUS_HWCLOCK
        /*
         * If the platform has a monotonic clock, use kern.monotonicclock_usecs
         * to estimate the sleep/wake time, otherwise use the UTC time to estimate
@@ -1049,6 +1060,7 @@ clock_initialize_calendar(void)
                absolutetime_to_microtime(monotonic_time.mach_time, &sys2, &microsys2);
                os_log(OS_LOG_DEFAULT, "%s system has monotonic clock\n", __func__);
        }
+#endif
 
        s = splclock();
        clock_lock();
@@ -1099,6 +1111,7 @@ clock_initialize_calendar(void)
        clock_calend.s_scale_ns = NSEC_PER_SEC;
        clock_calend.s_adj_nsx = 0;
 
+#if !HAS_CONTINUOUS_HWCLOCK
        if (has_monotonic_clock) {
                monotonic_sec = monotonic_usec_total / (clock_sec_t)USEC_PER_SEC;
                monotonic_usec = monotonic_usec_total % (clock_usec_t)USEC_PER_SEC;
@@ -1111,6 +1124,7 @@ clock_initialize_calendar(void)
                // set basesleep to the difference between the monotonic clock and sys
                clock_calend.basesleep = monotonic_bt;
        }
+#endif
        commpage_update_mach_continuous_time(mach_absolutetime_asleep);
 
 #if DEVELOPMENT || DEBUG
@@ -1132,6 +1146,73 @@ clock_initialize_calendar(void)
 #endif
 }
 
+#if HAS_CONTINUOUS_HWCLOCK
+
+static void
+scale_sleep_time(void)
+{
+       /* Apply the current NTP frequency adjustment to the time slept.
+        * The frequency adjustment remains stable between calls to ntp_adjtime(),
+        * and should thus provide a reasonable approximation of the total adjustment
+        * required for the time slept. */
+       struct bintime sleep_time;
+       uint64_t tick_scale_x, s_scale_ns;
+       int64_t s_adj_nsx;
+       int64_t sleep_adj = ntp_get_freq();
+       if (sleep_adj) {
+               get_scale_factors_from_adj(sleep_adj, &tick_scale_x, &s_scale_ns, &s_adj_nsx);
+               sleep_time = scale_delta(mach_absolutetime_last_sleep, tick_scale_x, s_scale_ns, s_adj_nsx);
+       } else {
+               tick_scale_x = (uint64_t)1 << 63;
+               tick_scale_x /= ticks_per_sec;
+               tick_scale_x *= 2;
+               sleep_time.sec = mach_absolutetime_last_sleep / ticks_per_sec;
+               sleep_time.frac = (mach_absolutetime_last_sleep % ticks_per_sec) * tick_scale_x;
+       }
+       bintime_add(&clock_calend.offset, &sleep_time);
+       bintime_add(&clock_calend.bintime, &sleep_time);
+}
+
+void
+clock_wakeup_calendar(void)
+{
+       spl_t   s;
+
+       s = splclock();
+       clock_lock();
+
+       commpage_disable_timestamp();
+
+       uint64_t abstime = mach_absolute_time();
+       uint64_t total_sleep_time = ml_get_hwclock() - abstime;
+
+       mach_absolutetime_last_sleep = total_sleep_time - mach_absolutetime_asleep;
+       mach_absolutetime_asleep = total_sleep_time;
+
+       scale_sleep_time();
+
+       KERNEL_DEBUG_CONSTANT(
+               MACHDBG_CODE(DBG_MACH_CLOCK, MACH_EPOCH_CHANGE) | DBG_FUNC_NONE,
+               (uintptr_t) mach_absolutetime_last_sleep,
+               (uintptr_t) mach_absolutetime_asleep,
+               (uintptr_t) (mach_absolutetime_last_sleep >> 32),
+               (uintptr_t) (mach_absolutetime_asleep >> 32),
+               0);
+
+       commpage_update_mach_continuous_time(mach_absolutetime_asleep);
+       adjust_cont_time_thread_calls();
+
+       clock_unlock();
+       splx(s);
+
+       host_notify_calendar_change();
+
+#if CONFIG_DTRACE
+       clock_track_calend_nowait();
+#endif
+}
+
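Editor's worked example, with made-up tick counts, of the accounting in the new HAS_CONTINUOUS_HWCLOCK clock_wakeup_calendar() above: the continuous hardware clock keeps ticking through sleep, so its lead over mach_absolute_time() is exactly the cumulative time spent asleep.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        /* Hypothetical tick counts sampled at wake. */
        uint64_t hwclock       = 10000;  /* ml_get_hwclock(): never stops         */
        uint64_t abstime       =  7000;  /* mach_absolute_time(): paused in sleep */
        uint64_t asleep_before =  2000;  /* mach_absolutetime_asleep so far       */

        uint64_t total_sleep = hwclock - abstime;            /* 3000 total asleep */
        uint64_t last_sleep  = total_sleep - asleep_before;  /* 1000 for this nap */

        printf("last sleep = %llu ticks, total asleep = %llu ticks\n",
            (unsigned long long)last_sleep, (unsigned long long)total_sleep);
        return 0;
}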
+#else /* HAS_CONTINUOUS_HWCLOCK */
 
 void
 clock_wakeup_calendar(void)
@@ -1348,6 +1429,7 @@ done:
 #endif
 }
 
+#endif /* !HAS_CONTINUOUS_HWCLOCK */
 
 /*
  *     clock_get_boottime_nanotime:
@@ -1586,6 +1668,9 @@ clock_deadline_for_periodic_event(
 uint64_t
 mach_continuous_time(void)
 {
+#if HAS_CONTINUOUS_HWCLOCK
+       return ml_get_hwclock();
+#else
        while (1) {
                uint64_t read1 = mach_absolutetime_asleep;
                uint64_t absolute = mach_absolute_time();
@@ -1596,11 +1681,15 @@ mach_continuous_time(void)
                        return absolute + read1;
                }
        }
+#endif
 }
 
 uint64_t
 mach_continuous_approximate_time(void)
 {
+#if HAS_CONTINUOUS_HWCLOCK
+       return ml_get_hwclock();
+#else
        while (1) {
                uint64_t read1 = mach_absolutetime_asleep;
                uint64_t absolute = mach_approximate_time();
@@ -1611,6 +1700,7 @@ mach_continuous_approximate_time(void)
                        return absolute + read1;
                }
        }
+#endif
 }
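On platforms without the continuous hardware clock, both functions above keep the existing retry loop: re-reading the asleep offset around the timestamp detects a racing wake-up update and retries instead of pairing a stale offset with a fresh timestamp. A generic sketch of that pattern (editor's illustration, hypothetical names):

#include <stdatomic.h>
#include <stdint.h>

extern _Atomic uint64_t asleep_offset;  /* updated at wake, like mach_absolutetime_asleep */
extern uint64_t read_raw_clock(void);   /* like mach_absolute_time()                      */

static uint64_t
continuous_time_sketch(void)
{
        for (;;) {
                uint64_t off1 = atomic_load(&asleep_offset);
                uint64_t raw  = read_raw_clock();
                uint64_t off2 = atomic_load(&asleep_offset);
                if (off1 == off2) {            /* no wake-up raced with us */
                        return raw + off1;
                }
                /* offset changed under us: retry with a consistent pair */
        }
}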
 
 /*
index a4a617d32c7d55e046b1088332d3aae01c77670b..20c95b23ec331c4765933744c675aaa07f99e5b5 100644 (file)
@@ -1413,13 +1413,15 @@ host_security_self(void)
 }
 
 kern_return_t
-host_set_atm_diagnostic_flag(host_priv_t host_priv, uint32_t diagnostic_flag)
+host_set_atm_diagnostic_flag(host_t host, uint32_t diagnostic_flag)
 {
-       if (host_priv == HOST_PRIV_NULL) {
+       if (host == HOST_NULL) {
                return KERN_INVALID_ARGUMENT;
        }
 
-       assert(host_priv == &realhost);
+       if (!IOTaskHasEntitlement(current_task(), "com.apple.private.set-atm-diagnostic-flag")) {
+               return KERN_NO_ACCESS;
+       }
 
 #if CONFIG_ATM
        return atm_set_diagnostic_config(diagnostic_flag);
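With the hunk above, the call is reachable through an unprivileged host port but is gated on the com.apple.private.set-atm-diagnostic-flag entitlement. A hedged user-space sketch of the kind of check the new tests (tests/atm_diagnostic_flag*.c) presumably perform:

#include <mach/mach.h>
#include <mach/mach_error.h>
#include <stdio.h>

int
main(void)
{
        /* Without the entitlement this should now fail with KERN_NO_ACCESS
         * rather than requiring the host-priv port outright. */
        kern_return_t kr = host_set_atm_diagnostic_flag(mach_host_self(), 0);
        printf("host_set_atm_diagnostic_flag -> %s\n", mach_error_string(kr));
        return 0;
}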
index d2e0c17469a9c06fe7392639d33f74d117a9dae2..d29c63124fb87d0aba09e7850dd1f45995a573c7 100644 (file)
@@ -627,8 +627,20 @@ ipc_kobject_alloc_port(
        ipc_kobject_type_t      type,
        ipc_kobject_alloc_options_t     options)
 {
-       ipc_port_t port = ipc_port_alloc_kernel();
+       ipc_port_init_flags_t flags;
+       ipc_space_t space;
+       ipc_port_t port;
 
+       if (options & IPC_KOBJECT_ALLOC_IN_TRANSIT) {
+               /* kobject port intended to be copied out to user-space */
+               flags = IPC_PORT_INIT_MESSAGE_QUEUE;
+               space = IS_NULL;
+       } else {
+               /* true kernel-bound kobject port */
+               flags = IPC_PORT_INIT_NONE;
+               space = ipc_space_kernel;
+       }
+       port = ipc_port_alloc_special(space, flags);
        if (port == IP_NULL) {
                panic("ipc_kobject_alloc_port(): failed to allocate port");
        }
@@ -638,16 +650,29 @@ ipc_kobject_alloc_port(
        if (options & IPC_KOBJECT_ALLOC_MAKE_SEND) {
                ipc_port_make_send_locked(port);
        }
-       if (options & IPC_KOBJECT_ALLOC_NSREQUEST) {
-               ipc_port_make_sonce_locked(port);
-               port->ip_nsrequest = port;
-       }
-       if (options & IPC_KOBJECT_ALLOC_NO_GRANT) {
-               port->ip_no_grant = 1;
+
+       if (options & IPC_KOBJECT_ALLOC_IN_TRANSIT) {
+               /* reset the port as if it had been copied in and circularity-checked */
+               if (options & IPC_KOBJECT_ALLOC_NSREQUEST) {
+                       panic("ipc_kobject_alloc_port(): invalid option for user-space port");
+               }
+               port->ip_mscount = 0;
+               assert(port->ip_tempowner == 0);
+               assert(port->ip_receiver == IS_NULL);
+               port->ip_receiver = IS_NULL;
+               port->ip_receiver_name = MACH_PORT_NULL;
+       } else {
+               if (options & IPC_KOBJECT_ALLOC_NSREQUEST) {
+                       ipc_port_make_sonce_locked(port);
+                       port->ip_nsrequest = port;
+               }
        }
        if (options & IPC_KOBJECT_ALLOC_IMMOVABLE_SEND) {
                port->ip_immovable_send = 1;
        }
+       if (options & IPC_KOBJECT_ALLOC_NO_GRANT) {
+               port->ip_no_grant = 1;
+       }
 
        return port;
 }
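Editor's sketch of how the new IPC_KOBJECT_ALLOC_IN_TRANSIT option is meant to be used; the mk_timer_create_trap() hunk later in this commit is the in-tree adopter, and `obj` here is a placeholder kobject.

/* Allocate a kobject port that is born in-transit with a send right, then
 * copy the receive right out to the calling task (shape mirrors mk_timer). */
ipc_port_t port = ipc_kobject_alloc_port((ipc_kobject_t)obj, IKOT_TIMER,
    IPC_KOBJECT_ALLOC_IN_TRANSIT | IPC_KOBJECT_ALLOC_MAKE_SEND);

mach_port_name_t name;
kern_return_t kr = ipc_object_copyout(current_space(), ip_to_object(port),
    MACH_MSG_TYPE_MOVE_RECEIVE, NULL, NULL, &name);
if (kr != KERN_SUCCESS) {
        ipc_object_destroy(ip_to_object(port), MACH_MSG_TYPE_MOVE_RECEIVE);
}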
index 4431f29ca9f88da1c4df8bf08b9ef995ac3b2648..24913d602da931bbd751ce4e4457e8d1599dcf68 100644 (file)
@@ -174,6 +174,8 @@ __options_decl(ipc_kobject_alloc_options_t, uint32_t, {
        IPC_KOBJECT_ALLOC_NO_GRANT  = 0x00000004,
        /* Make all the send rights immovable */
        IPC_KOBJECT_ALLOC_IMMOVABLE_SEND = 0x00000008,
+       /* Make the port in-transit from the get-go */
+       IPC_KOBJECT_ALLOC_IN_TRANSIT = 0x00000010,
 });
 
 /* Allocates a kobject port, never fails */
index e571487faff1f1dbb696c4f96cfc89ccdd0c90ac..7ff3981a7c10b59e76a7491684e2bdd7ff6a555a 100644 (file)
@@ -664,7 +664,7 @@ kern_stack_snapshot_internal(int stackshot_config_version, void *stackshot_confi
        stackshotbuf_size = get_stackshot_estsize(size_hint);
 
        for (; stackshotbuf_size <= max_tracebuf_size; stackshotbuf_size <<= 1) {
-               if (kmem_alloc(kernel_map, (vm_offset_t *)&stackshotbuf, stackshotbuf_size, VM_KERN_MEMORY_DIAG) != KERN_SUCCESS) {
+               if (kmem_alloc_flags(kernel_map, (vm_offset_t *)&stackshotbuf, stackshotbuf_size, VM_KERN_MEMORY_DIAG, KMA_ZERO) != KERN_SUCCESS) {
                        error = KERN_RESOURCE_SHORTAGE;
                        goto error_exit;
                }
index 8de9c9012cf57260872fd372b9c30fd0bc629829..e7780c2e32ad49e8d5b2a5f6af05c1599696db83 100644 (file)
@@ -73,29 +73,40 @@ mk_timer_create_trap(
                return MACH_PORT_NULL;
        }
 
-       result = mach_port_allocate_internal(myspace, MACH_PORT_RIGHT_RECEIVE,
-           &mk_timer_qos, &name);
-       if (result == KERN_SUCCESS) {
-               result = ipc_port_translate_receive(myspace, name, &port);
-       }
-
-       if (result != KERN_SUCCESS) {
+       /* Pre-allocate a kmsg for the timer messages */
+       ipc_kmsg_t kmsg;
+       kmsg = ipc_kmsg_prealloc(mk_timer_qos.len + MAX_TRAILER_SIZE);
+       if (kmsg == IKM_NULL) {
                zfree(mk_timer_zone, timer);
-
                return MACH_PORT_NULL;
        }
 
+       /* Allocate an in-transit kobject port with a send right */
+       ipc_kobject_alloc_options_t options;
+       options = (IPC_KOBJECT_ALLOC_IN_TRANSIT | IPC_KOBJECT_ALLOC_MAKE_SEND);
+       port = ipc_kobject_alloc_port((ipc_kobject_t)timer, IKOT_TIMER, options);
+       assert(port != IP_NULL);
+
+       /* Associate the kmsg */
+       ipc_kmsg_set_prealloc(kmsg, port);
+
+       /* Initialize the timer object and bind port to it */
        simple_lock_init(&timer->lock, 0);
        thread_call_setup(&timer->call_entry, mk_timer_expire, timer);
        timer->is_armed = timer->is_dead = FALSE;
        timer->active = 0;
-
        timer->port = port;
-       ipc_kobject_set_atomically(port, (ipc_kobject_t)timer, IKOT_TIMER);
 
-       port->ip_srights++;
-       ip_reference(port);
-       ip_unlock(port);
+       /* Copyout the receive right for the timer port to user-space */
+       current_thread()->ith_knote = ITH_KNOTE_NULL;
+       result = ipc_object_copyout(myspace, ip_to_object(port),
+           MACH_MSG_TYPE_MOVE_RECEIVE,
+           NULL, NULL, &name);
+       if (result != KERN_SUCCESS) {
+               ipc_object_destroy(ip_to_object(port), MACH_MSG_TYPE_MOVE_RECEIVE);
+               /* should trigger mk_timer_port_destroy() call */
+               return MACH_PORT_NULL;
+       }
 
        return name;
 }
index 06e54544c4d332ec32e51518e3000b43c04177d1..faac9b224aedb0ce8dc48086ce38da4198658835 100644 (file)
@@ -147,6 +147,10 @@ typedef enum {
 
 typedef enum {
        PSET_SMP,
+#if __AMP__
+       PSET_AMP_E,
+       PSET_AMP_P,
+#endif
 } pset_cluster_type_t;
 
 typedef bitmap_t cpumap_t;
diff --git a/osfmk/kern/sched_amp.c b/osfmk/kern/sched_amp.c
new file mode 100644 (file)
index 0000000..50c3810
--- /dev/null
@@ -0,0 +1,768 @@
+/*
+ * Copyright (c) 2016 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <mach/mach_types.h>
+#include <mach/machine.h>
+
+#include <machine/machine_routines.h>
+#include <machine/sched_param.h>
+#include <machine/machine_cpu.h>
+
+#include <kern/kern_types.h>
+#include <kern/debug.h>
+#include <kern/machine.h>
+#include <kern/misc_protos.h>
+#include <kern/processor.h>
+#include <kern/queue.h>
+#include <kern/sched.h>
+#include <kern/sched_prim.h>
+#include <kern/task.h>
+#include <kern/thread.h>
+#include <kern/thread_group.h>
+#include <kern/sched_amp_common.h>
+
+#include <sys/kdebug.h>
+
+#if __AMP__
+
+static thread_t
+sched_amp_steal_thread(processor_set_t pset);
+
+static void
+sched_amp_thread_update_scan(sched_update_scan_context_t scan_context);
+
+static boolean_t
+sched_amp_processor_enqueue(processor_t processor, thread_t thread,
+    sched_options_t options);
+
+static boolean_t
+sched_amp_processor_queue_remove(processor_t processor, thread_t thread);
+
+static ast_t
+sched_amp_processor_csw_check(processor_t processor);
+
+static boolean_t
+sched_amp_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);
+
+static int
+sched_amp_runq_count(processor_t processor);
+
+static boolean_t
+sched_amp_processor_queue_empty(processor_t processor);
+
+static uint64_t
+sched_amp_runq_stats_count_sum(processor_t processor);
+
+static int
+sched_amp_processor_bound_count(processor_t processor);
+
+static void
+sched_amp_pset_init(processor_set_t pset);
+
+static void
+sched_amp_processor_init(processor_t processor);
+
+static thread_t
+sched_amp_choose_thread(processor_t processor, int priority, ast_t reason);
+
+static void
+sched_amp_processor_queue_shutdown(processor_t processor);
+
+static sched_mode_t
+sched_amp_initial_thread_sched_mode(task_t parent_task);
+
+static processor_t
+sched_amp_choose_processor(processor_set_t pset, processor_t processor, thread_t thread);
+
+static bool
+sched_amp_thread_avoid_processor(processor_t processor, thread_t thread);
+
+static bool
+sched_amp_thread_should_yield(processor_t processor, thread_t thread);
+
+static void
+sched_amp_thread_group_recommendation_change(struct thread_group *tg, cluster_type_t new_recommendation);
+
+const struct sched_dispatch_table sched_amp_dispatch = {
+       .sched_name                                     = "amp",
+       .init                                           = sched_amp_init,
+       .timebase_init                                  = sched_timeshare_timebase_init,
+       .processor_init                                 = sched_amp_processor_init,
+       .pset_init                                      = sched_amp_pset_init,
+       .maintenance_continuation                       = sched_timeshare_maintenance_continue,
+       .choose_thread                                  = sched_amp_choose_thread,
+       .steal_thread_enabled                           = sched_amp_steal_thread_enabled,
+       .steal_thread                                   = sched_amp_steal_thread,
+       .compute_timeshare_priority                     = sched_compute_timeshare_priority,
+       .choose_processor                               = sched_amp_choose_processor,
+       .processor_enqueue                              = sched_amp_processor_enqueue,
+       .processor_queue_shutdown                       = sched_amp_processor_queue_shutdown,
+       .processor_queue_remove                         = sched_amp_processor_queue_remove,
+       .processor_queue_empty                          = sched_amp_processor_queue_empty,
+       .priority_is_urgent                             = priority_is_urgent,
+       .processor_csw_check                            = sched_amp_processor_csw_check,
+       .processor_queue_has_priority                   = sched_amp_processor_queue_has_priority,
+       .initial_quantum_size                           = sched_timeshare_initial_quantum_size,
+       .initial_thread_sched_mode                      = sched_amp_initial_thread_sched_mode,
+       .can_update_priority                            = can_update_priority,
+       .update_priority                                = update_priority,
+       .lightweight_update_priority                    = lightweight_update_priority,
+       .quantum_expire                                 = sched_default_quantum_expire,
+       .processor_runq_count                           = sched_amp_runq_count,
+       .processor_runq_stats_count_sum                 = sched_amp_runq_stats_count_sum,
+       .processor_bound_count                          = sched_amp_processor_bound_count,
+       .thread_update_scan                             = sched_amp_thread_update_scan,
+       .multiple_psets_enabled                         = TRUE,
+       .sched_groups_enabled                           = FALSE,
+       .avoid_processor_enabled                        = TRUE,
+       .thread_avoid_processor                         = sched_amp_thread_avoid_processor,
+       .processor_balance                              = sched_amp_balance,
+
+       .rt_runq                                        = sched_amp_rt_runq,
+       .rt_init                                        = sched_amp_rt_init,
+       .rt_queue_shutdown                              = sched_amp_rt_queue_shutdown,
+       .rt_runq_scan                                   = sched_amp_rt_runq_scan,
+       .rt_runq_count_sum                              = sched_amp_rt_runq_count_sum,
+
+       .qos_max_parallelism                            = sched_amp_qos_max_parallelism,
+       .check_spill                                    = sched_amp_check_spill,
+       .ipi_policy                                     = sched_amp_ipi_policy,
+       .thread_should_yield                            = sched_amp_thread_should_yield,
+       .run_count_incr                                 = sched_run_incr,
+       .run_count_decr                                 = sched_run_decr,
+       .update_thread_bucket                           = sched_update_thread_bucket,
+       .pset_made_schedulable                          = sched_pset_made_schedulable,
+       .thread_group_recommendation_change             = sched_amp_thread_group_recommendation_change,
+};
+
+extern processor_set_t ecore_set;
+extern processor_set_t pcore_set;
+
+__attribute__((always_inline))
+static inline run_queue_t
+amp_main_runq(processor_t processor)
+{
+       return &processor->processor_set->pset_runq;
+}
+
+__attribute__((always_inline))
+static inline run_queue_t
+amp_bound_runq(processor_t processor)
+{
+       return &processor->runq;
+}
+
+__attribute__((always_inline))
+static inline run_queue_t
+amp_runq_for_thread(processor_t processor, thread_t thread)
+{
+       if (thread->bound_processor == PROCESSOR_NULL) {
+               return amp_main_runq(processor);
+       } else {
+               assert(thread->bound_processor == processor);
+               return amp_bound_runq(processor);
+       }
+}
+
+static sched_mode_t
+sched_amp_initial_thread_sched_mode(task_t parent_task)
+{
+       if (parent_task == kernel_task) {
+               return TH_MODE_FIXED;
+       } else {
+               return TH_MODE_TIMESHARE;
+       }
+}
+
+static void
+sched_amp_processor_init(processor_t processor)
+{
+       run_queue_init(&processor->runq);
+}
+
+static void
+sched_amp_pset_init(processor_set_t pset)
+{
+       run_queue_init(&pset->pset_runq);
+}
+
+static thread_t
+sched_amp_choose_thread(
+       processor_t      processor,
+       int              priority,
+       __unused ast_t            reason)
+{
+       processor_set_t pset = processor->processor_set;
+       bool spill_pending = false;
+       int spill_pri = -1;
+
+       if (pset == ecore_set && bit_test(pset->pending_spill_cpu_mask, processor->cpu_id)) {
+               spill_pending = true;
+               spill_pri = pcore_set->pset_runq.highq;
+       }
+
+       run_queue_t main_runq  = amp_main_runq(processor);
+       run_queue_t bound_runq = amp_bound_runq(processor);
+       run_queue_t chosen_runq;
+
+       if ((bound_runq->highq < priority) &&
+           (main_runq->highq < priority) &&
+           (spill_pri < priority)) {
+               return THREAD_NULL;
+       }
+
+       if ((spill_pri > bound_runq->highq) &&
+           (spill_pri > main_runq->highq)) {
+               /*
+                * There is a higher priority thread on the P-core runq,
+                * so returning THREAD_NULL here will cause thread_select()
+                * to call sched_amp_steal_thread() to try to get it.
+                */
+               return THREAD_NULL;
+       }
+
+       if (bound_runq->highq >= main_runq->highq) {
+               chosen_runq = bound_runq;
+       } else {
+               chosen_runq = main_runq;
+       }
+
+       return run_queue_dequeue(chosen_runq, SCHED_HEADQ);
+}
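Editor's toy model of the precedence just implemented in sched_amp_choose_thread(): the bound and pset run queues compete on highest priority, but a sufficiently high pending spill makes the function return THREAD_NULL so thread_select() steals from the P-core pset instead. Names and numbers below are illustrative only.

#include <stdio.h>

static const char *
amp_choice(int pri, int bound_hi, int main_hi, int spill_pri)
{
        if (bound_hi < pri && main_hi < pri && spill_pri < pri) {
                return "THREAD_NULL (nothing runnable above 'priority')";
        }
        if (spill_pri > bound_hi && spill_pri > main_hi) {
                return "THREAD_NULL (let thread_select() steal from the P-cores)";
        }
        return (bound_hi >= main_hi) ? "dequeue bound runq" : "dequeue pset runq";
}

int
main(void)
{
        printf("%s\n", amp_choice(0, 40, 60, 80));   /* spill wins: steal path   */
        printf("%s\n", amp_choice(0, 40, 60, 50));   /* dequeue pset runq        */
        printf("%s\n", amp_choice(63, 40, 60, 50));  /* nothing above priority 63 */
        return 0;
}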
+
+static boolean_t
+sched_amp_processor_enqueue(
+       processor_t       processor,
+       thread_t          thread,
+       sched_options_t   options)
+{
+       run_queue_t     rq = amp_runq_for_thread(processor, thread);
+       boolean_t       result;
+
+       result = run_queue_enqueue(rq, thread, options);
+       thread->runq = processor;
+
+       return result;
+}
+
+static boolean_t
+sched_amp_processor_queue_empty(processor_t processor)
+{
+       processor_set_t pset = processor->processor_set;
+       bool spill_pending = bit_test(pset->pending_spill_cpu_mask, processor->cpu_id);
+
+       return (amp_main_runq(processor)->count == 0) &&
+              (amp_bound_runq(processor)->count == 0) &&
+              !spill_pending;
+}
+
+static bool
+sched_amp_thread_should_yield(processor_t processor, thread_t thread)
+{
+       if (!sched_amp_processor_queue_empty(processor) || (rt_runq_count(processor->processor_set) > 0)) {
+               return true;
+       }
+
+       if ((processor->processor_set->pset_cluster_type == PSET_AMP_E) && (recommended_pset_type(thread) == PSET_AMP_P)) {
+               return pcore_set->pset_runq.count > 0;
+       }
+
+       return false;
+}
+
+static ast_t
+sched_amp_processor_csw_check(processor_t processor)
+{
+       boolean_t       has_higher;
+       int             pri;
+
+       run_queue_t main_runq  = amp_main_runq(processor);
+       run_queue_t bound_runq = amp_bound_runq(processor);
+
+       assert(processor->active_thread != NULL);
+
+       processor_set_t pset = processor->processor_set;
+       bool spill_pending = false;
+       int spill_pri = -1;
+       int spill_urgency = 0;
+
+       if (pset == ecore_set && bit_test(pset->pending_spill_cpu_mask, processor->cpu_id)) {
+               spill_pending = true;
+               spill_pri = pcore_set->pset_runq.highq;
+               spill_urgency = pcore_set->pset_runq.urgency;
+       }
+
+       pri = MAX(main_runq->highq, bound_runq->highq);
+       if (spill_pending) {
+               pri = MAX(pri, spill_pri);
+       }
+
+       if (processor->first_timeslice) {
+               has_higher = (pri > processor->current_pri);
+       } else {
+               has_higher = (pri >= processor->current_pri);
+       }
+
+       if (has_higher) {
+               if (main_runq->urgency > 0) {
+                       return AST_PREEMPT | AST_URGENT;
+               }
+
+               if (bound_runq->urgency > 0) {
+                       return AST_PREEMPT | AST_URGENT;
+               }
+
+               if (spill_urgency > 0) {
+                       return AST_PREEMPT | AST_URGENT;
+               }
+
+               return AST_PREEMPT;
+       }
+
+       return AST_NONE;
+}
+
+static boolean_t
+sched_amp_processor_queue_has_priority(processor_t    processor,
+    int            priority,
+    boolean_t      gte)
+{
+       bool spill_pending = false;
+       int spill_pri = -1;
+       processor_set_t pset = processor->processor_set;
+
+       if (pset == ecore_set && bit_test(pset->pending_spill_cpu_mask, processor->cpu_id)) {
+               spill_pending = true;
+               spill_pri = pcore_set->pset_runq.highq;
+       }
+       run_queue_t main_runq  = amp_main_runq(processor);
+       run_queue_t bound_runq = amp_bound_runq(processor);
+
+       int qpri = MAX(main_runq->highq, bound_runq->highq);
+       if (spill_pending) {
+               qpri = MAX(qpri, spill_pri);
+       }
+
+       if (gte) {
+               return qpri >= priority;
+       } else {
+               return qpri > priority;
+       }
+}
+
+static int
+sched_amp_runq_count(processor_t processor)
+{
+       return amp_main_runq(processor)->count + amp_bound_runq(processor)->count;
+}
+
+static uint64_t
+sched_amp_runq_stats_count_sum(processor_t processor)
+{
+       uint64_t bound_sum = amp_bound_runq(processor)->runq_stats.count_sum;
+
+       if (processor->cpu_id == processor->processor_set->cpu_set_low) {
+               return bound_sum + amp_main_runq(processor)->runq_stats.count_sum;
+       } else {
+               return bound_sum;
+       }
+}
+static int
+sched_amp_processor_bound_count(processor_t processor)
+{
+       return amp_bound_runq(processor)->count;
+}
+
+static void
+sched_amp_processor_queue_shutdown(processor_t processor)
+{
+       processor_set_t pset = processor->processor_set;
+       run_queue_t     rq   = amp_main_runq(processor);
+       thread_t        thread;
+       queue_head_t    tqueue;
+
+       /* We only need to migrate threads if this is the last active or last recommended processor in the pset */
+       if ((pset->online_processor_count > 0) && pset_is_recommended(pset)) {
+               pset_unlock(pset);
+               return;
+       }
+
+       queue_init(&tqueue);
+
+       while (rq->count > 0) {
+               thread = run_queue_dequeue(rq, SCHED_HEADQ);
+               enqueue_tail(&tqueue, &thread->runq_links);
+       }
+
+       pset_unlock(pset);
+
+       qe_foreach_element_safe(thread, &tqueue, runq_links) {
+               remqueue(&thread->runq_links);
+
+               thread_lock(thread);
+
+               thread_setrun(thread, SCHED_TAILQ);
+
+               thread_unlock(thread);
+       }
+}
+
+static boolean_t
+sched_amp_processor_queue_remove(
+       processor_t processor,
+       thread_t    thread)
+{
+       run_queue_t             rq;
+       processor_set_t         pset = processor->processor_set;
+
+       pset_lock(pset);
+
+       rq = amp_runq_for_thread(processor, thread);
+
+       if (processor == thread->runq) {
+               /*
+                * Thread is on a run queue and we have a lock on
+                * that run queue.
+                */
+               run_queue_remove(rq, thread);
+       } else {
+               /*
+                * The thread left the run queue before we could
+                * lock the run queue.
+                */
+               assert(thread->runq == PROCESSOR_NULL);
+               processor = PROCESSOR_NULL;
+       }
+
+       pset_unlock(pset);
+
+       return processor != PROCESSOR_NULL;
+}
+
+/*
+ * sched_amp_steal_thread()
+ *
+ * Called from an E-core with its pset locked. If the P-cluster is loaded
+ * beyond the steal threshold, pull the highest-priority thread off the
+ * P-cluster runq. Returns with the pset unlocked.
+ */
+thread_t
+sched_amp_steal_thread(processor_set_t pset)
+{
+       thread_t thread = THREAD_NULL;
+       processor_set_t nset = pset;
+
+       assert(pset->pset_cluster_type != PSET_AMP_P);
+
+       processor_t processor = current_processor();
+       assert(pset == processor->processor_set);
+
+       bool spill_pending = bit_test(pset->pending_spill_cpu_mask, processor->cpu_id);
+       bit_clear(pset->pending_spill_cpu_mask, processor->cpu_id);
+
+       nset = pcore_set;
+
+       assert(nset != pset);
+
+       if (sched_get_pset_load_average(nset) >= sched_amp_steal_threshold(nset, spill_pending)) {
+               pset_unlock(pset);
+
+               pset = nset;
+
+               pset_lock(pset);
+
+               /* Allow steal if load average still OK, no idle cores, and more threads on runq than active cores DISPATCHING */
+               if ((sched_get_pset_load_average(pset) >= sched_amp_steal_threshold(pset, spill_pending)) &&
+                   (pset->pset_runq.count > bit_count(pset->cpu_state_map[PROCESSOR_DISPATCHING])) &&
+                   (bit_count(pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_IDLE]) == 0)) {
+                       thread = run_queue_dequeue(&pset->pset_runq, SCHED_HEADQ);
+                       KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_STEAL) | DBG_FUNC_NONE, spill_pending, 0, 0, 0);
+                       sched_update_pset_load_average(pset);
+               }
+       }
+
+       pset_unlock(pset);
+       return thread;
+}
+
+
+
+static void
+sched_amp_thread_update_scan(sched_update_scan_context_t scan_context)
+{
+       boolean_t               restart_needed = FALSE;
+       processor_t             processor = processor_list;
+       processor_set_t         pset;
+       thread_t                thread;
+       spl_t                   s;
+
+       /*
+        *  We update the threads associated with each processor (bound and idle threads)
+        *  and then update the threads in each pset runqueue.
+        */
+
+       do {
+               do {
+                       pset = processor->processor_set;
+
+                       s = splsched();
+                       pset_lock(pset);
+
+                       restart_needed = runq_scan(amp_bound_runq(processor), scan_context);
+
+                       pset_unlock(pset);
+                       splx(s);
+
+                       if (restart_needed) {
+                               break;
+                       }
+
+                       thread = processor->idle_thread;
+                       if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) {
+                               if (thread_update_add_thread(thread) == FALSE) {
+                                       restart_needed = TRUE;
+                                       break;
+                               }
+                       }
+               } while ((processor = processor->processor_list) != NULL);
+
+               /* Ok, we now have a collection of candidates -- fix them. */
+               thread_update_process_threads();
+       } while (restart_needed);
+
+       pset_node_t node = &pset_node0;
+       pset = node->psets;
+
+       do {
+               do {
+                       restart_needed = FALSE;
+                       while (pset != NULL) {
+                               s = splsched();
+                               pset_lock(pset);
+
+                               restart_needed = runq_scan(&pset->pset_runq, scan_context);
+
+                               pset_unlock(pset);
+                               splx(s);
+
+                               if (restart_needed) {
+                                       break;
+                               }
+
+                               pset = pset->pset_list;
+                       }
+
+                       if (restart_needed) {
+                               break;
+                       }
+               } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
+
+               /* Ok, we now have a collection of candidates -- fix them. */
+               thread_update_process_threads();
+       } while (restart_needed);
+}
+
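+/*
+ * pcores_recommended()
+ *
+ * Decide whether this thread should run on the P-cluster: false if no
+ * P-cores are online, true if the E-cluster is not recommended, false if
+ * the thread itself is E-recommended, otherwise follow the P-cluster's
+ * recommendation state.
+ */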
+static bool
+pcores_recommended(thread_t thread)
+{
+       if (pcore_set->online_processor_count == 0) {
+               /* No pcores available */
+               return false;
+       }
+
+       if (!pset_is_recommended(ecore_set)) {
+               /* No E cores recommended, must use P cores */
+               return true;
+       }
+
+       if (recommended_pset_type(thread) == PSET_AMP_E) {
+               return false;
+       }
+
+       return pset_is_recommended(pcore_set);
+}
+
+/* Return true if this thread should not continue running on this processor */
+static bool
+sched_amp_thread_avoid_processor(processor_t processor, thread_t thread)
+{
+       if (processor->processor_set->pset_cluster_type == PSET_AMP_E) {
+               if (pcores_recommended(thread)) {
+                       return true;
+               }
+       } else if (processor->processor_set->pset_cluster_type == PSET_AMP_P) {
+               if (!pcores_recommended(thread)) {
+                       return true;
+               }
+       }
+
+       return false;
+}
+
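+/*
+ * sched_amp_choose_processor()
+ *
+ * Steer the thread to the pset matching its recommendation, re-evaluating
+ * if the chosen pset turns out not to be recommended once its lock is
+ * taken, then defer to the generic choose_processor().
+ */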
+static processor_t
+sched_amp_choose_processor(processor_set_t pset, processor_t processor, thread_t thread)
+{
+       /* Bound threads don't call this function */
+       assert(thread->bound_processor == PROCESSOR_NULL);
+
+       processor_set_t nset = pset;
+       bool choose_pcores;
+
+again:
+       choose_pcores = pcores_recommended(thread);
+
+       if (choose_pcores && (pset->pset_cluster_type != PSET_AMP_P)) {
+               nset = pcore_set;
+               assert(nset != NULL);
+       } else if (!choose_pcores && (pset->pset_cluster_type != PSET_AMP_E)) {
+               nset = ecore_set;
+               assert(nset != NULL);
+       }
+
+       if (nset != pset) {
+               pset_unlock(pset);
+               pset_lock(nset);
+       }
+
+       /* Now that the chosen pset is definitely locked, make sure nothing important has changed */
+       if (!pset_is_recommended(nset)) {
+               pset = nset;
+               goto again;
+       }
+
+       return choose_processor(nset, processor, thread);
+}
+
+void
+sched_amp_thread_group_recommendation_change(struct thread_group *tg, cluster_type_t new_recommendation)
+{
+       thread_group_update_recommendation(tg, new_recommendation);
+
+       if (new_recommendation != CLUSTER_TYPE_P) {
+               return;
+       }
+
+       sched_amp_bounce_thread_group_from_ecores(ecore_set, tg);
+}
+
+#if DEVELOPMENT || DEBUG
+extern int32_t sysctl_get_bound_cpuid(void);
+int32_t
+sysctl_get_bound_cpuid(void)
+{
+       int32_t cpuid = -1;
+       thread_t self = current_thread();
+
+       processor_t processor = self->bound_processor;
+       if (processor == NULL) {
+               cpuid = -1;
+       } else {
+               cpuid = processor->cpu_id;
+       }
+
+       return cpuid;
+}
+
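+/*
+ * Bind the calling thread to the given CPU id (if it names a valid
+ * processor) and block so the thread is rescheduled onto it.
+ */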
+extern void sysctl_thread_bind_cpuid(int32_t cpuid);
+void
+sysctl_thread_bind_cpuid(int32_t cpuid)
+{
+       if (cpuid < 0 || cpuid >= MAX_SCHED_CPUS) {
+               return;
+       }
+
+       processor_t processor = processor_array[cpuid];
+       if (processor == PROCESSOR_NULL) {
+               return;
+       }
+
+       thread_bind(processor);
+
+       thread_block(THREAD_CONTINUE_NULL);
+}
+
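+/*
+ * Report the calling thread's cluster binding: 'E', 'P', or '0' when it is
+ * not restricted to either cluster type.
+ */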
+extern char sysctl_get_bound_cluster_type(void);
+char
+sysctl_get_bound_cluster_type(void)
+{
+       thread_t self = current_thread();
+
+       if (self->sched_flags & TH_SFLAG_ECORE_ONLY) {
+               return 'E';
+       } else if (self->sched_flags & TH_SFLAG_PCORE_ONLY) {
+               return 'P';
+       }
+
+       return '0';
+}
+
+extern void sysctl_thread_bind_cluster_type(char cluster_type);
+void
+sysctl_thread_bind_cluster_type(char cluster_type)
+{
+       thread_bind_cluster_type(cluster_type);
+}
+
+extern char sysctl_get_task_cluster_type(void);
+char
+sysctl_get_task_cluster_type(void)
+{
+       thread_t thread = current_thread();
+       task_t task = thread->task;
+
+       if (task->pset_hint == ecore_set) {
+               return 'E';
+       } else if (task->pset_hint == pcore_set) {
+               return 'P';
+       }
+
+       return '0';
+}
+
+extern void sysctl_task_set_cluster_type(char cluster_type);
+void
+sysctl_task_set_cluster_type(char cluster_type)
+{
+       thread_t thread = current_thread();
+       task_t task = thread->task;
+
+       switch (cluster_type) {
+       case 'e':
+       case 'E':
+               task->pset_hint = ecore_set;
+               break;
+       case 'p':
+       case 'P':
+               task->pset_hint = pcore_set;
+               break;
+       default:
+               break;
+       }
+
+       thread_block(THREAD_CONTINUE_NULL);
+}
+#endif /* DEVELOPMENT || DEBUG */
+
+#endif
diff --git a/osfmk/kern/sched_amp_common.c b/osfmk/kern/sched_amp_common.c
new file mode 100644 (file)
index 0000000..1158090
--- /dev/null
@@ -0,0 +1,592 @@
+/*
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <mach/mach_types.h>
+#include <mach/machine.h>
+#include <machine/machine_routines.h>
+#include <machine/sched_param.h>
+#include <machine/machine_cpu.h>
+#include <kern/kern_types.h>
+#include <kern/debug.h>
+#include <kern/machine.h>
+#include <kern/misc_protos.h>
+#include <kern/processor.h>
+#include <kern/queue.h>
+#include <kern/sched.h>
+#include <kern/sched_prim.h>
+#include <kern/task.h>
+#include <kern/thread.h>
+#include <machine/atomic.h>
+#include <sys/kdebug.h>
+#include <kern/sched_amp_common.h>
+
+#if __AMP__
+
+/* Exported globals */
+processor_set_t ecore_set = NULL;
+processor_set_t pcore_set = NULL;
+
+static struct processor_set pset1;
+static struct pset_node pset_node1;
+
+#if DEVELOPMENT || DEBUG
+bool system_ecore_only = false;
+#endif /* DEVELOPMENT || DEBUG */
+
+/*
+ * sched_amp_init()
+ *
+ * Initialize the pcore_set and ecore_set globals which describe the
+ * P/E processor sets.
+ */
+void
+sched_amp_init(void)
+{
+       pset_init(&pset1, &pset_node1);
+       pset_node1.psets = &pset1;
+       pset_node0.node_list = &pset_node1;
+
+       if (ml_get_boot_cluster() == CLUSTER_TYPE_P) {
+               pcore_set = &pset0;
+               ecore_set = &pset1;
+       } else {
+               ecore_set = &pset0;
+               pcore_set = &pset1;
+       }
+
+       ecore_set->pset_cluster_type = PSET_AMP_E;
+       ecore_set->pset_cluster_id = 0;
+
+       pcore_set->pset_cluster_type = PSET_AMP_P;
+       pcore_set->pset_cluster_id = 1;
+
+#if !CONFIG_SCHED_CLUTCH
+       /*
+        * For non-clutch scheduler, allow system to be e-core only.
+        * Clutch scheduler support for this feature needs to be implemented.
+        */
+#if DEVELOPMENT || DEBUG
+       if (PE_parse_boot_argn("enable_skstsct", NULL, 0)) {
+               system_ecore_only = true;
+       }
+#endif /* DEVELOPMENT || DEBUG */
+
+#endif /* !CONFIG_SCHED_CLUTCH */
+       sched_timeshare_init();
+}
+
+/* Spill threshold load average is ncpus in pset + (sched_amp_spill_count / (1 << PSET_LOAD_FRACTIONAL_SHIFT)) */
+int sched_amp_spill_count = 3;
+int sched_amp_idle_steal = 1;
+int sched_amp_spill_steal = 1;
+
+/*
+ * We see performance gains from using immediate IPIs to P-cores to run
+ * P-eligible threads, and fewer P-to-E migrations from using deferred
+ * IPIs for spill.
+ */
+int sched_amp_spill_deferred_ipi = 1;
+int sched_amp_pcores_preempt_immediate_ipi = 1;
+
+
+/*
+ * sched_amp_spill_threshold()
+ *
+ * Routine to calculate the spill threshold, which decides whether a cluster should spill.
+ */
+int
+sched_amp_spill_threshold(processor_set_t pset)
+{
+       int recommended_processor_count = bit_count(pset->recommended_bitmask & pset->cpu_bitmask);
+
+       return (recommended_processor_count << PSET_LOAD_FRACTIONAL_SHIFT) + sched_amp_spill_count;
+}
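+
+/*
+ * For example, with 4 recommended CPUs in the pset and the default
+ * sched_amp_spill_count of 3, the threshold works out to
+ * (4 << PSET_LOAD_FRACTIONAL_SHIFT) + 3; should_spill_to_ecores() treats
+ * the P-cluster as loaded enough to spill once sched_get_pset_load_average()
+ * meets or exceeds this value.
+ */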
+
+/*
+ * pset_signal_spill()
+ *
+ * Routine to signal a running/idle CPU to cause a spill onto that CPU.
+ * Called with pset locked, returns unlocked
+ */
+void
+pset_signal_spill(processor_set_t pset, int spilled_thread_priority)
+{
+       processor_t processor;
+       sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
+
+       uint64_t idle_map = pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_IDLE];
+       for (int cpuid = lsb_first(idle_map); cpuid >= 0; cpuid = lsb_next(idle_map, cpuid)) {
+               processor = processor_array[cpuid];
+               if (bit_set_if_clear(pset->pending_spill_cpu_mask, processor->cpu_id)) {
+                       KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_SIGNAL_SPILL) | DBG_FUNC_NONE, processor->cpu_id, 0, 0, 0);
+
+                       processor->deadline = UINT64_MAX;
+                       pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
+
+                       if (processor == current_processor()) {
+                               bit_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id);
+                       } else {
+                               ipi_type = sched_ipi_action(processor, NULL, true, SCHED_IPI_EVENT_SPILL);
+                       }
+                       pset_unlock(pset);
+                       sched_ipi_perform(processor, ipi_type);
+                       return;
+               }
+       }
+
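+       /*
+        * No idle CPU accepted the spill; fall back to signalling a running
+        * CPU that has no spill pending and is running a lower-priority,
+        * non-P-recommended thread.
+        */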
+       processor_t ast_processor = NULL;
+       uint64_t running_map = pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_RUNNING];
+       for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
+               processor = processor_array[cpuid];
+               if (processor->current_recommended_pset_type == PSET_AMP_P) {
+                       /* Already running a spilled P-core recommended thread */
+                       continue;
+               }
+               if (bit_test(pset->pending_spill_cpu_mask, processor->cpu_id)) {
+                       /* Already received a spill signal */
+                       continue;
+               }
+               if (processor->current_pri >= spilled_thread_priority) {
+                       /* Already running a higher or equal priority thread */
+                       continue;
+               }
+
+               /* Found a suitable processor */
+               bit_set(pset->pending_spill_cpu_mask, processor->cpu_id);
+               KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_SIGNAL_SPILL) | DBG_FUNC_NONE, processor->cpu_id, 1, 0, 0);
+               if (processor == current_processor()) {
+                       ast_on(AST_PREEMPT);
+               }
+               ipi_type = sched_ipi_action(processor, NULL, false, SCHED_IPI_EVENT_SPILL);
+               if (ipi_type != SCHED_IPI_NONE) {
+                       ast_processor = processor;
+               }
+               break;
+       }
+
+       pset_unlock(pset);
+       sched_ipi_perform(ast_processor, ipi_type);
+}
+
+/*
+ * pset_should_accept_spilled_thread()
+ *
+ * Routine to decide if pset should accept spilled threads.
+ * This function must be safe to call (to use as a hint) without holding the pset lock.
+ */
+bool
+pset_should_accept_spilled_thread(processor_set_t pset, int spilled_thread_priority)
+{
+       if ((pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_IDLE]) != 0) {
+               return true;
+       }
+
+       uint64_t cpu_map = (pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_RUNNING]);
+
+       for (int cpuid = lsb_first(cpu_map); cpuid >= 0; cpuid = lsb_next(cpu_map, cpuid)) {
+               processor_t processor = processor_array[cpuid];
+
+               if (processor->current_recommended_pset_type == PSET_AMP_P) {
+                       /* This processor is already running a spilled thread */
+                       continue;
+               }
+
+               if (processor->current_pri < spilled_thread_priority) {
+                       return true;
+               }
+       }
+
+       return false;
+}
+
+/*
+ * should_spill_to_ecores()
+ *
+ * Spill policy is implemented here
+ */
+bool
+should_spill_to_ecores(processor_set_t nset, thread_t thread)
+{
+       if (nset->pset_cluster_type == PSET_AMP_E) {
+               /* Not relevant if ecores already preferred */
+               return false;
+       }
+
+       if (!pset_is_recommended(ecore_set)) {
+               /* E cores must be recommended */
+               return false;
+       }
+
+#if !CONFIG_SCHED_CLUTCH
+       /* Per-thread P-core scheduling support needs to be implemented for clutch scheduler */
+       if (thread->sched_flags & TH_SFLAG_PCORE_ONLY) {
+               return false;
+       }
+#endif /* !CONFIG_SCHED_CLUTCH */
+
+       if (thread->sched_pri >= BASEPRI_RTQUEUES) {
+               /* Never spill realtime threads */
+               return false;
+       }
+
+       if ((nset->recommended_bitmask & nset->cpu_state_map[PROCESSOR_IDLE]) != 0) {
+               /* Don't spill while the P-cluster still has idle cores */
+               return false;
+       }
+
+       if ((sched_get_pset_load_average(nset) >= sched_amp_spill_threshold(nset)) &&  /* P cores are already loaded past the spill threshold */
+           pset_should_accept_spilled_thread(ecore_set, thread->sched_pri)) { /* and an E core can take a thread at this priority */
+               return true;
+       }
+
+       return false;
+}
+
+/*
+ * sched_amp_check_spill()
+ *
+ * Routine to check if the thread should be spilled and signal the pset if needed.
+ */
+void
+sched_amp_check_spill(processor_set_t pset, thread_t thread)
+{
+       /* pset is unlocked */
+
+       /* Bound threads don't call this function */
+       assert(thread->bound_processor == PROCESSOR_NULL);
+
+       if (should_spill_to_ecores(pset, thread)) {
+               pset_lock(ecore_set);
+
+               pset_signal_spill(ecore_set, thread->sched_pri);
+               /* returns with ecore_set unlocked */
+       }
+}
+
+/*
+ * sched_amp_steal_threshold()
+ *
+ * Routine to calculate the steal threshold
+ */
+int
+sched_amp_steal_threshold(processor_set_t pset, bool spill_pending)
+{
+       int recommended_processor_count = bit_count(pset->recommended_bitmask & pset->cpu_bitmask);
+
+       return (recommended_processor_count << PSET_LOAD_FRACTIONAL_SHIFT) + (spill_pending ? sched_amp_spill_steal : sched_amp_idle_steal);
+}
+
+/*
+ * sched_amp_steal_thread_enabled()
+ *
+ * Stealing is only done from the E-cluster, and only while the P-cluster
+ * has processors online.
+ */
+bool
+sched_amp_steal_thread_enabled(processor_set_t pset)
+{
+       return (pset->pset_cluster_type == PSET_AMP_E) && (pcore_set->online_processor_count > 0);
+}
+
+/*
+ * sched_amp_balance()
+ *
+ * Invoked with pset locked, returns with pset unlocked
+ */
+void
+sched_amp_balance(processor_t cprocessor, processor_set_t cpset)
+{
+       assert(cprocessor == current_processor());
+
+       pset_unlock(cpset);
+
+       if (cpset->pset_cluster_type == PSET_AMP_E || !cprocessor->is_recommended) {
+               return;
+       }
+
+       /*
+        * cprocessor is an idle, recommended P core processor.
+        * Look for P-eligible threads that have spilled to an E core
+        * and coax them to come back.
+        */
+
+       processor_set_t pset = ecore_set;
+
+       pset_lock(pset);
+
+       processor_t eprocessor;
+       uint64_t ast_processor_map = 0;
+
+       sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
+       uint64_t running_map = pset->cpu_state_map[PROCESSOR_RUNNING];
+       for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
+               eprocessor = processor_array[cpuid];
+               if ((eprocessor->current_pri < BASEPRI_RTQUEUES) &&
+                   (eprocessor->current_recommended_pset_type == PSET_AMP_P)) {
+                       ipi_type[eprocessor->cpu_id] = sched_ipi_action(eprocessor, NULL, false, SCHED_IPI_EVENT_REBALANCE);
+                       if (ipi_type[eprocessor->cpu_id] != SCHED_IPI_NONE) {
+                               bit_set(ast_processor_map, eprocessor->cpu_id);
+                               assert(eprocessor != cprocessor);
+                       }
+               }
+       }
+
+       pset_unlock(pset);
+
+       for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
+               processor_t ast_processor = processor_array[cpuid];
+               sched_ipi_perform(ast_processor, ipi_type[cpuid]);
+       }
+}
+
+/*
+ * Helper function for sched_amp_thread_group_recommendation_change()
+ * Find all the cores in the pset running threads from the thread_group tg
+ * and send them a rebalance interrupt.
+ */
+void
+sched_amp_bounce_thread_group_from_ecores(processor_set_t pset, struct thread_group *tg)
+{
+       assert(pset->pset_cluster_type == PSET_AMP_E);
+       uint64_t ast_processor_map = 0;
+       sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
+
+       spl_t s = splsched();
+       pset_lock(pset);
+
+       uint64_t running_map = pset->cpu_state_map[PROCESSOR_RUNNING];
+       for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
+               processor_t eprocessor = processor_array[cpuid];
+               if (eprocessor->current_thread_group == tg) {
+                       ipi_type[eprocessor->cpu_id] = sched_ipi_action(eprocessor, NULL, false, SCHED_IPI_EVENT_REBALANCE);
+                       if (ipi_type[eprocessor->cpu_id] != SCHED_IPI_NONE) {
+                               bit_set(ast_processor_map, eprocessor->cpu_id);
+                       } else if (eprocessor == current_processor()) {
+                               ast_on(AST_PREEMPT);
+                               bit_set(pset->pending_AST_PREEMPT_cpu_mask, eprocessor->cpu_id);
+                       }
+               }
+       }
+
+       KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_RECOMMENDATION_CHANGE) | DBG_FUNC_NONE, tg, ast_processor_map, 0, 0);
+
+       pset_unlock(pset);
+
+       for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
+               processor_t ast_processor = processor_array[cpuid];
+               sched_ipi_perform(ast_processor, ipi_type[cpuid]);
+       }
+
+       splx(s);
+}
+
+/*
+ * sched_amp_ipi_policy()
+ */
+sched_ipi_type_t
+sched_amp_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
+{
+       processor_set_t pset = dst->processor_set;
+       assert(bit_test(pset->pending_AST_URGENT_cpu_mask, dst->cpu_id) == false);
+       assert(dst != current_processor());
+
+       boolean_t deferred_ipi_supported = false;
+#if defined(CONFIG_SCHED_DEFERRED_AST)
+       deferred_ipi_supported = true;
+#endif /* CONFIG_SCHED_DEFERRED_AST */
+
+       switch (event) {
+       case SCHED_IPI_EVENT_SPILL:
+               /* For Spill event, use deferred IPIs if sched_amp_spill_deferred_ipi set */
+               if (deferred_ipi_supported && sched_amp_spill_deferred_ipi) {
+                       return sched_ipi_deferred_policy(pset, dst, event);
+               }
+               break;
+       case SCHED_IPI_EVENT_PREEMPT:
+               /* For preemption, the default policy is to use deferred IPIs
+                * for Non-RT P-core preemption. Override that behavior if
+                * sched_amp_pcores_preempt_immediate_ipi is set
+                */
+               if (thread && thread->sched_pri < BASEPRI_RTQUEUES) {
+                       if (sched_amp_pcores_preempt_immediate_ipi && (pset == pcore_set)) {
+                               return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
+                       }
+               }
+               break;
+       default:
+               break;
+       }
+       /* Default back to the global policy for all other scenarios */
+       return sched_ipi_policy(dst, thread, dst_idle, event);
+}
+
+/*
+ * sched_amp_qos_max_parallelism()
+ */
+uint32_t
+sched_amp_qos_max_parallelism(int qos, uint64_t options)
+{
+       uint32_t ecount = ecore_set->cpu_set_count;
+       uint32_t pcount = pcore_set->cpu_set_count;
+
+       if (options & QOS_PARALLELISM_REALTIME) {
+               /* For realtime threads on AMP, limit the recommended
+                * width to just the P-cores, since we do not
+                * spill/rebalance RT threads.
+                */
+               return pcount;
+       }
+
+       /*
+        * The current AMP scheduler policy is to not run
+        * background and utility threads on the P-cores.
+        */
+       switch (qos) {
+       case THREAD_QOS_UTILITY:
+       case THREAD_QOS_BACKGROUND:
+       case THREAD_QOS_MAINTENANCE:
+               return ecount;
+       default:
+               return ecount + pcount;
+       }
+}
+
+/*
+ * sched_amp_rt_runq()
+ */
+rt_queue_t
+sched_amp_rt_runq(processor_set_t pset)
+{
+       return &pset->rt_runq;
+}
+
+/*
+ * sched_amp_rt_init()
+ */
+void
+sched_amp_rt_init(processor_set_t pset)
+{
+       pset_rt_init(pset);
+}
+
+/*
+ * sched_amp_rt_queue_shutdown()
+ */
+void
+sched_amp_rt_queue_shutdown(processor_t processor)
+{
+       processor_set_t pset = processor->processor_set;
+       thread_t        thread;
+       queue_head_t    tqueue;
+
+       pset_lock(pset);
+
+       /* We only need to migrate threads if this is the last active or last recommended processor in the pset */
+       if ((pset->online_processor_count > 0) && pset_is_recommended(pset)) {
+               pset_unlock(pset);
+               return;
+       }
+
+       queue_init(&tqueue);
+
+       rt_lock_lock(pset);
+
+       while (rt_runq_count(pset) > 0) {
+               thread = qe_dequeue_head(&pset->rt_runq.queue, struct thread, runq_links);
+               thread->runq = PROCESSOR_NULL;
+               SCHED_STATS_RUNQ_CHANGE(&pset->rt_runq.runq_stats, pset->rt_runq.count);
+               rt_runq_count_decr(pset);
+               enqueue_tail(&tqueue, &thread->runq_links);
+       }
+       rt_lock_unlock(pset);
+       sched_update_pset_load_average(pset);
+       pset_unlock(pset);
+
+       qe_foreach_element_safe(thread, &tqueue, runq_links) {
+               remqueue(&thread->runq_links);
+
+               thread_lock(thread);
+
+               thread_setrun(thread, SCHED_TAILQ);
+
+               thread_unlock(thread);
+       }
+}
+
+/*
+ * sched_amp_rt_runq_scan()
+ *
+ * Assumes RT lock is not held, and acquires splsched/rt_lock itself
+ */
+void
+sched_amp_rt_runq_scan(sched_update_scan_context_t scan_context)
+{
+       thread_t        thread;
+
+       pset_node_t node = &pset_node0;
+       processor_set_t pset = node->psets;
+
+       spl_t s = splsched();
+       do {
+               while (pset != NULL) {
+                       rt_lock_lock(pset);
+
+                       qe_foreach_element_safe(thread, &pset->rt_runq.queue, runq_links) {
+                               if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) {
+                                       scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time;
+                               }
+                       }
+
+                       rt_lock_unlock(pset);
+
+                       pset = pset->pset_list;
+               }
+       } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
+       splx(s);
+}
+
+/*
+ * sched_amp_rt_runq_count_sum()
+ */
+int64_t
+sched_amp_rt_runq_count_sum(void)
+{
+       pset_node_t node = &pset_node0;
+       processor_set_t pset = node->psets;
+       int64_t count = 0;
+
+       do {
+               while (pset != NULL) {
+                       count += pset->rt_runq.runq_stats.count_sum;
+
+                       pset = pset->pset_list;
+               }
+       } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
+
+       return count;
+}
+
+#endif /* __AMP__ */
diff --git a/osfmk/kern/sched_amp_common.h b/osfmk/kern/sched_amp_common.h
new file mode 100644 (file)
index 0000000..e29cf07
--- /dev/null
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _KERN_SCHED_AMP_COMMON_H_
+#define _KERN_SCHED_AMP_COMMON_H_
+
+#if __AMP__
+
+/* Routine to initialize processor sets on AMP platforms */
+void sched_amp_init(void);
+
+/*
+ * The AMP scheduler uses spill/steal/rebalance logic to make sure the most appropriate threads
+ * are scheduled on the P/E clusters. Here are the definitions of those terms:
+ *
+ * - Spill:     Spill threads from an overcommitted P-cluster onto the E-cluster. This is needed to make sure
+ *              that high priority P-recommended threads experience low scheduling latency in the presence of
+ *              lots of P-recommended threads.
+ *
+ * - Steal:     From an E-core, steal a thread from the P-cluster to provide low scheduling latency for
+ *              P-recommended threads.
+ *
+ * - Rebalance: Once a P-core goes idle, check if the E-cores are running any P-recommended threads and
+ *              bring such threads back to run on their recommended cluster type.
+ */
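+
+/*
+ * Roughly: spill is evaluated when a P-recommended thread becomes runnable
+ * (sched_amp_check_spill), steal runs from an E-core that is looking for
+ * work (sched_amp_steal_thread / sched_clutch_amp_steal_thread), and
+ * rebalance runs from an idle, recommended P-core via the processor_balance
+ * callback (sched_amp_balance).
+ */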
+
+/* Spill logic */
+int sched_amp_spill_threshold(processor_set_t pset);
+void pset_signal_spill(processor_set_t pset, int spilled_thread_priority);
+bool pset_should_accept_spilled_thread(processor_set_t pset, int spilled_thread_priority);
+bool should_spill_to_ecores(processor_set_t nset, thread_t thread);
+void sched_amp_check_spill(processor_set_t pset, thread_t thread);
+
+/* Steal logic */
+int sched_amp_steal_threshold(processor_set_t pset, bool spill_pending);
+bool sched_amp_steal_thread_enabled(processor_set_t pset);
+
+/* Rebalance logic */
+void sched_amp_balance(processor_t cprocessor, processor_set_t cpset);
+
+/* IPI policy */
+sched_ipi_type_t sched_amp_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event);
+
+/* AMP realtime runq management */
+rt_queue_t sched_amp_rt_runq(processor_set_t pset);
+void sched_amp_rt_init(processor_set_t pset);
+void sched_amp_rt_queue_shutdown(processor_t processor);
+void sched_amp_rt_runq_scan(sched_update_scan_context_t scan_context);
+int64_t sched_amp_rt_runq_count_sum(void);
+
+uint32_t sched_amp_qos_max_parallelism(int qos, uint64_t options);
+void sched_amp_bounce_thread_group_from_ecores(processor_set_t pset, struct thread_group *tg);
+
+#endif /* __AMP__ */
+
+#endif /* _KERN_SCHED_AMP_COMMON_H_ */
index 7a246a05e6f62979105679a437ab634ce060206a..d8a808f60166631f31fefd01b30d935be027b894 100644 (file)
@@ -46,6 +46,9 @@
 #include <kern/sched_clutch.h>
 #include <sys/kdebug.h>
 
+#if __AMP__
+#include <kern/sched_amp_common.h>
+#endif /* __AMP__ */
 
 #if CONFIG_SCHED_CLUTCH
 
@@ -92,6 +95,10 @@ static uint32_t sched_clutch_root_urgency(sched_clutch_root_t);
 static uint32_t sched_clutch_root_count_sum(sched_clutch_root_t);
 static int sched_clutch_root_priority(sched_clutch_root_t);
 
+#if __AMP__
+/* System based routines */
+static bool sched_clutch_pset_available(processor_set_t);
+#endif /* __AMP__ */
 
 /* Helper debugging routines */
 static inline void sched_clutch_hierarchy_locked_assert(sched_clutch_root_t);
@@ -250,6 +257,30 @@ sched_clutch_thr_count_dec(
        }
 }
 
+#if __AMP__
+
+/*
+ * sched_clutch_pset_available()
+ *
+ * Routine to determine if a pset is available for scheduling.
+ */
+static bool
+sched_clutch_pset_available(processor_set_t pset)
+{
+       /* Check if cluster has none of the CPUs available */
+       if (pset->online_processor_count == 0) {
+               return false;
+       }
+
+       /* Check if the cluster is not recommended by CLPC */
+       if (!pset_is_recommended(pset)) {
+               return false;
+       }
+
+       return true;
+}
+
+#endif /* __AMP__ */
 
 /*
  * sched_clutch_root_init()
@@ -748,6 +779,34 @@ sched_clutch_destroy(
        assert(os_atomic_load(&clutch->sc_thr_count, relaxed) == 0);
 }
 
+#if __AMP__
+
+/*
+ * sched_clutch_bucket_foreign()
+ *
+ * Identifies whether the clutch bucket is foreign to (i.e. not recommended
+ * for) this hierarchy. This can happen when the recommended hierarchy/pset
+ * is not currently available for scheduling.
+ */
+static boolean_t
+sched_clutch_bucket_foreign(sched_clutch_root_t root_clutch, sched_clutch_bucket_t clutch_bucket)
+{
+       assert(clutch_bucket->scb_thr_count > 0);
+       if (!sched_clutch_pset_available(root_clutch->scr_pset)) {
+               /* Even though the pset was not available for scheduling, threads
+                * are being put in its runq (this might be due to the other pset
+                * being turned off and this being the master processor pset).
+                * Mark the clutch bucket as foreign so that when the other
+                * pset becomes available, it moves the clutch bucket accordingly.
+                */
+               return true;
+       }
+       thread_t thread = run_queue_peek(&clutch_bucket->scb_runq);
+       pset_cluster_type_t pset_type = recommended_pset_type(thread);
+       return pset_type != root_clutch->scr_pset->pset_cluster_type;
+}
+
+#endif /* __AMP__ */
 
 /*
  * sched_clutch_bucket_hierarchy_insert()
@@ -766,6 +825,13 @@ sched_clutch_bucket_hierarchy_insert(
                /* Enqueue the timeshare clutch buckets into the global runnable clutch_bucket list; used for sched tick operations */
                enqueue_tail(&root_clutch->scr_clutch_buckets, &clutch_bucket->scb_listlink);
        }
+#if __AMP__
+       /* Check if the bucket is a foreign clutch bucket and add it to the foreign buckets list */
+       if (sched_clutch_bucket_foreign(root_clutch, clutch_bucket)) {
+               clutch_bucket->scb_foreign = true;
+               enqueue_tail(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
+       }
+#endif /* __AMP__ */
        sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_buckets[bucket];
 
        /* If this is the first clutch bucket in the root bucket, insert the root bucket into the root priority queue */
@@ -797,6 +863,12 @@ sched_clutch_bucket_hierarchy_remove(
                /* Remove the timeshare clutch bucket from the globally runnable clutch_bucket list */
                remqueue(&clutch_bucket->scb_listlink);
        }
+#if __AMP__
+       if (clutch_bucket->scb_foreign) {
+               clutch_bucket->scb_foreign = false;
+               remqueue(&clutch_bucket->scb_foreignlink);
+       }
+#endif /* __AMP__ */
 
        sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_buckets[bucket];
 
@@ -2170,5 +2242,655 @@ sched_clutch_update_thread_bucket(thread_t thread)
        }
 }
 
+#if __AMP__
+
+/* Implementation of the AMP version of the clutch scheduler */
+
+static thread_t
+sched_clutch_amp_steal_thread(processor_set_t pset);
+
+static ast_t
+sched_clutch_amp_processor_csw_check(processor_t processor);
+
+static boolean_t
+sched_clutch_amp_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);
+
+static boolean_t
+sched_clutch_amp_processor_queue_empty(processor_t processor);
+
+static thread_t
+sched_clutch_amp_choose_thread(processor_t processor, int priority, ast_t reason);
+
+static void
+sched_clutch_amp_processor_queue_shutdown(processor_t processor);
+
+static processor_t
+sched_clutch_amp_choose_processor(processor_set_t pset, processor_t processor, thread_t thread);
+
+static bool
+sched_clutch_amp_thread_avoid_processor(processor_t processor, thread_t thread);
+
+static bool
+sched_clutch_amp_thread_should_yield(processor_t processor, thread_t thread);
+
+static void
+sched_clutch_migrate_foreign_buckets(processor_t processor, processor_set_t dst_pset, boolean_t drop_lock);
+
+static void
+sched_clutch_amp_thread_group_recommendation_change(struct thread_group *tg, cluster_type_t new_recommendation);
+
+const struct sched_dispatch_table sched_clutch_amp_dispatch = {
+       .sched_name                                     = "clutch_amp",
+       .init                                           = sched_amp_init,
+       .timebase_init                                  = sched_clutch_timebase_init,
+       .processor_init                                 = sched_clutch_processor_init,
+       .pset_init                                      = sched_clutch_pset_init,
+       .maintenance_continuation                       = sched_timeshare_maintenance_continue,
+       .choose_thread                                  = sched_clutch_amp_choose_thread,
+       .steal_thread_enabled                           = sched_amp_steal_thread_enabled,
+       .steal_thread                                   = sched_clutch_amp_steal_thread,
+       .compute_timeshare_priority                     = sched_compute_timeshare_priority,
+       .choose_processor                               = sched_clutch_amp_choose_processor,
+       .processor_enqueue                              = sched_clutch_processor_enqueue,
+       .processor_queue_shutdown                       = sched_clutch_amp_processor_queue_shutdown,
+       .processor_queue_remove                         = sched_clutch_processor_queue_remove,
+       .processor_queue_empty                          = sched_clutch_amp_processor_queue_empty,
+       .priority_is_urgent                             = priority_is_urgent,
+       .processor_csw_check                            = sched_clutch_amp_processor_csw_check,
+       .processor_queue_has_priority                   = sched_clutch_amp_processor_queue_has_priority,
+       .initial_quantum_size                           = sched_clutch_initial_quantum_size,
+       .initial_thread_sched_mode                      = sched_clutch_initial_thread_sched_mode,
+       .can_update_priority                            = can_update_priority,
+       .update_priority                                = update_priority,
+       .lightweight_update_priority                    = lightweight_update_priority,
+       .quantum_expire                                 = sched_default_quantum_expire,
+       .processor_runq_count                           = sched_clutch_runq_count,
+       .processor_runq_stats_count_sum                 = sched_clutch_runq_stats_count_sum,
+       .processor_bound_count                          = sched_clutch_processor_bound_count,
+       .thread_update_scan                             = sched_clutch_thread_update_scan,
+       .multiple_psets_enabled                         = TRUE,
+       .sched_groups_enabled                           = FALSE,
+       .avoid_processor_enabled                        = TRUE,
+       .thread_avoid_processor                         = sched_clutch_amp_thread_avoid_processor,
+       .processor_balance                              = sched_amp_balance,
+
+       .rt_runq                                        = sched_amp_rt_runq,
+       .rt_init                                        = sched_amp_rt_init,
+       .rt_queue_shutdown                              = sched_amp_rt_queue_shutdown,
+       .rt_runq_scan                                   = sched_amp_rt_runq_scan,
+       .rt_runq_count_sum                              = sched_amp_rt_runq_count_sum,
+
+       .qos_max_parallelism                            = sched_amp_qos_max_parallelism,
+       .check_spill                                    = sched_amp_check_spill,
+       .ipi_policy                                     = sched_amp_ipi_policy,
+       .thread_should_yield                            = sched_clutch_amp_thread_should_yield,
+       .run_count_incr                                 = sched_clutch_run_incr,
+       .run_count_decr                                 = sched_clutch_run_decr,
+       .update_thread_bucket                           = sched_clutch_update_thread_bucket,
+       .pset_made_schedulable                          = sched_clutch_migrate_foreign_buckets,
+       .thread_group_recommendation_change             = sched_clutch_amp_thread_group_recommendation_change,
+};
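+
+/*
+ * The table layers AMP awareness on top of the generic clutch scheduler:
+ * enqueue/remove and runq accounting come from the sched_clutch_* routines,
+ * spill/steal/rebalance, RT and IPI policy come from sched_amp_common, and
+ * the sched_clutch_amp_* entry points below handle cluster-aware thread
+ * placement.
+ */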
+
+extern processor_set_t ecore_set;
+extern processor_set_t pcore_set;
+
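+/*
+ * sched_clutch_amp_choose_thread()
+ *
+ * Pick the next thread from the bound runq or the pset's clutch hierarchy;
+ * returns THREAD_NULL when nothing beats the caller's priority floor, or
+ * when a pending spill means a higher-priority thread should instead be
+ * stolen from the P-cluster by thread_select().
+ */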
+static thread_t
+sched_clutch_amp_choose_thread(
+       processor_t      processor,
+       int              priority,
+       __unused ast_t            reason)
+{
+       processor_set_t pset = processor->processor_set;
+       bool spill_pending = false;
+       int spill_pri = -1;
+
+       if (pset == ecore_set && bit_test(pset->pending_spill_cpu_mask, processor->cpu_id)) {
+               spill_pending = true;
+               spill_pri = sched_clutch_root_priority(&pcore_set->pset_clutch_root);
+       }
+
+       int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
+       run_queue_t bound_runq = sched_clutch_bound_runq(processor);
+       boolean_t choose_from_boundq = false;
+
+       if ((bound_runq->highq < priority) &&
+           (clutch_pri < priority) &&
+           (spill_pri < priority)) {
+               return THREAD_NULL;
+       }
+
+       if ((spill_pri > bound_runq->highq) &&
+           (spill_pri > clutch_pri)) {
+               /*
+                * There is a higher priority thread on the P-core runq,
+                * so returning THREAD_NULL here will cause thread_select()
+                * to call sched_clutch_amp_steal_thread() to try to get it.
+                */
+               return THREAD_NULL;
+       }
+
+       if (bound_runq->highq >= clutch_pri) {
+               choose_from_boundq = true;
+       }
+
+       thread_t thread = THREAD_NULL;
+       if (choose_from_boundq == false) {
+               sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
+               thread = sched_clutch_thread_highest(pset_clutch_root);
+       } else {
+               thread = run_queue_dequeue(bound_runq, SCHED_HEADQ);
+       }
+       return thread;
+}
+
+static boolean_t
+sched_clutch_amp_processor_queue_empty(processor_t processor)
+{
+       processor_set_t pset = processor->processor_set;
+       bool spill_pending = bit_test(pset->pending_spill_cpu_mask, processor->cpu_id);
+
+       return (sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0) &&
+              (sched_clutch_bound_runq(processor)->count == 0) &&
+              !spill_pending;
+}
+
+static bool
+sched_clutch_amp_thread_should_yield(processor_t processor, thread_t thread)
+{
+       if (!sched_clutch_amp_processor_queue_empty(processor) || (rt_runq_count(processor->processor_set) > 0)) {
+               return true;
+       }
+
+       if ((processor->processor_set->pset_cluster_type == PSET_AMP_E) && (recommended_pset_type(thread) == PSET_AMP_P)) {
+               return sched_clutch_root_count(&pcore_set->pset_clutch_root) > 0;
+       }
+
+       return false;
+}
+
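+/*
+ * sched_clutch_amp_processor_csw_check()
+ *
+ * Decide whether the running thread should be preempted, considering the
+ * local clutch hierarchy, the bound runq and, on an E-core with a pending
+ * spill, the priority and urgency of the P-cluster hierarchy.
+ */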
+static ast_t
+sched_clutch_amp_processor_csw_check(processor_t processor)
+{
+       boolean_t       has_higher;
+       int             pri;
+
+       int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
+       run_queue_t bound_runq = sched_clutch_bound_runq(processor);
+
+       assert(processor->active_thread != NULL);
+
+       processor_set_t pset = processor->processor_set;
+       bool spill_pending = false;
+       int spill_pri = -1;
+       int spill_urgency = 0;
+
+       if (pset == ecore_set && bit_test(pset->pending_spill_cpu_mask, processor->cpu_id)) {
+               spill_pending = true;
+               spill_pri = sched_clutch_root_priority(&pcore_set->pset_clutch_root);
+               spill_urgency = sched_clutch_root_urgency(&pcore_set->pset_clutch_root);
+       }
+
+       pri = MAX(clutch_pri, bound_runq->highq);
+       if (spill_pending) {
+               pri = MAX(pri, spill_pri);
+       }
+
+       if (processor->first_timeslice) {
+               has_higher = (pri > processor->current_pri);
+       } else {
+               has_higher = (pri >= processor->current_pri);
+       }
+
+       if (has_higher) {
+               if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0) {
+                       return AST_PREEMPT | AST_URGENT;
+               }
+
+               if (bound_runq->urgency > 0) {
+                       return AST_PREEMPT | AST_URGENT;
+               }
+
+               if (spill_urgency > 0) {
+                       return AST_PREEMPT | AST_URGENT;
+               }
+
+               return AST_PREEMPT;
+       }
+
+       return AST_NONE;
+}
+
+static boolean_t
+sched_clutch_amp_processor_queue_has_priority(processor_t    processor,
+    int            priority,
+    boolean_t      gte)
+{
+       bool spill_pending = false;
+       int spill_pri = -1;
+       processor_set_t pset = processor->processor_set;
+
+       if (pset == ecore_set && bit_test(pset->pending_spill_cpu_mask, processor->cpu_id)) {
+               spill_pending = true;
+               spill_pri = sched_clutch_root_priority(&pcore_set->pset_clutch_root);
+       }
+       run_queue_t bound_runq = sched_clutch_bound_runq(processor);
+
+       int qpri = MAX(sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)), bound_runq->highq);
+       if (spill_pending) {
+               qpri = MAX(qpri, spill_pri);
+       }
+
+       if (gte) {
+               return qpri >= priority;
+       } else {
+               return qpri > priority;
+       }
+}
+
+/*
+ * sched_clutch_hierarchy_thread_pset()
+ *
+ * Routine to determine where a thread should be enqueued: based on its
+ * recommendation if it is the first runnable thread in its clutch_bucket,
+ * or otherwise on the hierarchy its clutch bucket is already part of.
+ */
+static processor_set_t
+sched_clutch_hierarchy_thread_pset(thread_t thread)
+{
+       if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread) == false) {
+               return (recommended_pset_type(thread) == PSET_AMP_P) ? pcore_set : ecore_set;
+       }
+
+       sched_clutch_t clutch = sched_clutch_for_thread(thread);
+       sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[thread->th_sched_bucket]);
+       sched_clutch_root_t scb_root = os_atomic_load(&clutch_bucket->scb_root, relaxed);
+       if (scb_root) {
+               /* Clutch bucket is already runnable, return the pset hierarchy its part of */
+               return scb_root->scr_pset;
+       }
+       return (recommended_pset_type(thread) == PSET_AMP_E) ? ecore_set : pcore_set;
+}
+
+/*
+ * sched_clutch_thread_pset_recommended()
+ *
+ * Routine to determine if the thread should be placed on the provided pset.
+ * The routine first makes sure the cluster is available for scheduling. If
+ * it is available, it looks at the thread's recommendation. Called
+ * with the pset lock held.
+ */
+static bool
+sched_clutch_thread_pset_recommended(thread_t thread, processor_set_t pset)
+{
+       if (!sched_clutch_pset_available(pset)) {
+               return false;
+       }
+
+       /* At this point, all clusters should be available and recommended */
+       if (sched_clutch_hierarchy_thread_pset(thread) != pset) {
+               return false;
+       }
+
+       return true;
+}
+
+
+static void
+sched_clutch_amp_processor_queue_shutdown(processor_t processor)
+{
+       processor_set_t pset = processor->processor_set;
+       sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
+       thread_t        thread;
+       queue_head_t    tqueue;
+
+       /* We only need to migrate threads if this is the last active or last recommended processor in the pset */
+       if ((pset->online_processor_count > 0) && pset_is_recommended(pset)) {
+               pset_unlock(pset);
+               return;
+       }
+
+       queue_init(&tqueue);
+       while (sched_clutch_root_count(pset_clutch_root) > 0) {
+               thread = sched_clutch_thread_highest(pset_clutch_root);
+               enqueue_tail(&tqueue, &thread->runq_links);
+       }
+       pset_unlock(pset);
+
+       qe_foreach_element_safe(thread, &tqueue, runq_links) {
+               remqueue(&thread->runq_links);
+               thread_lock(thread);
+               thread_setrun(thread, SCHED_TAILQ);
+               thread_unlock(thread);
+       }
+}
+
+static thread_t
+sched_clutch_amp_steal_thread(processor_set_t pset)
+{
+       thread_t thread = THREAD_NULL;
+       processor_set_t nset = pset;
+
+       if (pcore_set->online_processor_count == 0) {
+               /* Nothing to steal from */
+               goto out;
+       }
+
+       if (pset->pset_cluster_type == PSET_AMP_P) {
+               /* P cores don't steal from E cores */
+               goto out;
+       }
+
+       processor_t processor = current_processor();
+       assert(pset == processor->processor_set);
+
+       bool spill_pending = bit_test(pset->pending_spill_cpu_mask, processor->cpu_id);
+       bit_clear(pset->pending_spill_cpu_mask, processor->cpu_id);
+
+       nset = pcore_set;
+
+       assert(nset != pset);
+
+       if (sched_get_pset_load_average(nset) >= sched_amp_steal_threshold(nset, spill_pending)) {
+               pset_unlock(pset);
+
+               pset = nset;
+
+               pset_lock(pset);
+
+               /* Allow steal if load average still OK, no idle cores, and more threads on runq than active cores DISPATCHING */
+               if ((sched_get_pset_load_average(pset) >= sched_amp_steal_threshold(pset, spill_pending)) &&
+                   ((int)sched_clutch_root_count(&pset->pset_clutch_root) > bit_count(pset->cpu_state_map[PROCESSOR_DISPATCHING])) &&
+                   (bit_count(pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_IDLE]) == 0)) {
+                       thread = sched_clutch_thread_highest(&pset->pset_clutch_root);
+                       KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_STEAL) | DBG_FUNC_NONE, spill_pending, 0, 0, 0);
+                       sched_update_pset_load_average(pset);
+               }
+       }
+
+out:
+       pset_unlock(pset);
+       return thread;
+}
+
+/* Return true if this thread should not continue running on this processor */
+static bool
+sched_clutch_amp_thread_avoid_processor(processor_t processor, thread_t thread)
+{
+       if (processor->processor_set->pset_cluster_type == PSET_AMP_E) {
+               if (sched_clutch_thread_pset_recommended(thread, pcore_set)) {
+                       return true;
+               }
+       } else if (processor->processor_set->pset_cluster_type == PSET_AMP_P) {
+               if (!sched_clutch_thread_pset_recommended(thread, pcore_set)) {
+                       return true;
+               }
+       }
+
+       return false;
+}
+
+static processor_t
+sched_clutch_amp_choose_processor(processor_set_t pset, processor_t processor, thread_t thread)
+{
+       /* Bound threads don't call this function */
+       assert(thread->bound_processor == PROCESSOR_NULL);
+
+       processor_set_t nset;
+       processor_t chosen_processor = PROCESSOR_NULL;
+
+select_pset:
+       nset = (pset == ecore_set) ? pcore_set : ecore_set;
+       if (!sched_clutch_pset_available(pset)) {
+               /* If the current pset is not available for scheduling, just use the other pset */
+               pset_unlock(pset);
+               pset_lock(nset);
+               goto select_processor;
+       }
+
+       /* Check if the thread is recommended to run on this pset */
+       if (sched_clutch_thread_pset_recommended(thread, pset)) {
+               nset = pset;
+               goto select_processor;
+       } else {
+               /* pset not recommended; try the other pset */
+               pset_unlock(pset);
+               pset_lock(nset);
+               pset = nset;
+               goto select_pset;
+       }
+
+select_processor:
+       if (!sched_clutch_pset_available(nset)) {
+               /*
+                * Neither pset is available for scheduling; in that case,
+                * fall back to the master processor's pset.
+                */
+               if (master_processor->processor_set != nset) {
+                       pset_unlock(nset);
+                       nset = master_processor->processor_set;
+                       pset_lock(nset);
+               }
+       }
+       chosen_processor = choose_processor(nset, processor, thread);
+       assert(chosen_processor->processor_set == nset);
+       return chosen_processor;
+}
+
+/*
+ * AMP Clutch Scheduler Thread Migration
+ *
+ * For the AMP version of the clutch scheduler the thread is always scheduled via its
+ * thread group. So it is important to make sure that the thread group is part of the
+ * correct processor set hierarchy. In order to do that, the clutch scheduler moves
+ * all eligible clutch buckets to the correct hierarchy when the recommendation of a
+ * thread group is changed by CLPC.
+ */
+
+/*
+ * sched_clutch_recommended_pset()
+ *
+ * Routine to decide which hierarchy the thread group should be in based on the
+ * recommendation and other thread group and system properties. This routine is
+ * used to determine if thread group migration is necessary and should mimic the
+ * logic in sched_clutch_thread_pset_recommended() & recommended_pset_type().
+ */
+static processor_set_t
+sched_clutch_recommended_pset(sched_clutch_t sched_clutch, cluster_type_t recommendation)
+{
+       if (!sched_clutch_pset_available(pcore_set)) {
+               return ecore_set;
+       }
+
+       if (!sched_clutch_pset_available(ecore_set)) {
+               return pcore_set;
+       }
+
+       /*
+        * If all clusters are available and recommended, use the recommendation
+        * to decide which cluster to use.
+        */
+       pset_cluster_type_t type = thread_group_pset_recommendation(sched_clutch->sc_tg, recommendation);
+       return (type == PSET_AMP_E) ? ecore_set : pcore_set;
+}
+
+static void
+sched_clutch_bucket_threads_drain(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch, queue_t clutch_threads)
+{
+       uint16_t thread_count = clutch_bucket->scb_thr_count;
+       thread_t thread;
+       uint64_t current_timestamp = mach_approximate_time();
+       while (thread_count > 0) {
+               thread = run_queue_peek(&clutch_bucket->scb_runq);
+               sched_clutch_thread_remove(root_clutch, thread, current_timestamp);
+               enqueue_tail(clutch_threads, &thread->runq_links);
+               thread_count--;
+       }
+
+       /*
+        * This operation should have drained the clutch bucket and pulled it out of the
+        * hierarchy.
+        */
+       assert(clutch_bucket->scb_thr_count == 0);
+       assert(clutch_bucket->scb_root == NULL);
+}
+
+/*
+ * sched_clutch_migrate_thread_group()
+ *
+ * Routine to implement the migration of threads when the thread group
+ * recommendation is updated. The migration works using a 2-phase
+ * algorithm.
+ *
+ * Phase 1: With the source pset (determined by sched_clutch_recommended_pset)
+ * locked, drain all the runnable threads into a local queue and update the TG
+ * recommendation.
+ *
+ * Phase 2: Call thread_setrun() on all the drained threads. Since the TG recommendation
+ * has been updated, these should all end up in the right hierarchy.
+ */
+static void
+sched_clutch_migrate_thread_group(sched_clutch_t sched_clutch, cluster_type_t new_recommendation)
+{
+       thread_t thread;
+
+       /* If the thread group is empty, just update the recommendation */
+       if (os_atomic_load(&sched_clutch->sc_thr_count, relaxed) == 0) {
+               thread_group_update_recommendation(sched_clutch->sc_tg, new_recommendation);
+               return;
+       }
+
+       processor_set_t dst_pset = sched_clutch_recommended_pset(sched_clutch, new_recommendation);
+       processor_set_t src_pset = (dst_pset == pcore_set) ? ecore_set : pcore_set;
+
+       queue_head_t clutch_threads;
+       queue_init(&clutch_threads);
+
+       /* Interrupts need to be disabled to make sure threads won't become runnable during the
+        * migration and attempt to grab the pset/thread locks.
+        */
+       spl_t s = splsched();
+
+       pset_lock(src_pset);
+       for (sched_bucket_t bucket = TH_BUCKET_FIXPRI; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
+               sched_clutch_bucket_t clutch_bucket = &(sched_clutch->sc_clutch_buckets[bucket]);
+               sched_clutch_root_t scb_root = os_atomic_load(&clutch_bucket->scb_root, relaxed);
+               if ((scb_root == NULL) || (scb_root->scr_pset == dst_pset)) {
+                       /* Clutch bucket not runnable or already in the right hierarchy; nothing to do here */
+                       continue;
+               }
+               assert(scb_root->scr_pset == src_pset);
+               /* Now remove all the threads from the runq so that thread->runq is set correctly */
+               sched_clutch_bucket_threads_drain(clutch_bucket, scb_root, &clutch_threads);
+       }
+
+       /*
+        * Now that all the clutch buckets have been drained, update the TG recommendation.
+        * This operation needs to be done with the pset lock held to make sure that anyone
+        * coming in before the migration started would get the original pset as the root
+        * of this sched_clutch and attempt to hold the src_pset lock. Once the TG changes,
+        * all threads that are becoming runnable would find the clutch bucket empty and
+        * the TG recommendation would coax them to enqueue it in the new recommended
+        * hierarchy. This effectively synchronizes with other threads calling
+        * thread_setrun() and trying to decide which pset the thread/clutch_bucket
+        * belongs in.
+        */
+       thread_group_update_recommendation(sched_clutch->sc_tg, new_recommendation);
+       pset_unlock(src_pset);
+
+       /* Now setrun all the threads in the local queue */
+       qe_foreach_element_safe(thread, &clutch_threads, runq_links) {
+               remqueue(&thread->runq_links);
+               thread_lock(thread);
+               thread_setrun(thread, SCHED_TAILQ);
+               thread_unlock(thread);
+       }
+
+       splx(s);
+}
+
+static void
+sched_clutch_amp_thread_group_recommendation_change(struct thread_group *tg, cluster_type_t new_recommendation)
+{
+       /*
+        * For the clutch scheduler, the change in recommendation moves the thread group
+        * to the right hierarchy. sched_clutch_migrate_thread_group() is also responsible
+        * for updating the recommendation of the thread group.
+        */
+       sched_clutch_migrate_thread_group(&tg->tg_sched_clutch, new_recommendation);
+
+       if (new_recommendation != CLUSTER_TYPE_P) {
+               return;
+       }
+
+       sched_amp_bounce_thread_group_from_ecores(ecore_set, tg);
+}
+
+/*
+ * sched_clutch_migrate_foreign_buckets()
+ *
+ * Routine to migrate all the clutch buckets which are not in their recommended
+ * pset hierarchy now that a new pset has become runnable. The algorithm is
+ * similar to sched_clutch_migrate_thread_group().
+ *
+ * Invoked with the newly recommended pset lock held and interrupts disabled.
+ */
+static void
+sched_clutch_migrate_foreign_buckets(__unused processor_t processor, processor_set_t dst_pset, boolean_t drop_lock)
+{
+       thread_t thread;
+       processor_set_t src_pset = (dst_pset == pcore_set) ? ecore_set : pcore_set;
+
+       if (!sched_clutch_pset_available(dst_pset)) {
+               /*
+                * It is possible that some state about the pset changed,
+                * but it's still not available for scheduling. Nothing to
+                * do here in that case.
+                */
+               if (drop_lock) {
+                       pset_unlock(dst_pset);
+               }
+               return;
+       }
+       pset_unlock(dst_pset);
+
+       queue_head_t clutch_threads;
+       queue_init(&clutch_threads);
+       sched_clutch_root_t src_root = &src_pset->pset_clutch_root;
+
+       pset_lock(src_pset);
+       queue_t clutch_bucket_list = &src_pset->pset_clutch_root.scr_foreign_buckets;
+
+       if (sched_clutch_root_count(src_root) == 0) {
+               /* No threads present in this hierarchy */
+               pset_unlock(src_pset);
+               goto migration_complete;
+       }
+
+       sched_clutch_bucket_t clutch_bucket;
+       qe_foreach_element_safe(clutch_bucket, clutch_bucket_list, scb_foreignlink) {
+               sched_clutch_root_t scb_root = os_atomic_load(&clutch_bucket->scb_root, relaxed);
+               assert(scb_root->scr_pset == src_pset);
+               /* Now remove all the threads from the runq so that thread->runq is set correctly */
+               sched_clutch_bucket_threads_drain(clutch_bucket, scb_root, &clutch_threads);
+               assert(clutch_bucket->scb_foreign == false);
+       }
+       pset_unlock(src_pset);
+
+       /* Now setrun all the threads in the local queue */
+       qe_foreach_element_safe(thread, &clutch_threads, runq_links) {
+               remqueue(&thread->runq_links);
+               thread_lock(thread);
+               thread_setrun(thread, SCHED_TAILQ);
+               thread_unlock(thread);
+       }
+
+migration_complete:
+       if (!drop_lock) {
+               pset_lock(dst_pset);
+       }
+}
+
+#endif /* __AMP__ */
 
 #endif /* CONFIG_SCHED_CLUTCH */
index 4cfad12f529ba7fba41cef8d68916abf3b090dd0..eef5bee4d667078d4638c65d8a4ca9bfde622407 100644 (file)
@@ -213,6 +213,10 @@ struct sched_clutch_bucket {
        /* (P) linkage for all clutch_buckets in a root bucket; used for tick operations */
        queue_chain_t                   scb_listlink;
 
+#if __AMP__
+       /* (P) linkage for all "foreign" clutch buckets in the root clutch */
+       queue_chain_t                   scb_foreignlink;
+#endif /* __AMP__ */
 
        /* (P) timestamp for the last time the interactivity score was updated */
        uint64_t                        scb_interactivity_ts;
index c312e0b4e04371d566984e16266c35102fe97a06..42e73b4f00bc30258bef6a244fb2a3557a362abd 100644 (file)
@@ -4139,7 +4139,11 @@ choose_processor(
                         * platforms, simply return the master_processor.
                         */
                        fallback_processor = true;
+#if CONFIG_SCHED_CLUTCH && __AMP__
+                       processor = processor_array[lsb_first(starting_pset->primary_map)];
+#else /* CONFIG_SCHED_CLUTCH && __AMP__ */
                        processor = master_processor;
+#endif /* CONFIG_SCHED_CLUTCH && __AMP__ */
                }
 
                /*
@@ -6069,6 +6073,11 @@ sched_update_pset_load_average(processor_set_t pset)
        pset->load_average = new_load_average;
 
 #if (DEVELOPMENT || DEBUG)
+#if __AMP__
+       if (pset->pset_cluster_type == PSET_AMP_P) {
+               KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PSET_LOAD_AVERAGE) | DBG_FUNC_NONE, sched_get_pset_load_average(pset), (bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + pset->pset_runq.count + rt_runq_count(pset)));
+       }
+#endif
 #endif
 }
 
@@ -6272,5 +6281,29 @@ sysctl_task_get_no_smt(void)
 __private_extern__ void
 thread_bind_cluster_type(char cluster_type)
 {
+#if __AMP__
+       thread_t thread = current_thread();
+
+       spl_t s = splsched();
+       thread_lock(thread);
+       thread->sched_flags &= ~(TH_SFLAG_ECORE_ONLY | TH_SFLAG_PCORE_ONLY);
+       switch (cluster_type) {
+       case 'e':
+       case 'E':
+               thread->sched_flags |= TH_SFLAG_ECORE_ONLY;
+               break;
+       case 'p':
+       case 'P':
+               thread->sched_flags |= TH_SFLAG_PCORE_ONLY;
+               break;
+       default:
+               break;
+       }
+       thread_unlock(thread);
+       splx(s);
+
+       thread_block(THREAD_CONTINUE_NULL);
+#else /* __AMP__ */
        (void)cluster_type;
+#endif /* __AMP__ */
 }
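
As a usage note, callers of this routine are in-kernel threads binding themselves: the vm_pageout.c hunks later in this change pass 'E' for the compressor threads and, behind a boot-arg, 'P' for the pageout thread. A minimal sketch of that pattern, with the surrounding daemon code elided:

    #if __AMP__
    	/*
    	 * Illustrative only: a kernel daemon pins itself to the E-cluster
    	 * before entering its service loop.  The call sets
    	 * TH_SFLAG_ECORE_ONLY on the current thread and then blocks so the
    	 * scheduler re-places it on the recommended cluster.
    	 */
    	thread_bind_cluster_type('E');
    #endif /* __AMP__ */
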
index 880e849605f091339a7878dac1771549882c5579..9276e25636ef5b1d3b05c3ca889aaca9ee79698d 100644 (file)
@@ -585,6 +585,11 @@ extern boolean_t preemption_enabled(void);
 #error Enable at least one scheduler algorithm in osfmk/conf/MASTER.XXX
 #endif
 
+#if __AMP__
+extern const struct sched_dispatch_table sched_amp_dispatch;
+#define SCHED(f) (sched_amp_dispatch.f)
+
+#else /* __AMP__ */
 
 #if CONFIG_SCHED_CLUTCH
 extern const struct sched_dispatch_table sched_clutch_dispatch;
@@ -594,6 +599,7 @@ extern const struct sched_dispatch_table sched_dualq_dispatch;
 #define SCHED(f) (sched_dualq_dispatch.f)
 #endif /* CONFIG_SCHED_CLUTCH */
 
+#endif /* __AMP__ */
 
 struct sched_dispatch_table {
        const char *sched_name;
@@ -766,6 +772,9 @@ extern const struct sched_dispatch_table sched_traditional_with_pset_runqueue_di
 #if defined(CONFIG_SCHED_MULTIQ)
 extern const struct sched_dispatch_table sched_multiq_dispatch;
 extern const struct sched_dispatch_table sched_dualq_dispatch;
+#if __AMP__
+extern const struct sched_dispatch_table sched_amp_dispatch;
+#endif
 #endif
 
 #if defined(CONFIG_SCHED_PROTO)
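
The SCHED() macro is the scheduler's compile-time dispatch: with __AMP__ defined, every SCHED(...) call site resolves into the sched_amp_dispatch table introduced by this change. A small illustration of the expansion (the specific call site below is hypothetical, but choose_processor is a member of struct sched_dispatch_table):

    /* Generic call site somewhere in the scheduler: */
    processor = SCHED(choose_processor)(pset, processor, thread);

    /* With __AMP__ defined, #define SCHED(f) (sched_amp_dispatch.f), so the
     * preprocessor turns that into: */
    processor = (sched_amp_dispatch.choose_processor)(pset, processor, thread);

    /* A non-AMP CONFIG_SCHED_CLUTCH build resolves to
     * sched_clutch_dispatch.choose_processor instead. */
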
index 832c774b4c5d2b445d9b1a61ff83cf23025dd093..fd98be48127fb6c98816554df59cfe0fed05d08e 100644 (file)
@@ -1295,7 +1295,11 @@ init_task_ledgers(void)
            task_wakeups_rate_exceeded, NULL, NULL);
        ledger_set_callback(t, task_ledgers.physical_writes, task_io_rate_exceeded, (void *)FLAVOR_IO_PHYSICAL_WRITES, NULL);
 
+#if XNU_MONITOR
+       ledger_template_complete_secure_alloc(t);
+#else /* XNU_MONITOR */
        ledger_template_complete(t);
+#endif /* XNU_MONITOR */
        task_ledger_template = t;
 }
 
@@ -5540,6 +5544,27 @@ task_energy(
        return energy;
 }
 
+#if __AMP__
+
+uint64_t
+task_cpu_ptime(
+       task_t  task)
+{
+       uint64_t cpu_ptime = 0;
+       thread_t thread;
+
+       task_lock(task);
+       cpu_ptime += task->total_ptime;
+
+       queue_iterate(&task->threads, thread, thread_t, task_threads) {
+               cpu_ptime += timer_grab(&thread->ptime);
+       }
+
+       task_unlock(task);
+       return cpu_ptime;
+}
+
+#else /* __AMP__ */
 
 uint64_t
 task_cpu_ptime(
@@ -5548,6 +5573,7 @@ task_cpu_ptime(
        return 0;
 }
 
+#endif /* __AMP__ */
 
 /* This function updates the cpu time in the arrays for each
  * effective and requested QoS class
index 7242faac767d8dcf2c26cf449b635c63b0668034..673259a16dc2c7fd633a30be45f15cbe3d8ade95 100644 (file)
@@ -283,6 +283,10 @@ struct thread {
 #define TH_SFLAG_BASE_PRI_FROZEN        0x0800          /* (effective) base_pri is frozen */
 #define TH_SFLAG_WAITQ_PROMOTED         0x1000          /* promote reason: waitq wakeup (generally for IPC receive) */
 
+#if __AMP__
+#define TH_SFLAG_ECORE_ONLY             0x2000          /* Bind thread to E core processor set */
+#define TH_SFLAG_PCORE_ONLY             0x4000          /* Bind thread to P core processor set */
+#endif
 
 #define TH_SFLAG_EXEC_PROMOTED          0x8000          /* promote reason: thread is in an exec */
 
index 83d485388596f539b7ca2a603eaa67b72fe8188b..a1b55f5eba2f6da3d2bff8f28e0220a9e61df0b4 100644 (file)
@@ -309,8 +309,8 @@ routine host_register_well_known_mach_voucher_attr_manager(
  * Update the global ATM diagnostic flag, readable from the commpage
  */
 routine host_set_atm_diagnostic_flag(
-        host_priv      : host_priv_t;
-    in  diagnostic_flag : uint32_t);
+               host            : host_t;
+       in      diagnostic_flag : uint32_t);
 
 #if !KERNEL && LIBSYSCALL_INTERFACE
 routine host_get_atm_diagnostic_flag(
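
The signature change above swaps the privileged host_priv_t argument for a plain host_t, so user space can pass mach_host_self() directly. A hedged user-space sketch; whether the call actually succeeds presumably depends on an entitlement (the new atm_diagnostic_flag_entitled test added in this change points that way), so treat that as an assumption:

    #include <mach/mach.h>
    #include <stdio.h>

    int
    main(void)
    {
    	/* With this change the routine takes the unprivileged host port. */
    	kern_return_t kr = host_set_atm_diagnostic_flag(mach_host_self(), 1);
    	printf("host_set_atm_diagnostic_flag: %s\n", mach_error_string(kr));
    	return kr == KERN_SUCCESS ? 0 : 1;
    }
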
index 654bfc30dbde79c96db74f23525b3d75d223ca0c..6865cee72ee5525898f33a8b7fd988dca10e250e 100644 (file)
@@ -423,6 +423,9 @@ __END_DECLS
 #define CPUFAMILY_ARM_HURRICANE         0x67ceee93
 #define CPUFAMILY_ARM_MONSOON_MISTRAL   0xe81e7ef6
 #define CPUFAMILY_ARM_VORTEX_TEMPEST    0x07d34b9f
+#ifndef RC_HIDE_XNU_LIGHTNING
+#define CPUFAMILY_ARM_LIGHTNING_THUNDER 0x462504d2
+#endif /* !RC_HIDE_XNU_LIGHTNING */
 
 /* The following synonyms are deprecated: */
 #define CPUFAMILY_INTEL_6_23    CPUFAMILY_INTEL_PENRYN
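
For completeness, user space can detect the new family through the hw.cpufamily sysctl; a minimal sketch using only the constant defined above and standard libc (the #ifdef mirrors the RC_HIDE_XNU_LIGHTNING guard in the header):

    #include <mach/machine.h>
    #include <sys/sysctl.h>
    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
    	uint32_t cpufamily = 0;
    	size_t len = sizeof(cpufamily);

    	if (sysctlbyname("hw.cpufamily", &cpufamily, &len, NULL, 0) != 0) {
    		return 1;
    	}
    #ifdef CPUFAMILY_ARM_LIGHTNING_THUNDER
    	if (cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER) {
    		printf("Lightning/Thunder CPU family\n");
    	}
    #endif
    	return 0;
    }
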
diff --git a/osfmk/man/index.html b/osfmk/man/index.html
new file mode 100644 (file)
index 0000000..2a9d0ff
--- /dev/null
@@ -0,0 +1,448 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<html>
+<head>
+  <title>Mach Kernel Interface Reference Manual</title>
+</head>
+<body>
+<h3>Mach IPC Interface</h3>
+<blockquote>
+<p>
+Mach IPC presents itself in a few forms: message queues, lock-sets, 
+and semaphores (more may be added in the future). &nbsp;All share one common 
+charateristic: the capabilities presented by each are represented through 
+a handle known as a Mach port. &nbsp;Specific rights represented in these 
+Mach port capability handles allow the underlying IPC object to be used and 
+manipulated in consistent ways.</p>
+
+<h4>Mach Message Queue Interface</h4>
+<blockquote>
+<p>
+<a href="mach_msg.html">mach_msg</a> - Send and/or receive a message from the target port.<br>
+<a href="mach_msg.html">mach_msg_overwrite</a> - Send and/or receive messages with possible overwrite.<br>
+</p>
+Mach Message Queue Data Structures
+<p>
+<a href="mach_msg_descriptor.html">mach_msg_descriptor</a> - Specifies an element of a complex IPC message.<br>
+<a href="mach_msg_header.html">mach_msg_header</a> - Specifies the content of an IPC message header.<br>
+</p>
+</blockquote>
+
+<h4>Mach Lock-Set Interface</h4>
+<blockquote>
+<p>
+<a href="lock_acquire.html">lock_acquire</a> - Acquire ownership a lock<br>     
+<a href="lock_handoff.html">lock_handoff</a> - Hand-off ownership of a lock.<br>      
+<a href="lock_handoff_accept.html">lock_handoff_accept</a> - Accept lock ownership from a handoff.<br>      
+<a href="lock_make_stable.html">lock_make_stable</a> - Stabilize the state of the specified lock.<br>
+<a href="lock_release.html">lock_release</a> - Release ownership of a lock.<br>
+<a href="lock_set_create.html">lock_set_create</a> - Create a new lock set.<br>
+<a href="lock_set_destroy.html">lock_set_destroy</a> - Destroy a lock set and its associated locks.<br>
+<a href="lock_try.html">lock_try</a> - Attempt to acquire access rights to a lock.<br>
+</p>
+</blockquote>
+
+<h4>Mach Semaphore Interface</h4>
+<blockquote>
+<p>
+<a href="semaphore_create.html">semaphore_create</a> - Create a new semaphore.<br>
+<a href="semaphore_destroy.html">semaphore_destroy</a> - Destroy a semaphore.<br>
+<a href="semaphore_signal.html">semaphore_signal</a> - Increments the semaphore count.<br>
+<a href="semaphore_signal_all.html">semaphore_signal_all</a> - Wake up all threads blocked on a semaphore.<br>
+<a href="semaphore_wait.html">semaphore_wait</a> - Wait on the specified semaphore.<br>
+</p>
+</blockquote>
+
+<h4>Mach Port Management Interface</h4>
+<blockquote>
+<p>
+<a href="mach_port_allocate.html">mach_port_allocate</a> - Create caller-specified type of port right.<br>
+<a href="mach_port_allocate_full.html">mach_port_allocate_full</a> - Create a port right with full Mach port semantics.<br>
+<a href="mach_port_allocate_name.html">mach_port_allocate_name</a> - Create a port right with the caller-specified name.<br>
+<a href="mach_port_allocate_qos.html">mach_port_allocate_qos</a> - Allocate a port with specified "quality of service".<br>
+<a href="MP_allocate_subsystem.html">mach_port_allocate_subsystem</a> - Create a port right associated with the caller-specified subsystem.<br>
+<a href="mach_port_deallocate.html">mach_port_deallocate</a> - Decrement the target port right's user reference count.<br>
+<a href="mach_port_destroy.html">mach_port_destroy</a> - Deallocate all port rights associated with specified name.<br>
+<a href="mach_port_extract_right.html">mach_port_extract_right</a> - Remove the specified right from the target task and return it to the caller.<br>
+<a href="mach_port_get_attributes.html">mach_port_get_attributes</a> - Return information about target port as specified by the caller.<br>
+<a href="mach_port_get_refs.html">mach_port_get_refs</a> - Return the current count of user references on the target port right.<br>
+<a href="mach_port_get_set_status.html">mach_port_get_set_status</a> - Return the port right names contained in the target port set.<br>
+<a href="mach_port_insert_right.html">mach_port_insert_right</a> - Insert the specified port right into the target task.<br>
+<a href="mach_port_mod_refs.html">mach_port_mod_refs</a> - Modify the specified port right's count of user references.<br>
+<a href="mach_port_move_member.html">mach_port_move_member</a> - Move the specified receive right into or out of the specified port set.<br>
+<a href="mach_port_names.html">mach_port_names</a> - Return information about a task's port name space.<br>
+<a href="MP_request_notification.html">mach_port_request_notification</a> - Request notification of the specified port event type.<br>
+<a href="mach_port_set_attributes.html">mach_port_set_attributes</a> - Set the target port's attributes.<br>
+<a href="mach_port_set_mscount.html">mach_port_set_mscount</a> - Change the target port's make-send count.<br>
+<a href="mach_port_set_seqno.html">mach_port_set_seqno</a> - Change the current value of the target port's sequence number.<br>
+<a href="mach_port_type.html">mach_port_type</a> - Return the characteristics of the target port name.<br>
+<a href="mach_reply_port.html">mach_reply_port</a> - Allocate a new port and insert corresponding receive right in the calling task.<br>
+<a href="mach_subsystem_create.html"> mach_subsystem_create</a> - Used by a server to register information about an RPC subsystem with the kernel.<br>
+</p>
+Mach Port Data Structures
+<p>
+<a href="mach_port_limits.html">mach_port_limits</a> - Specifies a port's resource and message queue limits.<br>
+<a href="mach_port_qos.html">mach_port_qos</a> - Specifies a port's attributes with respect to "Quality Of Service."<br>
+<a href="mach_port_status.html">mach_port_status</a> - Used to present a port's current status with respect to various important attributes.<br>
+</p>
+Mach Port Notification Callbacks
+<p>
+<a href="do_mach_notify_dead_name.html">do_mach_notify_dead_name</a> - Handle the current instance of a dead-name notification.<br>
+<a href="do_mach_notify_no_senders.html">do_mach_notify_no_senders</a> - Handle the current instance of a no-more-senders notification.<br>
+<a href="DMN_port_deleted.html">do_mach_notify_port_deleted</a> - Handle the current instance of a port-deleted notification.<br>
+<a href="DMN_port_destroyed.html">do_mach_notify_port_destroyed</a> - Handle the current instance of a port-destroyed notification.<br>
+<a href="do_mach_notify_send_once.html">do_mach_notify_send_once</a> - Handle the current instance of a send-once notification.<br>
+</p>
+Mach Port Notification Callback Server Helpers
+<p>
+<a href="notify_server.html">notify_server</a> - Detect and handle a kernel-generated IPC notification.<br>
+</p>
+</blockquote>
+
+</blockquote>
+
+<h3>Mach Virtual Memory Interface</h3>
+<blockquote>
+<h4>Mach Virtual Memory Address Space Manipulation Interface</h4>
+<blockquote>
+<p>
+<a href="host_page_size.html">host_page_size</a> - Provide the system's virtual page size.<br>
+<a href="vm_allocate.html">vm_allocate</a> - Allocate a region of virtual memory.<br>
+<a href="vm_behavior_set.html">vm_behavior_set</a> - Specify expected access patterns for the target VM region.<br>
+<a href="vm_copy.html">vm_copy</a> - Copy a region of virtual memory.<br>
+<a href="vm_deallocate.html">vm_deallocate</a> - Deallocate a region of virtual memory.<br>
+<a href="vm_inherit.html">vm_inherit</a> - Set a VM region's inheritance attribute.<br>
+<a href="vm_machine_attribute.html">vm_machine_attribute</a> - Get/set the target memory region's special attributes.<br>
+<a href="vm_map.html">vm_map</a> - Map the specified memory object to a region of virtual memory.<br>
+<a href="vm_msync.html">vm_msync</a> - Synchronize the specified region of virtual memory.<br>
+<a href="vm_protect.html">vm_protect</a> - Set access privilege attribute for a region of virtual memory.<br>
+<a href="vm_read.html">vm_read</a> - Read the specified range of target task's address space.<br>
+<a href="vm_region.html">vm_region</a> - Return description of a virtual memory region.<br>
+<a href="vm_remap.html">vm_remap</a> - Map memory objects in one address space to that of another's.<br>
+<a href="vm_wire.html"> vm_wire</a> - Modify the target region's paging characteristics.<br>
+<a href="vm_write.html">vm_write</a> - Write data to the specified address in the target address space.<br>
+</p>
+Data Structures
+<p>
+<a href="vm_region_basic_info.html">vm_region_basic_info</a> - Defines the attributes of a task's memory region.<br>
+<a href="vm_statistics.html">vm_statistics</a> - Defines statistics for the kernel's use of virtual memory.<br>
+</p>
+</blockquote>
+
+<h4>External Memory Management Interface</h4>
+<blockquote>
+The External Memory Management Interface (EMMI) is undergoing significant change in the Darwin system.
+For this reason, the interface is not currently available to user-level programs.  Even for kernel
+extensions, use of these interfaces is not supported.  Instead, the BSD filesystem's Universal Buffer Cache (UBC)
+mechanism should be used.<br>
+<p>
+<a href="MO_change_attributes.html">memory_object_change_attributes</a> - Modify subset of memory object attributes.<br>
+<a href="memory_object_destroy.html">memory_object_destroy</a> - Shut down a memory object.<br>
+<a href="MO_get_attributes.html">memory_object_get_attributes</a> - Return current attributes for a memory object.<br>
+<a href="memory_object_lock_request.html">memory_object_lock_request</a> - Restrict access to memory object data.<br>
+<a href="MO_SY_completed.html">memory_object_synchronize_completed</a> - Synchronized data has been processed.<br>
+</p>
+Data Structures
+<p>
+<a href="memory_object_attr_info.html">memory_object_attr_info</a> - Defines memory object attributes.<br>
+<a href="memory_object_perf_info.html">memory_object_perf_info</a>- Specifies performance-related memory object attributes.<br>
+</p>
+External Memory Manager Interface Callbacks
+<p>
+<a href="memory_object_create.html">memory_object_create</a> - Assign a new memory object to the default memory manager.<br>
+<a href="MO_data_initialize.html">memory_object_data_initialize</a> - Provide initial data for a new memory object.<br>
+<a href="memory_object_data_request.html">memory_object_data_request</a> - Request that memory manager page-in specified data.<br>
+<a href="memory_object_data_return.html">memory_object_data_return</a> - Return memory object data to the appropriate memory manager.<br>
+<a href="memory_object_data_unlock.html">memory_object_data_unlock</a> - Request a memory manager release the lock on specific data.<br>
+<a href="memory_object_init.html">memory_object_init</a> - Inform a memory manager on first use of a memory object.<br>
+<a href="memory_object_synchronize.html">memory_object_synchronize</a> - Request synchronization of data with backing store.<br>
+<a href="memory_object_terminate.html">memory_object_terminate</a> - Relinquish access to a memory object.<br>
+</p>
+EMMI Callback Server Helpers
+<p>
+<a href="MO_default_server.html">memory_object_default_server</a> - Handle kernel operation request targeted for the default pager.<br>
+<a href="memory_object_server.html">memory_object_server</a> - Handle kernel operation request aimed at a given memory manager.<br>
+</p>
+</blockquote>
+
+<h4>Default Memory Management Interface</h4>
+<blockquote>
+<p>
+<a href="default_pager_add_segment.html">default_pager_add_segment</a> - Add additional backing storage for a default pager.<br>
+<a href="DP_backing_store_create.html">default_pager_backing_store_create</a> - Create a backing storage object.<br>
+<a href="DP_backing_store_delete.html"> default_pager_backing_store_delete</a> - Delete a backing storage object.<br>
+<a href="DP_backing_store_info.html">default_pager_backing_store_info</a> - Return information about a backing storage object.<br>
+<a href="default_pager_info.html">default_pager_info</a> - Furnish caller with information about the default pager.<br>
+<a href="DP_object_create.html">default_pager_object_create</a> - Initialize a non-persistent memory object.<br>
+<a href="HD_memory_manager.html">host_default_memory_manager</a> - Register/Lookup the host's default pager.<br>
+</p>
+</blockquote>
+
+</blockquote>
+
+<h3>Process Management Interface</h3>
+<blockquote>
+
+<h4>Task Interface</h4>
+<blockquote>
+<p>
+<a href="mach_ports_lookup.html">mach_ports_lookup</a> - Provide caller with an array of the target task's well-known ports.<br>
+<a href="mach_ports_register.html">mach_ports_register</a> - Register an array of well-known ports on behalf of the target task.<br>
+<a href="mach_task_self.html">mach_task_self</a> - Return a send right to the caller's task_self port.<br>
+<a href="task_create.html">task_create</a> - Create a new task.<br>
+<a href="task_get_emulation_vector.html">task_get_emulation_vector</a> - Return an array identifying the target task's user-level system call handlers.<br>
+<a href="task_get_exception_ports.html">task_get_exception_ports</a> - Return send rights to the target task's exception ports.<br>
+<a href="task_get_special_port.html">task_get_special_port</a> - Return a send write to the indicated special port.<br>
+<a href="task_info.html">task_info</a> - Return per-task information according to specified flavor.<br>
+<a href="task_resume.html">task_resume</a> - Decrement the target task's suspend count.<br>
+<a href="task_sample.html">task_sample</a> - Sample the target task's thread program counters periodically.<br>
+<a href="task_set_emulation.html">task_set_emulation</a> - Establish a user-level handler for a system call.<br>
+<a href="task_set_emulation_vector.html">task_set_emulation_vector</a> - Establish the target task's user-level system call handlers.<br>
+<a href="task_set_exception_ports.html">task_set_exception_ports</a> - Set target task's exception ports.<br>
+<a href="task_set_info.html">task_set_info</a> - Set task-specific information state.<br>
+<a href="task_set_port_space.html">task_set_port_space</a> - Set the size of the target task's port name space table.<br>
+<a href="task_set_special_port.html">task_set_special_port</a> - Set the indicated special port.<br>
+<a href="task_suspend.html">task_suspend</a> - Suspend the target task.<br>
+<a href="task_swap_exception_ports.html">task_swap_exception_ports</a> - Set target task's exception ports, returning the previous exception ports.<br>
+<a href="task_terminate.html">task_terminate</a> - Terminate the target task and deallocate its resources.<br>
+<a href="task_threads.html">task_threads</a> - Return the target task's list of threads.<br>
+</p>
+Task Data Structures
+<p>
+<a href="task_basic_info.html">task_basic_info</a> - Defines basic information for a task.<br>
+<a href="task_thread_times_info.html">task_thread_times_info</a> - Defines thread execution times information for tasks.<br>
+</p>
+</blockquote>
+
+<h4>Thread Interface</h4>
+<blockquote>
+<p>
+<a href="mach_thread_self.html">mach_thread_self</a> - Returns the thread self port.<br>
+<a href="thread_abort.html">thread_abort</a> - Abort a thread.<br>
+<a href="thread_abort_safely.html">thread_abort_safely</a> - Abort a thread, restartably.<br>
+<a href="thread_create.html">thread_create</a> - Create a thread within a task.<br>
+<a href="thread_create_running.html">thread_create_running</a> - Optimized creation of a running thread.<br>
+<a href="thread_depress_abort.html">thread_depress_abort</a> - Cancel thread scheduling depression.<br>
+<a href="thread_get_exception_ports.html">thread_get_exception_ports</a> - Return a send right to an exception port.<br>
+<a href="thread_get_special_port.html">thread_get_special_port</a> - Return a send right to the caller-specified special port.<br>
+<a href="thread_get_state.html">thread_get_state</a> - Return the execution state for a thread.<br>
+<a href="thread_info.html">thread_info</a> - Return information about a thread.<br>
+<a href="thread_resume.html">thread_resume</a> - Resume a thread.<br>
+<a href="thread_sample.html">thread_sample</a> - Perform periodic PC sampling for a thread.<br>
+<a href="thread_set_exception_ports.html">thread_set_exception_ports</a> - Set exception ports for a thread.<br>
+<a href="thread_set_special_port.html">thread_set_special_port</a> - Set caller-specified special port belonging to the target thread.<br>
+<a href="thread_set_state.html">thread_set_state</a> - Set the target thread's user-mode execution state.<br>
+<a href="thread_suspend.html">thread_suspend</a> - Suspend a thread.<br>
+<a href="TS_exception_ports.html">thread_swap_exception_ports</a> - Swap exception ports for a thread.<br>
+<a href="thread_terminate.html">thread_terminate</a> - Destroy a thread.<br>
+<a href="thread_wire.html">thread_wire</a> - Mark the thread as privileged with respect to kernel resources.<br>
+</p>
+Thread Data Structures
+<p>
+<a href="thread_basic_info.html">thread_basic_info</a> - Defines basic information for a thread.<br>
+</p>
+Thread Exception Callbacks
+<p>
+<a href="catch_exception_raise.html">catch_exception_raise</a> - Handles the occurrence of an exception within a thread.<br>
+</p>
+Thread Exception Callback Server Helpers
+<p>
+<a href="exc_server.html">exc_server</a> - Handle kernel-reported thread exception.<br>
+</p>
+</blockquote>
+
+<h4>Scheduling Interface</h4>
+<blockquote>
+<p>
+<a href="task_policy.html">task_policy</a> - Set target task's default scheduling policy state.<br>
+<a href="task_set_policy.html">task_set_policy</a> - Set target task's default scheduling policy state.<br>
+<a href="thread_policy.html">thread_policy</a> - Set target thread's scheduling policy state.<br>
+<a href="thread_set_policy.html">thread_set_policy</a> - Set target thread's scheduling policy state.<br>
+<a href="thread_switch.html">thread_switch</a> - Cause context switch with options.<br>
+</p>
+Scheduling Data Structures
+<p>
+<a href="policy_fifo_info.html">policy_fifo_info</a> - Specifies information associated with the system's First-In-First-Out scheduling policy.<br>
+<a href="policy_rr_info.html">policy_rr_info</a> - Specifies information associated with the system's Round Robin scheduling policy.<br>
+<a href="policy_timeshare_info.html">policy_timeshare_info</a> - Specifies information associated with the system's Timeshare scheduling policy.<br>
+</p>
+</blockquote>
+</blockquote>
+
+<h3>System Management Interface</h3>
+<blockquote>
+
+<h4>Host Interface</h4>
+<blockquote>
+<p>
+<a href="host_get_clock_service.html">host_get_clock_service</a> - Return a send right to a kernel clock's service port.<br>
+<a href="host_get_time.html">host_get_time</a> - Returns the current time as seen by that host.<br>
+<a href="host_info.html">host_info</a> - Return information about a host.<br>
+<a href="host_kernel_version.html">host_kernel_version</a> - Return kernel version information for a host.<br>
+<a href="host_statistics.html">host_statistics</a> - Return statistics for a host.<br>
+<a href="mach_host_self.html">mach_host_self</a> - Returns send rights to the task's host self port.<br>
+</p>
+Data Structures
+<p>
+<a href="host_basic_info.html">host_basic_info</a> - Used to present basic information about a host.<br>
+<a href="host_load_info.html">host_load_info</a> - Used to present a host's processor load information.<br>
+<a href="host_sched_info.html">host_sched_info</a> -  - Used to present the set of scheduler limits associated with the host.<br>
+<a href="kernel_resource_sizes.html">kernel_resource_sizes</a> - Used to present the sizes of kernel's major structures.<br>
+</p>
+</blockquote>
+
+<h4>Host Control Interface</h4>
+<blockquote>
+<p>
+<a href="host_adjust_time.html">host_adjust_time</a> - Arranges for the time on a specified host to be gradually changed by an adjustment value.<br>
+<a href="HD_memory_manager.html">host_default_memory_manager</a> - Set the default memory manager.<br>
+<a href="host_get_boot_info.html">host_get_boot_info</a> - Return operator boot information.<br>
+<a href="host_get_clock_control.html">host_get_clock_control</a> - Return a send right to a kernel clock's control port.<br>
+<a href="host_processor_slots.html">host_processor_slots</a> - Return a list of numbers that map processor slots to active processors.<br>
+<a href="host_processors.html">host_processors</a> - Return a list of send rights representing all processor ports.<br>
+<a href="host_reboot.html">host_reboot</a> - Reboot this host.<br>
+<a href="host_set_time.html">host_set_time</a> - Establishes the time on the specified host.<br>
+</p>
+</blockquote>
+
+<h4>Host Security Interface</h4>
+<blockquote>
+<p>
+<a href="host_security_create_task_token.html">host_security_create_task_token</a> - Create a new task with an explicit security token.<br>
+<a href="host_security_set_task_token.html">host_security_set_task_token</a> - Change the target task's security token.<br>
+</p>
+</blockquote>
+
+<h4>Resource Accounting Interface</h4>
+<blockquote>
+<i>
+The Mach resource accounting mechanism is not functional in the current Mac OS X/Darwin system.  It will become functional in a future release.
+</i>
+<p>
+<a href="ledger_create.html">ledger_create</a> - Create a subordinate ledger.<br>
+<a href="ledger_read.html">ledger_read</a> - Return the ledger limit and balance.<br>
+<a href="ledger_terminate.html">ledger_terminate</a> - Destroy a ledger.<br>
+<a href="ledger_transfer.html">ledger_transfer</a> - Transfer resources from a parent ledger to a child.<br>
+</p>
+</blockquote>
+
+<h4>Processor Management Interface</h4>
+<blockquote>
+<p>
+<a href="processor_control.html">processor_control</a> - Perform caller-specified operation on target processor.<br>
+<a href="processor_exit.html">processor_exit</a> - Exit a processor.<br>
+<a href="processor_info.html">processor_info</a> - Return information about a processor.<br>
+<a href="processor_start.html">processor_start</a> - Start a processor.<br>
+</p>
+Processor Data Structures
+<p>
+<a href="processor_basic_info.html">processor_basic_info</a> - Defines the basic information about a processor.<br>
+</p>
+</blockquote>
+
+<h4>Processor Set Interface</h4>
+<blockquote>
+<i>
+The processor set interface allows for the grouping of tasks and
+processors for the purpose of exclusive scheduling.  These interfaces
+are <b>deprecated</b> and should not be used in code that isn't tied
+to a particular release of Mac OS X/Darwin.  These will likely change
+or disappear in a future release.
+</i>
+<p>
+<a href="host_processor_sets.html">host_processor_sets</a> - Return a list of send rights representing all processor set name ports.<br>
+<a href="host_processor_set_priv.html">host_processor_set_priv</a> - Translate a processor set name port into a processor set control port.<br>
+<a href="processor_assign.html">processor_assign</a> - Assign a processor to a processor set.<br>
+<a href="processor_get_assignment.html">processor_get_assignment</a> - Get current assignment for a processor.<br>
+<a href="processor_set_create.html">processor_set_create</a> - Create a new processor set.<br>
+<a href="processor_set_default.html">processor_set_default</a> - Return the default processor set.<br>
+<a href="processor_set_destroy.html">processor_set_destroy</a> - Destroy the target processor set.<br>
+<a href="processor_set_info.html">processor_set_info</a> - Return processor set state according to caller-specified flavor.<br>
+<a href="processor_set_max_priority.html">processor_set_max_priority</a> - Sets the maximum scheduling priority for a processor set.<br>
+<a href="P_set_policy_control.html">processor_set_policy_control</a> - Set target processor set's scheduling policy state.<br>
+<a href="P_set_policy_disable.html">processor_set_policy_disable</a> - Enables a scheduling policy for a processor set.<br>
+<a href="P_set_policy_enable.html">processor_set_policy_enable</a> - Enables a scheduling policy for a processor set.<br>
+<a href="processor_set_statistics.html">processor_set_statistics</a> - Return scheduling statistics for a processor set.<br>
+<a href="processor_set_tasks.html">processor_set_tasks</a> - Return all tasks currently assigned to the target processor set.<br>
+<a href="processor_set_threads.html">processor_set_threads</a> - Return all threads currently assigned to the target processor set.<br>
+<a href="task_assign.html">task_assign</a> - Assign a task to a processor set.<br>
+<a href="task_assign_default.html">task_assign_default</a> -  Assign a task to the default processor set.<br>
+<a href="task_get_assignment.html">task_get_assignment</a> - Create a new task with an explicit security token.<br>
+<a href="thread_assign.html">thread_assign</a> - Assign a thread to a processor set.<br>
+<a href="thread_assign_default.html">thread_assign_default</a> - Assign a thread to the default processor set.<br>
+<a href="thread_get_assignment.html">thread_get_assignment</a> - Return the processor set to which a thread is assigned.<br>
+</p>
+Processor Set Data Structures
+<p>
+<a href="processor_set_basic_info.html">processor_set_basic_info</a> - Defines the basic information about a processor set.<br>
+<a href="processor_set_load_info.html">processor_set_load_info</a> - Defines the scheduling statistics for a processor set.<br>
+</p>
+</blockquote>
+
+<h4>Clock Interface</h4>
+<blockquote>
+<p>
+<a href="clock_alarm.html">clock_alarm</a> - Set up an alarm.<br>
+<a href="clock_get_attributes.html">clock_get_attributes</a> - Return attributes of a clock.<br>
+<a href="clock_get_time.html">clock_get_time</a> - Return the current time.<br>
+<a href="clock_map_time.html">clock_map_time</a> - Return a memory object that maps a clock.<br>
+<a href="clock_set_attributes.html">clock_set_attributes</a> - Set a particular clock's attributes.<br>
+<a href="clock_set_time.html">clock_set_time</a> - Set the current time.<br>
+<a href="clock_sleep.html">clock_sleep</a> - Delay the invoking thread until a specified time.<br>
+</p>
+Clock Data Structures
+<p>
+<a href="mapped_tvalspec.html">mapped_tvalspec</a> - Specifies the format the kernel uses to maintain a mapped clock's time.<br>
+<a href="tvalspec.html">tvalspec</a> - Defines format of system time values.<br>
+</p>
+Clock Interface Callbacks
+<p>
+<a href="clock_alarm_reply.html">clock_alarm_reply</a> - Ring a preset alarm.<br>
+</p>
+Clock Callback Server Helpers
+<p>
+<a href="clock_reply_server.html"> clock_reply_server</a> - Handle kernel-generated alarm.<br>
+</p>
+</blockquote>
+
+<h4>Multi-Computer Support Interface</h4>
+<blockquote>
+<i>
+These multi-computer support interfaces are no longer supported by
+the Mac OS X/Darwin kernel.  If and when multi-computer support is
+added back in, something like these will likely be added.
+</i>
+<p>
+<a href="host_page_size.html">host_page_size</a> - Returns the page size for the given host.<br>
+<a href="ledger_get_remote.html">ledger_get_remote</a> - Return send right to specified host's remote ledger port.<br>
+<a href="ledger_set_remote.html">ledger_set_remote</a> - Set this host's remote ledger port.<br>
+</p>
+</blockquote>
+
+</blockquote>
+
+<h3>Machine Specific Interface</h3>
+<blockquote>
+
+<h4>Intel 386 Support</h4>
+<blockquote>
+<p>
+<a href="i386_get_ldt.html">i386_get_ldt</a> - Returns per-thread segment descriptors from the local descriptor table (LDT).<br>
+<a href="i386_io_port_add.html">i386_io_port_add</a> - Adds a device to the I/O permission bitmap for a thread. <br>
+<a href="i386_io_port_list.html">i386_io_port_list</a> - Returns a list of the devices named in the thread's I/O permission bitmap.<br>
+<a href="i386_io_port_remove.html">i386_io_port_remove</a> - Removes the specified device from the thread's I/O permission bitmap.<br>
+<a href="i386_set_ldt.html">i386_set_ldt</a> - Allows a thread to have a private local descriptor table (LDT).<br>
+</p>
+</blockquote>
+
+<h4>PowerPC Support</h4>
+<blockquote>
+<p>
+</p>
+</blockquote>
+
+</blockquote>
+
+</BODY>
+
+</HTML>
+
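
The semaphore routines catalogued in this index are reachable from user space via <mach/mach.h>; a minimal create/signal/wait/destroy sketch (error handling omitted for brevity):

    #include <mach/mach.h>

    int
    main(void)
    {
    	semaphore_t sem;

    	/* Counting semaphore, FIFO wakeup order, initial value 0. */
    	semaphore_create(mach_task_self(), &sem, SYNC_POLICY_FIFO, 0);

    	semaphore_signal(sem);  /* count 0 -> 1 */
    	semaphore_wait(sem);    /* count 1 -> 0; would block at 0 */

    	semaphore_destroy(mach_task_self(), sem);
    	return 0;
    }
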
index 99624e77e5867eafedeec9766a7c07422df66233..a12ca10c8b7b235b42f3d8b475a6bf198460ce1d 100644 (file)
@@ -32,6 +32,9 @@
 #include <kern/thread.h>
 #if defined(__arm64__)
 #include <pexpert/arm64/board_config.h>
+#if XNU_MONITOR
+#include <arm64/ppl/tests/shart.h>
+#endif
 #endif
 
 extern ledger_template_t task_ledger_template;
@@ -122,7 +125,152 @@ test_pmap_enter_disconnect(unsigned int num_loops)
 kern_return_t
 test_pmap_iommu_disconnect(void)
 {
+#if XNU_MONITOR
+       kern_return_t kr = KERN_SUCCESS;
+       pmap_t new_pmap = pmap_create_wrapper(0);
+
+       vm_page_t m = vm_page_grab();
+
+       vm_page_lock_queues();
+       if (m != VM_PAGE_NULL) {
+               vm_page_wire(m, VM_KERN_MEMORY_PTE, TRUE);
+       }
+       vm_page_unlock_queues();
+
+       shart_ppl *iommu = NULL;
+       kr = pmap_iommu_init(shart_get_desc(), "sharttest0", NULL, 0, (ppl_iommu_state**)(&iommu));
+
+       if (kr != KERN_SUCCESS) {
+               goto cleanup;
+       }
+
+       if ((new_pmap == NULL) || (m == VM_PAGE_NULL) || (iommu == NULL)) {
+               kr = KERN_FAILURE;
+               goto cleanup;
+       }
+
+       ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
+
+       const ppl_iommu_seg shart_segs[] = {
+               {.iova = 0,
+                .paddr = ptoa(phys_page),
+                .nbytes = PAGE_SIZE,
+                .prot = VM_PROT_READ,
+                .refcon = 0},
+
+               {.iova = 1,
+                .paddr = ptoa(phys_page),
+                .nbytes = PAGE_SIZE,
+                .prot = VM_PROT_READ | VM_PROT_WRITE,
+                .refcon = 0},
+
+               {.iova = 2,
+                .paddr = ptoa(phys_page),
+                .nbytes = PAGE_SIZE,
+                .prot = VM_PROT_READ,
+                .refcon = 0},
+
+               {.iova = 3,
+                .paddr = ptoa(phys_page),
+                .nbytes = PAGE_SIZE,
+                .prot = VM_PROT_READ,
+                .refcon = 0}
+       };
+
+       /* Phase 1: one CPU mapping */
+       kr = pmap_enter(new_pmap, PMAP_TEST_VA, phys_page, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, FALSE);
+       assert(kr == KERN_SUCCESS);
+       assert(!pmap_verify_free(phys_page));
+       pmap_disconnect(phys_page);
+       assert(pmap_verify_free(phys_page));
+
+       /* Phase 2: two CPU mappings */
+       kr = pmap_enter(new_pmap, PMAP_TEST_VA, phys_page, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, FALSE);
+       assert(kr == KERN_SUCCESS);
+       kr = pmap_enter(new_pmap, PMAP_TEST_VA + PAGE_SIZE, phys_page, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, FALSE);
+       assert(kr == KERN_SUCCESS);
+       assert(!pmap_verify_free(phys_page));
+       pmap_disconnect(phys_page);
+       assert(pmap_verify_free(phys_page));
+
+       /* Phase 3: one IOMMU mapping */
+       kr = pmap_iommu_map(&iommu->super, shart_segs, 1, 0, NULL);
+       assert(kr == KERN_SUCCESS);
+       assert(!pmap_verify_free(phys_page));
+       pmap_disconnect(phys_page);
+       assert(!pmap_verify_free(phys_page));
+       pmap_iommu_unmap(&iommu->super, shart_segs, 1, 0, NULL);
+       assert(pmap_verify_free(phys_page));
+
+       /* Phase 4: two IOMMU mappings */
+       kr = pmap_iommu_map(&iommu->super, shart_segs, 2, 0, NULL);
+       assert(kr == KERN_SUCCESS);
+       assert(!pmap_verify_free(phys_page));
+       pmap_disconnect(phys_page);
+       assert(!pmap_verify_free(phys_page));
+       pmap_iommu_unmap(&iommu->super, &shart_segs[1], 1, 0, NULL);
+       assert(!pmap_verify_free(phys_page));
+       pmap_disconnect(phys_page);
+       assert(!pmap_verify_free(phys_page));
+       pmap_iommu_unmap(&iommu->super, shart_segs, 1, 0, NULL);
+       assert(pmap_verify_free(phys_page));
+
+       /* Phase 5: combined CPU and IOMMU mappings */
+       kr = pmap_iommu_map(&iommu->super, shart_segs, 1, 0, NULL);
+       assert(kr == KERN_SUCCESS);
+       kr = pmap_enter(new_pmap, PMAP_TEST_VA, phys_page, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, FALSE);
+       assert(kr == KERN_SUCCESS);
+       kr = pmap_iommu_map(&iommu->super, &shart_segs[1], 2, 0, NULL);
+       assert(kr == KERN_SUCCESS);
+       kr = pmap_enter(new_pmap, PMAP_TEST_VA + PAGE_SIZE, phys_page, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, FALSE);
+       assert(kr == KERN_SUCCESS);
+       kr = pmap_iommu_map(&iommu->super, &shart_segs[3], 1, 0, NULL);
+       assert(kr == KERN_SUCCESS);
+       assert(!pmap_verify_free(phys_page));
+       pmap_disconnect(phys_page);
+       assert(!pmap_verify_free(phys_page));
+       pmap_iommu_unmap(&iommu->super, shart_segs, 4, 0, NULL);
+       assert(pmap_verify_free(phys_page));
+
+       /* Phase 6: differently combined CPU and IOMMU mappings */
+       kr = pmap_enter(new_pmap, PMAP_TEST_VA, phys_page, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, FALSE);
+       assert(kr == KERN_SUCCESS);
+       kr = pmap_iommu_map(&iommu->super, &shart_segs[1], 3, 0, NULL);
+       assert(kr == KERN_SUCCESS);
+       kr = pmap_enter(new_pmap, PMAP_TEST_VA + PAGE_SIZE, phys_page, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, FALSE);
+       assert(kr == KERN_SUCCESS);
+       kr = pmap_iommu_map(&iommu->super, shart_segs, 1, 0, NULL);
+       assert(kr == KERN_SUCCESS);
+       kr = pmap_enter(new_pmap, PMAP_TEST_VA + (2 * PAGE_SIZE), phys_page, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, FALSE);
+       assert(kr == KERN_SUCCESS);
+       assert(!pmap_verify_free(phys_page));
+       pmap_iommu_unmap(&iommu->super, &shart_segs[2], 1, 0, NULL);
+       assert(!pmap_verify_free(phys_page));
+       pmap_disconnect(phys_page);
+       assert(!pmap_verify_free(phys_page));
+       pmap_iommu_unmap(&iommu->super, shart_segs, 4, 0, NULL);
+       assert(pmap_verify_free(phys_page));
+       pmap_disconnect(phys_page);
+       assert(pmap_verify_free(phys_page));
+
+cleanup:
+
+       if (iommu != NULL) {
+               pmap_iommu_ioctl(&iommu->super, SHART_IOCTL_TEARDOWN, NULL, 0, NULL, 0);
+       }
+       vm_page_lock_queues();
+       if (m != VM_PAGE_NULL) {
+               vm_page_free(m);
+       }
+       vm_page_unlock_queues();
+       if (new_pmap != NULL) {
+               pmap_destroy(new_pmap);
+       }
+
+       return kr;
+#else
        return KERN_SUCCESS;
+#endif
 }
 
 kern_return_t
index 305c8d67700ad6386fba685f4475045d90215f71..6146c8e40417985bfcd06669603a5a66616df055 100644 (file)
@@ -15951,6 +15951,13 @@ RestartCopy:
                if (!copy) {
                        if (src_entry->used_for_jit == TRUE) {
                                if (same_map) {
+#if __APRR_SUPPORTED__
+                                       /*
+                                        * Disallow re-mapping of any JIT regions on APRR devices.
+                                        */
+                                       result = KERN_PROTECTION_FAILURE;
+                                       break;
+#endif /* __APRR_SUPPORTED__*/
                                } else {
 #if CONFIG_EMBEDDED
                                        /*
index 21b7d3951387548463a4c6b08c533cb342c2ce29..33344f15e20a40008f2c740c4aef88669bcf8e71 100644 (file)
@@ -347,6 +347,11 @@ uint32_t vm_pageout_memorystatus_fb_factor_dr = 2;
 
 #endif
 
+#if __AMP__
+int vm_compressor_ebound = 1;
+int vm_pgo_pbound = 0;
+extern void thread_bind_cluster_type(char);
+#endif /* __AMP__ */
 
 
 /*
@@ -3932,7 +3937,16 @@ vm_pageout_iothread_internal_continue(struct cq *cq)
        KERNEL_DEBUG(0xe040000c | DBG_FUNC_END, 0, 0, 0, 0, 0);
 
        q = cq->q;
+#if __AMP__
+       if (vm_compressor_ebound && (vm_pageout_state.vm_compressor_thread_count > 1)) {
+               local_batch_size = (q->pgo_maxlaundry >> 3);
+               local_batch_size = MAX(local_batch_size, 16);
+       } else {
+               local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
+       }
+#else
        local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
+#endif
 
 #if RECORD_THE_COMPRESSED_DATA
        if (q->pgo_laundry) {
@@ -4317,6 +4331,11 @@ vm_pageout_iothread_internal(struct cq *cq)
        }
 
 
+#if __AMP__
+       if (vm_compressor_ebound) {
+               thread_bind_cluster_type('E');
+       }
+#endif /* __AMP__ */
 
        thread_set_thread_name(current_thread(), "VM_compressor");
 #if DEVELOPMENT || DEBUG
@@ -4723,6 +4742,12 @@ vm_pageout(void)
 
 
 
+#if __AMP__
+       PE_parse_boot_argn("vmpgo_pcluster", &vm_pgo_pbound, sizeof(vm_pgo_pbound));
+       if (vm_pgo_pbound) {
+               thread_bind_cluster_type('P');
+       }
+#endif /* __AMP__ */
 
        splx(s);
 
@@ -4996,6 +5021,12 @@ vm_pageout_internal_start(void)
        PE_parse_boot_argn("vmcomp_threads", &vm_pageout_state.vm_compressor_thread_count,
            sizeof(vm_pageout_state.vm_compressor_thread_count));
 
+#if     __AMP__
+       PE_parse_boot_argn("vmcomp_ecluster", &vm_compressor_ebound, sizeof(vm_compressor_ebound));
+       if (vm_compressor_ebound) {
+               vm_pageout_state.vm_compressor_thread_count = 2;
+       }
+#endif
        if (vm_pageout_state.vm_compressor_thread_count >= hinfo.max_cpus) {
                vm_pageout_state.vm_compressor_thread_count = hinfo.max_cpus - 1;
        }
index 34ec23be76a04290cb10264e771543a16c91a20f..b35a029e9b370b39fdab5794f7809324db4208b4 100644 (file)
@@ -309,10 +309,25 @@ static struct tbd_ops    t8011_funcs = {NULL, NULL, NULL};
 static struct tbd_ops    t8015_funcs = {NULL, NULL, NULL};
 #endif /* defined(ARM_BOARD_CLASS_T8015) */
 
+#if defined(ARM_BOARD_CLASS_T8020)
+static struct tbd_ops    t8020_funcs = {NULL, NULL, NULL};
+#endif /* defined(ARM_BOARD_CLASS_T8020) */
 
+#if defined(ARM_BOARD_CLASS_T8006)
+static struct tbd_ops    t8006_funcs = {NULL, NULL, NULL};
+#endif /* defined(ARM_BOARD_CLASS_T8006) */
 
+#if defined(ARM_BOARD_CLASS_T8027)
+static struct tbd_ops    t8027_funcs = {NULL, NULL, NULL};
+#endif /* defined(ARM_BOARD_CLASS_T8027) */
 
+#if defined(ARM_BOARD_CLASS_T8028)
+static struct tbd_ops    t8028_funcs = {NULL, NULL, NULL};
+#endif /* defined(ARM_BOARD_CLASS_T8028) */
 
+#if defined(ARM_BOARD_CLASS_T8030)
+static struct tbd_ops    t8030_funcs = {NULL, NULL, NULL};
+#endif /* defined(ARM_BOARD_CLASS_T8030) */
 
 
 
@@ -733,6 +748,31 @@ pe_arm_init_timer(void *args)
                tbd_funcs = &t8015_funcs;
        } else
 #endif
+#if defined(ARM_BOARD_CLASS_T8020)
+       if (!strcmp(gPESoCDeviceType, "t8020-io")) {
+               tbd_funcs = &t8020_funcs;
+       } else
+#endif
+#if defined(ARM_BOARD_CLASS_T8006)
+       if (!strcmp(gPESoCDeviceType, "t8006-io")) {
+               tbd_funcs = &t8006_funcs;
+       } else
+#endif
+#if defined(ARM_BOARD_CLASS_T8027)
+       if (!strcmp(gPESoCDeviceType, "t8027-io")) {
+               tbd_funcs = &t8027_funcs;
+       } else
+#endif
+#if defined(ARM_BOARD_CLASS_T8028)
+       if (!strcmp(gPESoCDeviceType, "t8028-io")) {
+               tbd_funcs = &t8028_funcs;
+       } else
+#endif
+#if defined(ARM_BOARD_CLASS_T8030)
+       if (!strcmp(gPESoCDeviceType, "t8030-io")) {
+               tbd_funcs = &t8030_funcs;
+       } else
+#endif
 #if defined(ARM_BOARD_CLASS_BCM2837)
        if (!strcmp(gPESoCDeviceType, "bcm2837-io")) {
                tbd_funcs = &bcm2837_funcs;
index 3d32aca8b6624d47d6b18f80f2c0827392961bb1..fe0b98768c7e14a37125a4c4a0bcd472cb555a9d 100644 (file)
 
 #endif /* defined (HAS_KTRR) */
 
+#if defined(HAS_CTRR)
 
+#ifdef ASSEMBLER
+#define ARM64_REG_CTRR_A_LWR_EL1 S3_4_c15_c2_3
+#define ARM64_REG_CTRR_A_UPR_EL1 S3_4_c15_c2_4
+#define ARM64_REG_CTRR_CTL_EL1   S3_4_c15_c2_5
+#define ARM64_REG_CTRR_LOCK_EL1  S3_4_c15_c2_2
+
+#define ACC_CTRR_A_LWR_EL2       S3_4_c15_c11_0
+#define ACC_CTRR_A_UPR_EL2       S3_4_c15_c11_1
+#define ACC_CTRR_CTL_EL2         S3_4_c15_c11_4
+#define ACC_CTRR_LOCK_EL2        S3_4_c15_c11_5
+#else /* ASSEMBLER */
+#define ARM64_REG_CTRR_A_LWR_EL1 "S3_4_c15_c2_3"
+#define ARM64_REG_CTRR_A_UPR_EL1 "S3_4_c15_c2_4"
+#define ARM64_REG_CTRR_CTL_EL1   "S3_4_c15_c2_5"
+#define ARM64_REG_CTRR_LOCK_EL1  "S3_4_c15_c2_2"
+
+#define ACC_CTRR_A_LWR_EL2       "S3_4_c15_c11_0"
+#define ACC_CTRR_A_UPR_EL2       "S3_4_c15_c11_1"
+#define ACC_CTRR_CTL_EL2         "S3_4_c15_c11_4"
+#define ACC_CTRR_LOCK_EL2        "S3_4_c15_c11_5"
+#endif /* ASSEMBLER */
+
+#define CTRR_CTL_EL1_A_MMUOFF_WRPROTECT  (1 << 0)
+#define CTRR_CTL_EL1_A_MMUON_WRPROTECT   (1 << 1)
+#define CTRR_CTL_EL1_B_MMUOFF_WRPROTECT  (1 << 2)
+#define CTRR_CTL_EL1_B_MMUON_WRPROTECT   (1 << 3)
+#define CTRR_CTL_EL1_A_PXN               (1 << 4)
+#define CTRR_CTL_EL1_B_PXN               (1 << 5)
+#define CTRR_CTL_EL1_A_UXN               (1 << 6)
+#define CTRR_CTL_EL1_B_UXN               (1 << 7)
+
+#endif /* defined (HAS_CTRR) */
+
+#if defined(HAS_IPI)
+
+#define ARM64_REG_IPI_RR_TYPE_IMMEDIATE (0 << 28)
+#define ARM64_REG_IPI_RR_TYPE_RETRACT   (1 << 28)
+#define ARM64_REG_IPI_RR_TYPE_DEFERRED  (2 << 28)
+#define ARM64_REG_IPI_RR_TYPE_NOWAKE    (3 << 28)
+
+#if defined(HAS_CLUSTER)
+#define ARM64_REG_IPI_RR_LOCAL          "S3_5_c15_c0_0"
+#define ARM64_REG_IPI_RR_GLOBAL         "S3_5_c15_c0_1"
+#else /* defined(HAS_CLUSTER) */
+#define ARM64_REG_IPI_RR                "S3_5_c15_c0_1"
+#endif /* defined(HAS_CLUSTER) */
+
+#define ARM64_REG_IPI_SR                "S3_5_c15_c1_1"
+#define ARM64_REG_IPI_CR                "S3_5_c15_c3_1"
+
+#endif /* defined(HAS_IPI) */
 
 
 #endif /* APPLE_ARM64_ARCH_FAMILY */
 
+#if defined(HAS_NEX_PG)
+#define ARM64_REG_HID13             S3_0_c15_c14_0
+#define ARM64_REG_HID13_RstCyc_mask (0xfULL << 60)
+#define ARM64_REG_HID13_RstCyc_val  (0xcULL << 60)
+
+#define ARM64_REG_HID14             S3_0_c15_c15_0
+#define ARM64_REG_HID14_NexPwgEn    (1ULL << 32)
+#endif /* defined(HAS_NEX_PG) */
 
+#if defined(HAS_BP_RET)
+#define ARM64_REG_ACC_CFG             S3_5_c15_c4_0
+#define ARM64_REG_ACC_CFG_bdpSlpEn    (1ULL << 2)
+#define ARM64_REG_ACC_CFG_btpSlpEn    (1ULL << 3)
+#define ARM64_REG_ACC_CFG_bpSlp_mask  3
+#define ARM64_REG_ACC_CFG_bpSlp_shift 2
+#endif /* defined(HAS_BP_RET) */
 
 #if defined(HAS_APPLE_PAC)
 
 #endif /* ASSEMBLER */
 #endif /* HAS_APPLE_PAC */
 
+#if defined(HAS_VMSA_LOCK)
+
+#define ARM64_REG_VMSA_LOCK_EL1 S3_4_c15_c1_2
+
+#define VMSA_LOCK_VBAR_EL1      (1ULL << 0)
+#define VMSA_LOCK_SCTLR_EL1     (1ULL << 1)
+#define VMSA_LOCK_TCR_EL1       (1ULL << 2)
+#define VMSA_LOCK_TTBR0_EL1     (1ULL << 3)
+#define VMSA_LOCK_TTBR1_EL1     (1ULL << 4)
+#define VMSA_LOCK_SCTLR_M_BIT   (1ULL << 63)
+
+#endif /* HAS_VMSA_LOCK */
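
For illustration, the non-ASSEMBLER (quoted) spellings of the CTRR registers above are the form the compiler MSR builtins accept. A hedged sketch of programming and locking the EL1 A-region follows; the rorgn_begin/rorgn_end bounds, the helper name, and the exact control bits chosen here are assumptions for the sketch, not taken from this diff:

    #if defined(HAS_CTRR) && !defined(ASSEMBLER)
    /*
     * Illustrative sketch: set the A-region bounds, enable write protection
     * with the MMU both off and on, then write the lock register so the
     * window can no longer be modified.  Bounds are hypothetical.
     */
    static void
    ctrr_lock_sketch(uint64_t rorgn_begin, uint64_t rorgn_end)
    {
    	__builtin_arm_wsr64(ARM64_REG_CTRR_A_LWR_EL1, rorgn_begin);
    	__builtin_arm_wsr64(ARM64_REG_CTRR_A_UPR_EL1, rorgn_end);
    	__builtin_arm_wsr64(ARM64_REG_CTRR_CTL_EL1,
    	    CTRR_CTL_EL1_A_MMUOFF_WRPROTECT | CTRR_CTL_EL1_A_MMUON_WRPROTECT);
    	__asm__ volatile ("isb sy" ::: "memory");
    	__builtin_arm_wsr64(ARM64_REG_CTRR_LOCK_EL1, 1ULL);
    }
    #endif /* HAS_CTRR && !ASSEMBLER */
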
 
 
 
index bad75685783cfbbbea7a031ee248d34e7875d796..90851847f90b8d1d1bf9a6fa1c85ab3168e08f1b 100644 (file)
 #endif
 #endif  /* ARM64_BOARD_CONFIG_T8015 */
 
+#ifdef ARM64_BOARD_CONFIG_T8020
+/*
+ * The LLC size for Vortex is 8MB, but the LLC on Tempest is only 2MB.
+ * We use the larger cache size here.  The expectation is
+ * that this may cause flushes from Tempest to be less efficient
+ * (cycles will be wasted on unnecessary way/set operations), but it
+ * will be technically correct... the best kind of correct.
+ */
+#define APPLE_ARM64_ARCH_FAMILY  1
+#define APPLEVORTEX
+#define ARM_ARCH_TIMER
+#define KERNEL_INTEGRITY_CTRR
+#include <pexpert/arm64/T8020.h>
+#define __ARM_L2CACHE_SIZE_LOG__ 23
+#define ARM_BOARD_WFE_TIMEOUT_NS 1000
+#define ARM_BOARD_CLASS_T8020
+#define CPU_COUNT 6
+#define CPU_CLUSTER_OFFSETS {0, 4}
+#define HAS_UNCORE_CTRS 1
+#define UNCORE_VERSION 2
+#define UNCORE_PER_CLUSTER 1
+#define UNCORE_NCTRS 16
+#define CORE_NCTRS 10
+#define PMAP_PV_LOAD_FACTOR 5
+#define PMAP_CS             1
+#define PMAP_CS_ENABLE      1
+#endif  /* ARM64_BOARD_CONFIG_T8020 */
 
+#ifdef ARM64_BOARD_CONFIG_T8006
+/*
+ * The T8006 consists of 2 Tempest cores (i.e. T8020 eCores) and for most
+ * of our purposes here may be considered a functional subset of T8020.
+ */
+#define APPLE_ARM64_ARCH_FAMILY  1
+#define APPLEVORTEX
+#define ARM_ARCH_TIMER
+#define KERNEL_INTEGRITY_CTRR
+#include <pexpert/arm64/T8020.h>
+#define __ARM_L2CACHE_SIZE_LOG__ 21
+#define ARM_BOARD_WFE_TIMEOUT_NS 1000
+#define ARM_BOARD_CLASS_T8006
+#define PEXPERT_NO_3X_IMAGES    1
+#define CORE_NCTRS 10
+#define PMAP_PV_LOAD_FACTOR 5
+#define PMAP_CS             1
+#define PMAP_CS_ENABLE      1
+#endif /* ARM64_BOARD_CONFIG_T8006 */
 
+#ifdef ARM64_BOARD_CONFIG_T8027
+#define APPLE_ARM64_ARCH_FAMILY  1
+#define APPLEVORTEX
+#define ARM_ARCH_TIMER
+#define KERNEL_INTEGRITY_CTRR
+#include <pexpert/arm64/T8020.h>
+#define __ARM_L2CACHE_SIZE_LOG__ 23
+#define ARM_BOARD_WFE_TIMEOUT_NS 1000
+#define ARM_BOARD_CLASS_T8027
+#define CPU_COUNT 8
+#define CPU_CLUSTER_OFFSETS {0, 4}
+#define HAS_UNCORE_CTRS 1
+#define UNCORE_VERSION 2
+#define UNCORE_PER_CLUSTER 1
+#define UNCORE_NCTRS 16
+#define CORE_NCTRS 10
+#define PMAP_PV_LOAD_FACTOR 5
+#define PMAP_CS             1
+#define PMAP_CS_ENABLE      1
+#endif  /* ARM64_BOARD_CONFIG_T8027 */
 
+#ifdef ARM64_BOARD_CONFIG_T8028
+#define APPLE_ARM64_ARCH_FAMILY  1
+#define APPLEVORTEX
+#define ARM_ARCH_TIMER
+#define KERNEL_INTEGRITY_CTRR
+#include <pexpert/arm64/T8020.h>
+#define __ARM_L2CACHE_SIZE_LOG__ 23
+#define ARM_BOARD_WFE_TIMEOUT_NS 1000
+#define ARM_BOARD_CLASS_T8028
+#define CPU_COUNT 8
+#define CPU_CLUSTER_OFFSETS {0, 4}
+#define HAS_UNCORE_CTRS 1
+#define UNCORE_VERSION 2
+#define UNCORE_PER_CLUSTER 1
+#define UNCORE_NCTRS 16
+#define CORE_NCTRS 10
+#define PMAP_PV_LOAD_FACTOR 5
+#define PMAP_CS             1
+#define PMAP_CS_ENABLE      1
+#endif  /* ARM64_BOARD_CONFIG_T8028 */
 
+#ifdef ARM64_BOARD_CONFIG_T8030
+/*
+ * The LLC size for Lightning is 8MB, but the LLC on Thunder is only 4MB.
+ * We use the larger cache size here.  The expectation is
+ * that this may cause flushes from Thunder to be less efficient
+ * (cycles will be wasted on unnecessary way/set operations), but it
+ * will be technically correct... the best kind of correct.
+ */
+#define APPLE_ARM64_ARCH_FAMILY  1
+#define APPLELIGHTNING
+#define ARM_ARCH_TIMER
+#define KERNEL_INTEGRITY_CTRR
+#include <pexpert/arm64/T8030.h>
+#define __ARM_L2CACHE_SIZE_LOG__ 23
+#define ARM_BOARD_WFE_TIMEOUT_NS 1000
+#define ARM_BOARD_CLASS_T8030
+#define CPU_COUNT 6
+#define CPU_CLUSTER_OFFSETS {0, 4}
+#define CPU_PIO_RO_CTL_OFFSETS {0x210055000, 0x210155000, 0x210255000, 0x210355000, 0x211055000, 0x211155000}
+#define CLUSTER_PIO_RO_CTL_OFFSETS {0x210e49000, 0x211e49000}
+#define HAS_UNCORE_CTRS 1
+#define UNCORE_VERSION 2
+#define UNCORE_PER_CLUSTER 1
+#define UNCORE_NCTRS 16
+#define CORE_NCTRS 10
+#define PMAP_PV_LOAD_FACTOR 7
+#define PMAP_CS             1
+#define PMAP_CS_ENABLE      1
+#endif  /* ARM64_BOARD_CONFIG_T8030 */
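For reference, __ARM_L2CACHE_SIZE_LOG__ encodes the LLC size as log2 of its size in bytes, which is how the 8 MB and 2 MB figures in the comments map to 23 and 21. An illustration (the derived macro is hypothetical, shown only to make the arithmetic explicit):

    /* Illustration only: the board configs express the LLC size as log2(bytes). */
    #define LLC_SIZE_BYTES  (1ULL << __ARM_L2CACHE_SIZE_LOG__)
    /* T8020/T8027/T8028/T8030: 1ULL << 23 == 8 MB;  T8006: 1ULL << 21 == 2 MB. */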
 
 
 
index 909a075efde4bc14f2d6b0bc4c26281aa92b4234..7fa3a8e5657c187a364b0d580e4ba400d6705c84 100644 (file)
@@ -265,7 +265,7 @@ kasan_arch_init(void)
        /* Map the physical aperture */
        kasan_map_shadow(kernel_vtop, physmap_vtop - kernel_vtop, true);
 
-#if defined(KERNEL_INTEGRITY_KTRR)
+#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
        /* Pre-allocate all the L3 page table pages to avoid triggering KTRR */
        kasan_map_shadow_internal(VM_MIN_KERNEL_ADDRESS, VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS + 1, false, false);
 #endif
index 3f9b671989d6caa8faf7f457ebae5fcf2cacfe11..e8c27a348179e2109b2a56ca17cecd3a5e209a47 100644 (file)
@@ -339,6 +339,7 @@ void    mac_posixshm_label_init(struct pshminfo *pshm);
 int     mac_priv_check(kauth_cred_t cred, int priv);
 int     mac_priv_grant(kauth_cred_t cred, int priv);
 int     mac_proc_check_debug(proc_t proc1, proc_t proc2);
+int     mac_proc_check_dump_core(proc_t proc);
 int     mac_proc_check_proc_info(proc_t curp, proc_t target, int callnum, int flavor);
 int     mac_proc_check_get_cs_info(proc_t curp, proc_t target, unsigned int op);
 int     mac_proc_check_set_cs_info(proc_t curp, proc_t target, unsigned int op);
index 9baaa2df9a337297221c6eb46201830c7a7d3193..1b46adf7ae247bec28de12c203e8e1968badb069 100644 (file)
@@ -2751,6 +2751,19 @@ typedef int mpo_priv_grant_t(
        kauth_cred_t cred,
        int priv
        );
+/**
+ *  @brief Access control over process core dumps
+ *  @param proc Subject process
+ *
+ *  Determine whether a core dump may be written to disk for the identified
+ *  subject process.
+ *
+ *  @return Return 0 if access is granted, otherwise an appropriate value for
+ *  errno should be returned.
+ */
+typedef int mpo_proc_check_dump_core_t(
+       struct proc *proc
+       );
 /**
  *  @brief Access control check for debugging process
  *  @param cred Subject credential
@@ -6283,7 +6296,7 @@ typedef void mpo_reserved_hook_t(void);
  * Please note that this should be kept in sync with the check assumptions
  * policy in bsd/kern/policy_check.c (policy_ops struct).
  */
-#define MAC_POLICY_OPS_VERSION 58 /* inc when new reserved slots are taken */
+#define MAC_POLICY_OPS_VERSION 59 /* inc when new reserved slots are taken */
 struct mac_policy_ops {
        mpo_audit_check_postselect_t            *mpo_audit_check_postselect;
        mpo_audit_check_preselect_t             *mpo_audit_check_preselect;
@@ -6474,8 +6487,8 @@ struct mac_policy_ops {
        mpo_proc_check_setlcid_t                *mpo_proc_check_setlcid;
        mpo_proc_check_signal_t                 *mpo_proc_check_signal;
        mpo_proc_check_wait_t                   *mpo_proc_check_wait;
+       mpo_proc_check_dump_core_t              *mpo_proc_check_dump_core;
        mpo_reserved_hook_t                     *mpo_reserved5;
-       mpo_reserved_hook_t                     *mpo_reserved6;
 
        mpo_socket_check_accept_t               *mpo_socket_check_accept;
        mpo_socket_check_accepted_t             *mpo_socket_check_accepted;
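A loaded policy module would supply the new hook through the mpo_proc_check_dump_core slot shown above. A minimal sketch of a policy that denies every core dump, with the function and variable names as illustrative assumptions rather than anything in this commit:

    #include <sys/errno.h>
    #include <security/mac_policy.h>

    /* Hypothetical hook: deny all core dumps; return 0 to allow them. */
    static int
    example_proc_check_dump_core(struct proc *proc)
    {
            (void)proc;
            return EPERM;
    }

    static struct mac_policy_ops example_policy_ops = {
            .mpo_proc_check_dump_core = example_proc_check_dump_core,
    };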
index 603b7499c70bc577ad7ad9a496d3dc86d5f9e762..31d539af2a02b457c1fa3f34252a991bef32ba27 100644 (file)
@@ -327,6 +327,26 @@ mac_proc_check_debug(proc_t curp, struct proc *proc)
        return error;
 }
 
+int
+mac_proc_check_dump_core(struct proc *proc)
+{
+       int error;
+
+#if SECURITY_MAC_CHECK_ENFORCE
+       /* 21167099 - only check if we allow write */
+       if (!mac_proc_enforce) {
+               return 0;
+       }
+#endif
+       if (!mac_proc_check_enforce(proc)) {
+               return 0;
+       }
+
+       MAC_CHECK(proc_check_dump_core, proc);
+
+       return error;
+}
+
 int
 mac_proc_check_fork(proc_t curp)
 {
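For orientation, a hedged sketch of how a core-dump writer could consult the new mac_proc_check_dump_core() entry point before producing a core file; the surrounding variable names and control flow are assumptions, not the actual call site in this commit:

    /* Sketch only: skip the dump if any loaded policy denies it. */
    int error = mac_proc_check_dump_core(core_proc);
    if (error != 0) {
            return error;   /* propagate the policy's errno to the caller */
    }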
index c559c84d1aaf491abcfaf3c5a0468c713ded419e..610cecb15778556bdbf868eb7b65d20a20c88251 100644 (file)
@@ -38,6 +38,9 @@ CODESIGN_ALLOCATE:=$(shell xcrun -sdk "$(TARGETSDK)" -find codesign_allocate)
 
 atm_diagnostic_flag: OTHER_CFLAGS += drop_priv.c
 
+atm_diagnostic_flag_entitled: CODE_SIGN_ENTITLEMENTS = atm_diagnostic_flag.entitlements
+atm_diagnostic_flag_entitled: OTHER_CFLAGS += drop_priv.c
+
 testposixshm: INVALID_ARCHS = i386
 
 avx: INVALID_ARCHS = i386
index 0219301490b9abd67861906ef5eea9f5fbeaa966..88d314e7ac6412f9b7af894abc96621ce0415605 100644 (file)
@@ -52,9 +52,8 @@ _reset_atm_diagnostic_flag(void)
        }
 }
 
-T_DECL(toggle_atm_diagnostic_flag,
-    "change the atm_diagnostic_flag, which should use the commpage",
-    T_META_ASROOT(true))
+static void
+_toggle_atm_diagnostic_flag(void)
 {
        T_ATEND(_reset_atm_diagnostic_flag);
        uint32_t f = _save_atm_diagnostic_flag();
@@ -65,18 +64,21 @@ T_DECL(toggle_atm_diagnostic_flag,
                    "Ignoring host_set_atm_diagnostic_flag functionality. "
                    "Bailing gracefully.");
        }
-       T_EXPECT_MACH_SUCCESS(kr, "Set atm_diagnostic_flag");
+       T_EXPECT_MACH_ERROR(KERN_NO_ACCESS, kr,
+           "Deny change to atm_diagnostic_flag");
+}
+
+T_DECL(atm_diagnostic_flag_unentitled_privileged,
+    "expect to fail to set the atm_diagnostic_flag (unentitled, privileged)",
+    T_META_ASROOT(true))
+{
+       _toggle_atm_diagnostic_flag();
 }
 
-T_DECL(unprivileged_atm_diagnostic_flag,
-    "expect to fail to set the atm_diagnostic_flag",
+T_DECL(atm_diagnostic_flag_unentitled_unprivileged,
+    "expect to fail to set the atm_diagnostic_flag (unentitled, unprivileged)",
     T_META_ASROOT(false))
 {
        drop_priv();
-       T_ATEND(_reset_atm_diagnostic_flag);
-       uint32_t f = _save_atm_diagnostic_flag();
-       f ^= LIBTRACE_PRIVATE_DATA;
-       kern_return_t kr = _mutate_atm_diagnostic_flag(f);
-       T_EXPECT_MACH_ERROR(KERN_INVALID_ARGUMENT, kr,
-           "Deny change to atm_diagnostic_flag");
+       _toggle_atm_diagnostic_flag();
 }
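Taken together with the entitled variant added below, the tests now encode the expected policy: callers without the com.apple.private.set-atm-diagnostic-flag entitlement should get KERN_NO_ACCESS regardless of privilege, while entitled callers may toggle the flag. A hedged sketch of the kind of kernel-side gate this implies; the entitlement-check helper and function name are assumptions, not the actual implementation:

    /* Sketch only: refuse the set operation for unentitled callers. */
    kern_return_t
    example_set_atm_diagnostic_flag(uint32_t new_flag)
    {
            if (!caller_has_entitlement("com.apple.private.set-atm-diagnostic-flag")) {
                    return KERN_NO_ACCESS;  /* what the unentitled tests expect */
            }
            /* ... update the commpage-backed diagnostic flag ... */
            return KERN_SUCCESS;
    }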
diff --git a/tests/atm_diagnostic_flag.entitlements b/tests/atm_diagnostic_flag.entitlements
new file mode 100644 (file)
index 0000000..491a479
--- /dev/null
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+       <key>com.apple.private.set-atm-diagnostic-flag</key>
+       <true/>
+</dict>
+</plist>
diff --git a/tests/atm_diagnostic_flag_entitled.c b/tests/atm_diagnostic_flag_entitled.c
new file mode 100644 (file)
index 0000000..30235c3
--- /dev/null
@@ -0,0 +1,83 @@
+#include <darwintest.h>
+
+#include <mach/mach_error.h>
+#include <mach/mach_host.h>
+
+T_GLOBAL_META(T_META_NAMESPACE("xnu.debugging"));
+
+/*
+ * The low 8 bits may be in use, so modify one
+ * of the upper 8 bits to ensure round-tripping.
+ */
+#define LIBTRACE_PRIVATE_DATA  0x01000000
+
+extern void drop_priv(void);
+
+static bool _needs_reset;
+static uint32_t _original;
+
+static uint32_t
+_save_atm_diagnostic_flag(void)
+{
+       kern_return_t kr;
+       kr = host_get_atm_diagnostic_flag(mach_host_self(), &_original);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "host_get_atm_diagnostic_flag()");
+       T_LOG("Original ATM diagnostic flag: 0x%08x", _original);
+       return _original;
+}
+
+static kern_return_t
+_mutate_atm_diagnostic_flag(uint32_t v)
+{
+       T_LOG("Try to set ATM diagnostic flag to: 0x%08x", v);
+       kern_return_t kr = host_set_atm_diagnostic_flag(mach_host_self(), v);
+       if (kr == KERN_SUCCESS) {
+               _needs_reset = true;
+       }
+       return kr;
+}
+
+static void
+_reset_atm_diagnostic_flag(void)
+{
+       if (!_needs_reset) {
+               return;
+       }
+       T_LOG("Reset ATM diagnostic flag to: 0x%08x", _original);
+       kern_return_t kr;
+       kr = host_set_atm_diagnostic_flag(mach_host_self(), _original);
+       if (kr != KERN_SUCCESS) {
+               T_ASSERT_FAIL("host_set_atm_diagnostic_flag() failed: %s",
+                   mach_error_string(kr));
+       }
+}
+
+static void
+_toggle_atm_diagnostic_flag(void)
+{
+       T_ATEND(_reset_atm_diagnostic_flag);
+       uint32_t f = _save_atm_diagnostic_flag();
+       f ^= LIBTRACE_PRIVATE_DATA;
+       kern_return_t kr = _mutate_atm_diagnostic_flag(f);
+       if (kr == KERN_NOT_SUPPORTED) {
+               T_SKIP("Seems ATM is disabled on this platform. "
+                   "Ignoring host_set_atm_diagnostic_flag functionality. "
+                   "Bailing gracefully.");
+       }
+       T_EXPECT_MACH_SUCCESS(kr, "Set atm_diagnostic_flag");
+}
+
+T_DECL(atm_diagnostic_flag_entitled_privileged,
+    "change the atm_diagnostic_flag (entitled, privileged)",
+    T_META_ASROOT(true))
+{
+       _toggle_atm_diagnostic_flag();
+}
+
+T_DECL(atm_diagnostic_flag_entitled_unprivileged,
+    "change the atm_diagnostic_flag (entitled, unprivileged)",
+    T_META_ASROOT(false))
+{
+       drop_priv();
+       _toggle_atm_diagnostic_flag();
+}
diff --git a/tests/monotonic_uncore.c b/tests/monotonic_uncore.c
new file mode 100644 (file)
index 0000000..0274bbc
--- /dev/null
@@ -0,0 +1,418 @@
+/*
+ * Must come before including darwintest.h
+ */
+#ifdef T_NAMESPACE
+#undef T_NAMESPACE
+#endif /* defined(T_NAMESPACE) */
+
+#include <darwintest.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#ifndef PRIVATE
+/*
+ * Need new CPU families.
+ */
+#define PRIVATE
+#include <mach/machine.h>
+#undef PRIVATE
+#else /* !defined(PRIVATE) */
+#include <mach/machine.h>
+#endif /* defined(PRIVATE) */
+#include <stdint.h>
+#include <System/sys/guarded.h>
+#include <System/sys/monotonic.h>
+#include <sys/ioctl.h>
+#include <sys/sysctl.h>
+#include <unistd.h>
+
+T_GLOBAL_META(
+       T_META_NAMESPACE("xnu.monotonic"),
+       T_META_CHECK_LEAKS(false),
+       T_META_ENABLED(false)
+       );
+
+static bool
+device_supports_uncore(void)
+{
+       int r;
+       int type, subtype;
+       unsigned int family;
+       size_t size = sizeof(type);
+
+       /*
+        * Only arm64 Monsoon/Mistral and Vortex/Tempest devices support
+        * uncore counters.
+        */
+
+       r = sysctlbyname("hw.cputype", &type, &size, NULL, 0);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(r, "sysctlbyname(\"hw.cputype\")");
+       r = sysctlbyname("hw.cpusubtype", &subtype, &size, NULL, 0);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(r, "sysctlbyname(\"hw.cpusubtype\")");
+       r = sysctlbyname("hw.cpufamily", &family, &size, NULL, 0);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(r, "sysctlbyname(\"hw.cpufamily\")");
+
+       if (type == CPU_TYPE_ARM64 &&
+           subtype == CPU_SUBTYPE_ARM64_V8 &&
+           (family == CPUFAMILY_ARM_MONSOON_MISTRAL ||
+           family == CPUFAMILY_ARM_VORTEX_TEMPEST)) {
+               return true;
+       }
+
+       return false;
+}
+
+#define UNCORE_DEV_PATH "/dev/monotonic/uncore"
+
+static int
+open_uncore_error(int *error)
+{
+       guardid_t guard;
+       int fd;
+
+       guard = 0xa5adcafe;
+
+       T_SETUPBEGIN;
+
+       fd = guarded_open_np(UNCORE_DEV_PATH, &guard,
+           GUARD_CLOSE | GUARD_DUP | GUARD_WRITE, O_CLOEXEC | O_EXCL);
+       if (fd < 0 && errno == ENOENT) {
+               T_ASSERT_FALSE(device_supports_uncore(),
+                   "lack of dev node implies no uncore support");
+               T_SKIP("uncore counters are unsupported");
+               __builtin_unreachable();
+       }
+
+       if (error == NULL) {
+               T_ASSERT_POSIX_SUCCESS(fd, "open '%s'", UNCORE_DEV_PATH);
+       } else {
+               *error = errno;
+       }
+
+       T_SETUPEND;
+
+       return fd;
+}
+
+static void
+uncore_counts(int fd, uint64_t ctr_mask, uint64_t *counts)
+{
+       int r;
+       union monotonic_ctl_counts *cts_ctl;
+
+       cts_ctl = (union monotonic_ctl_counts *)counts;
+       cts_ctl->in.ctr_mask = ctr_mask;
+
+       r = ioctl(fd, MT_IOC_COUNTS, cts_ctl);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(r, "MT_IOC_COUNTS got counter values");
+}
+
+#define REF_TIMEBASE_EVENT 0x3
+#define CTRS_MAX 32
+
+T_DECL(uncore_max_counters,
+    "ensure that the maximum number of uncore countes is sane",
+    T_META_ASROOT(true))
+{
+       int nctrs = 0;
+       int fd;
+
+       fd = open_uncore_error(NULL);
+
+       do {
+               union monotonic_ctl_add add_ctl;
+               int r;
+
+               add_ctl.in.config.event = REF_TIMEBASE_EVENT;
+               add_ctl.in.config.allowed_ctr_mask = UINT64_MAX;
+
+               r = ioctl(fd, MT_IOC_ADD, &add_ctl);
+               if (r < 0 && errno == E2BIG) {
+                       break;
+               }
+
+               T_QUIET;
+               T_ASSERT_POSIX_SUCCESS(r, "added reference timebase event to counters");
+               nctrs++;
+       } while (nctrs < CTRS_MAX);
+
+       T_EXPECT_LT(nctrs, CTRS_MAX,
+           "only able to allocate a reasonable number of counters");
+}
+
+static uint32_t
+uncore_add(int fd, uint64_t event, uint64_t allowed_ctrs, int error)
+{
+       int save_errno;
+       int r;
+       uint32_t ctr;
+       union monotonic_ctl_add add_ctl;
+
+       add_ctl.in.config.event = event;
+       add_ctl.in.config.allowed_ctr_mask = allowed_ctrs;
+       r = ioctl(fd, MT_IOC_ADD, &add_ctl);
+       if (error) {
+               save_errno = errno;
+               T_EXPECT_LT(r, 0, "adding event to counter should fail");
+               T_EXPECT_EQ(save_errno, error,
+                   "adding event to counter should fail with %d: %s",
+                   error, strerror(error));
+               return UINT32_MAX;
+       } else {
+               T_QUIET;
+               T_ASSERT_POSIX_SUCCESS(r,
+                   "added event %#" PRIx64 " to counters", event);
+       }
+
+       ctr = add_ctl.out.ctr;
+       T_QUIET; T_ASSERT_LT(ctr, (uint32_t)CTRS_MAX, "counter returned should be sane");
+       return ctr;
+}
+
+T_DECL(uncore_collision,
+    "ensure that trying to add an event on the same counter fails",
+    T_META_ASROOT(true))
+{
+       int fd;
+       uint32_t ctr;
+
+       fd = open_uncore_error(NULL);
+
+       ctr = uncore_add(fd, REF_TIMEBASE_EVENT, UINT64_MAX, 0);
+       T_LOG("added event to uncore counter %d\n", ctr);
+
+       (void)uncore_add(fd, REF_TIMEBASE_EVENT, UINT64_C(1) << ctr, ENOSPC);
+}
+
+static void
+uncore_enable(int fd)
+{
+       union monotonic_ctl_enable en_ctl = {
+               .in = { .enable = true }
+       };
+
+       T_ASSERT_POSIX_SUCCESS(ioctl(fd, MT_IOC_ENABLE, &en_ctl),
+           "enabling counters");
+}
+
+T_DECL(uncore_enabled_busy,
+    "ensure that trying to add an event while enabled fails",
+    T_META_ASROOT(true))
+{
+       int fd;
+
+       fd = open_uncore_error(NULL);
+
+       (void)uncore_add(fd, REF_TIMEBASE_EVENT, UINT64_MAX, 0);
+
+       uncore_enable(fd);
+       (void)uncore_add(fd, REF_TIMEBASE_EVENT, UINT64_MAX, EBUSY);
+}
+
+T_DECL(uncore_reset,
+    "ensure that resetting the counters works")
+{
+       int fd;
+       int r;
+
+       fd = open_uncore_error(NULL);
+
+       (void)uncore_add(fd, REF_TIMEBASE_EVENT, UINT64_C(1), 0);
+       (void)uncore_add(fd, REF_TIMEBASE_EVENT, UINT64_C(1), ENOSPC);
+
+       r = ioctl(fd, MT_IOC_RESET);
+       T_ASSERT_POSIX_SUCCESS(r, "resetting succeeds");
+
+       T_LOG("adding event to same counter after reset");
+       (void)uncore_add(fd, REF_TIMEBASE_EVENT, UINT64_C(1), 0);
+}
+
+#define SLEEP_USECS (500 * 1000)
+
+static int
+uncore_add_all(int fd, uint64_t event, int *nmonitors)
+{
+       int nctrs = 0;
+       int r;
+
+       do {
+               union monotonic_ctl_add add_ctl;
+
+               add_ctl.in.config.event = event;
+               add_ctl.in.config.allowed_ctr_mask = UINT64_MAX;
+
+               r = ioctl(fd, MT_IOC_ADD, &add_ctl);
+               if (r < 0 && errno == E2BIG) {
+                       break;
+               }
+
+               T_QUIET;
+               T_ASSERT_POSIX_SUCCESS(r, "added event %#" PRIx64 " to counters",
+                   event);
+               nctrs++;
+       } while (nctrs < CTRS_MAX);
+
+       if (nmonitors) {
+               union monotonic_ctl_info info_ctl;
+               r = ioctl(fd, MT_IOC_GET_INFO, &info_ctl);
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(r, "got info about uncore counters");
+
+               *nmonitors = (int)info_ctl.out.nmonitors;
+       }
+
+       return nctrs;
+}
+
+T_DECL(uncore_accuracy,
+    "ensure that the uncore counters count accurately",
+    T_META_ASROOT(true))
+{
+       int fd;
+       int nctrs = 0;
+       int nmonitors = 0;
+       uint64_t ctr_mask;
+       uint64_t counts[2][CTRS_MAX];
+       uint64_t times[2];
+
+       fd = open_uncore_error(NULL);
+
+       /*
+        * The reference timebase event counts the same as mach_continuous_time
+        * (on hardware supporting uncore counters).  Make sure that the counter
+        * is close to the values returned from the trap.
+        *
+        * Fill all the counters with this event.
+        */
+       nctrs = uncore_add_all(fd, REF_TIMEBASE_EVENT, &nmonitors);
+       ctr_mask = (UINT64_C(1) << nctrs) - 1;
+
+       T_LOG("added %d counters to check", nctrs);
+
+       uncore_enable(fd);
+
+       /*
+        * First, make sure there's an upper bound on the counter -- take the
+        * time around getting the counter values.
+        */
+
+       times[0] = mach_absolute_time();
+       uncore_counts(fd, ctr_mask, counts[0]);
+
+       usleep(SLEEP_USECS);
+
+       uncore_counts(fd, ctr_mask, counts[1]);
+       times[1] = mach_absolute_time();
+
+       T_QUIET; T_EXPECT_GT(times[1], times[0],
+           "mach_absolute_time is monotonically increasing");
+       for (int i = 0; i < nctrs; i++) {
+               T_EXPECT_GT(counts[1][i], counts[0][i],
+                   "uncore counter %d value is monotonically increasing", i);
+               T_EXPECT_LT(counts[1][i] - counts[0][i], times[1] - times[0],
+                   "reference timebase on uncore counter %d satisfies upper bound "
+                   "from mach_absolute_time", i);
+       }
+
+       /*
+        * Next, the lower bound -- put mach_absolute_time inside getting the
+        * counter values.
+        */
+
+       uncore_counts(fd, ctr_mask, counts[0]);
+       times[0] = mach_absolute_time();
+
+       volatile int iterations = 100000;
+       while (iterations--) {
+               ;
+       }
+
+       times[1] = mach_absolute_time();
+       uncore_counts(fd, ctr_mask, counts[1]);
+
+       for (int mon = 0; mon < nmonitors; mon++) {
+               for (int i = 0; i < nctrs; i++) {
+                       T_QUIET;
+                       T_EXPECT_GT(counts[1][i * mon], counts[0][i * mon],
+                           "uncore %d counter %d value is monotonically increasing",
+                           mon, i);
+                       T_EXPECT_GT(counts[1][i * mon] - counts[0][i * mon],
+                           times[1] - times[0],
+                           "reference timebase on uncore %d counter %d satisfies "
+                           "lower bound from mach_absolute_time", mon, i);
+               }
+       }
+}
+
+T_DECL(uncore_ownership,
+    "ensure the dev node cannot be open in two places",
+    T_META_ASROOT(true))
+{
+       int fd;
+       int other_fd;
+       int error;
+
+       fd = open_uncore_error(NULL);
+
+       other_fd = open_uncore_error(&error);
+       T_ASSERT_LT(other_fd, 0, "opening a second uncore fd should fail");
+       T_ASSERT_EQ(error, EBUSY, "failure should be EBUSY");
+}
+
+T_DECL(uncore_root_required,
+    "ensure the dev node cannot be opened by non-root users",
+    T_META_ASROOT(false))
+{
+       int fd;
+       int error = 0;
+
+       T_SKIP("libdarwintest doesn't drop privileges properly");
+
+       fd = open_uncore_error(&error);
+       T_ASSERT_LT(fd, 0, "opening dev node should not return an fd");
+       T_ASSERT_EQ(error, EPERM,
+           "opening dev node as non-root user should fail with EPERM");
+}
+
+T_DECL(perf_uncore,
+    "measure the latency of accessing the counters",
+    T_META_TAG_PERF)
+{
+       int fd;
+       int nctrs;
+       int nmonitors;
+       int r;
+       uint64_t ctr_mask;
+       dt_stat_thread_instructions_t counts_instrs;
+       dt_stat_t counter_deltas;
+
+       counts_instrs = dt_stat_thread_instructions_create("ioctl_counts");
+       counter_deltas = dt_stat_create("abs_time", "between_each_counter");
+
+       fd = open_uncore_error(NULL);
+
+       nctrs = uncore_add_all(fd, REF_TIMEBASE_EVENT, &nmonitors);
+       ctr_mask = (UINT64_C(1) << nctrs) - 1;
+
+       uncore_enable(fd);
+
+       do {
+               dt_stat_token token;
+               uint64_t counts[nctrs * nmonitors];
+               union monotonic_ctl_counts *cts_ctl;
+
+               cts_ctl = (union monotonic_ctl_counts *)counts;
+               cts_ctl->in.ctr_mask = ctr_mask;
+
+               token = dt_stat_thread_instructions_begin(counts_instrs);
+               r = ioctl(fd, MT_IOC_COUNTS, cts_ctl);
+               dt_stat_thread_instructions_end(counts_instrs, token);
+               T_QUIET;
+               T_ASSERT_POSIX_SUCCESS(r,
+                   "getting uncore counter values %#" PRIx64, ctr_mask);
+
+               for (int i = 0; i < (nctrs - 1); i++) {
+                       dt_stat_add(counter_deltas, (double)(counts[i + 1] - counts[i]));
+               }
+       } while (!dt_stat_stable(counts_instrs) || !dt_stat_stable(counter_deltas));
+
+       dt_stat_finalize(counts_instrs);
+       dt_stat_finalize(counter_deltas);
+}