]> git.saurik.com Git - apple/xnu.git/blobdiff - osfmk/i386/pmCPU.c
xnu-7195.101.1.tar.gz
[apple/xnu.git] / osfmk / i386 / pmCPU.c
index 63108e188c57638969d83b5b5da9655eff570c57..038ae1313cdedd19d329b15af6f78c9da308f31f 100644 (file)
@@ -1,8 +1,8 @@
 /*
- * Copyright (c) 2004-2006 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2004-2011 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
+ *
  * This file contains Original Code and/or Modifications of Original Code
  * as defined in and that are subject to the Apple Public Source License
  * Version 2.0 (the 'License'). You may not use this file except in
  * unlawful or unlicensed copies of an Apple operating system, or to
  * circumvent, violate, or enable the circumvention or violation of, any
  * terms of an Apple operating system software license agreement.
- * 
+ *
  * Please obtain a copy of the License at
  * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
+ *
  * The Original Code and all software distributed under the License are
  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
@@ -22,7 +22,7 @@
  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  * Please see the License for the specific language governing rights and
  * limitations under the License.
- * 
+ *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 
  *
  * Implements the "wrappers" to the KEXT.
  */
-#include <i386/machine_routines.h>
-#include <i386/machine_cpu.h>
-#include <i386/misc_protos.h>
-#include <i386/pmap.h>
 #include <i386/asm.h>
+#include <i386/machine_cpu.h>
 #include <i386/mp.h>
+#include <i386/machine_routines.h>
 #include <i386/proc_reg.h>
+#include <i386/pmap.h>
+#include <i386/misc_protos.h>
+#include <kern/machine.h>
 #include <kern/pms.h>
 #include <kern/processor.h>
+#include <kern/timer_queue.h>
+#include <i386/cpu_threads.h>
 #include <i386/pmCPU.h>
 #include <i386/cpuid.h>
-#include <i386/rtclock.h>
-#if MACH_KDB
-#include <i386/db_machdep.h>
-#include <ddb/db_aout.h>
-#include <ddb/db_access.h>
-#include <ddb/db_sym.h>
-#include <ddb/db_variables.h>
-#include <ddb/db_command.h>
-#include <ddb/db_output.h>
-#include <ddb/db_expr.h>
-#endif
+#include <i386/rtclock_protos.h>
+#include <kern/sched_prim.h>
+#include <i386/lapic.h>
+#include <i386/pal_routines.h>
+#include <sys/kdebug.h>
+#include <i386/tsc.h>
+
+#include <kern/sched_urgency.h>
 
 extern int disableConsoleOutput;
 
-decl_simple_lock_data(,pm_init_lock);
+#define DELAY_UNSET             0xFFFFFFFFFFFFFFFFULL
+
+uint64_t cpu_itime_bins[CPU_ITIME_BINS] = {16 * NSEC_PER_USEC, 32 * NSEC_PER_USEC, 64 * NSEC_PER_USEC, 128 * NSEC_PER_USEC, 256 * NSEC_PER_USEC, 512 * NSEC_PER_USEC, 1024 * NSEC_PER_USEC, 2048 * NSEC_PER_USEC, 4096 * NSEC_PER_USEC, 8192 * NSEC_PER_USEC, 16384 * NSEC_PER_USEC, 32768 * NSEC_PER_USEC};
+uint64_t *cpu_rtime_bins = &cpu_itime_bins[0];
 
 /*
  * The following is set when the KEXT loads and initializes.
  */
-pmDispatch_t   *pmDispatch     = NULL;
+pmDispatch_t    *pmDispatch     = NULL;
 
-/*
- * Current power management states (for use until KEXT is loaded).
- */
-static pmInitState_t   pmInitState;
+uint32_t                pmInitDone              = 0;
+static boolean_t        earlyTopology           = FALSE;
+static uint64_t         earlyMaxBusDelay        = DELAY_UNSET;
+static uint64_t         earlyMaxIntDelay        = DELAY_UNSET;
 
 /*
- * Nap control variables:
+ * Initialize the Cstate change code.
  */
-uint32_t napCtl = 0;                   /* Defaults to neither napping
-                                          nor halting */
-uint32_t forcenap = 0;                 /* Force nap (fn) boot-arg controls */
-uint32_t maxBusDelay = 0xFFFFFFFF;     /* Maximum memory bus delay that
-                                          I/O devices can tolerate
-                                          before errors (nanoseconds) */
-uint32_t C4C2SnoopDelay = 0;           /* C4 to C2 transition time -
-                                          time before a C4 system
-                                          can snoop (nanoseconds) */
+void
+power_management_init(void)
+{
+       if (pmDispatch != NULL && pmDispatch->cstateInit != NULL) {
+               (*pmDispatch->cstateInit)();
+       }
+}
+
+static inline void
+machine_classify_interval(uint64_t interval, uint64_t *bins, uint64_t *binvals, uint32_t nbins)
+{
+       uint32_t i;
+       for (i = 0; i < nbins; i++) {
+               if (interval < binvals[i]) {
+                       bins[i]++;
+                       break;
+               }
+       }
+}
+
+uint64_t        idle_pending_timers_processed;
+uint32_t        idle_entry_timer_processing_hdeadline_threshold = 5000000;
 
 /*
- * We are being asked to set PState (sel).
+ * Called when the CPU is idle.  It calls into the power management kext
+ * to determine the best way to idle the CPU.
  */
 void
-pmsCPUSet(uint32_t sel)
+machine_idle(void)
 {
-    if (pmDispatch != NULL && pmDispatch->pmsCPUSet != NULL)
-       (*pmDispatch->pmsCPUSet)(sel);
-    else
-       pmInitState.PState = sel;
+       cpu_data_t              *my_cpu         = current_cpu_datap();
+       __unused uint32_t       cnum = my_cpu->cpu_number;
+       uint64_t                ctime, rtime, itime;
+#if CST_DEMOTION_DEBUG
+       processor_t             cproc = my_cpu->cpu_processor;
+       uint64_t                cwakeups = my_cpu->cpu_wakeups_issued_total;
+#endif /* CST_DEMOTION_DEBUG */
+       uint64_t esdeadline, ehdeadline;
+       boolean_t do_process_pending_timers = FALSE;
+
+       ctime = mach_absolute_time();
+       esdeadline = my_cpu->rtclock_timer.queue.earliest_soft_deadline;
+       ehdeadline = my_cpu->rtclock_timer.deadline;
+/* Determine if pending timers exist */
+       if ((ctime >= esdeadline) && (ctime < ehdeadline) &&
+           ((ehdeadline - ctime) < idle_entry_timer_processing_hdeadline_threshold)) {
+               idle_pending_timers_processed++;
+               do_process_pending_timers = TRUE;
+               goto machine_idle_exit;
+       } else {
+               TCOAL_DEBUG(0xCCCC0000, ctime, my_cpu->rtclock_timer.queue.earliest_soft_deadline, my_cpu->rtclock_timer.deadline, idle_pending_timers_processed, 0);
+       }
+
+       my_cpu->lcpu.state = LCPU_IDLE;
+       DBGLOG(cpu_handle, cpu_number(), MP_IDLE);
+       MARK_CPU_IDLE(cnum);
+
+       rtime = ctime - my_cpu->cpu_ixtime;
+
+       my_cpu->cpu_rtime_total += rtime;
+       machine_classify_interval(rtime, &my_cpu->cpu_rtimes[0], &cpu_rtime_bins[0], CPU_RTIME_BINS);
+#if CST_DEMOTION_DEBUG
+       uint32_t cl = 0, ch = 0;
+       uint64_t c3res, c6res, c7res;
+       rdmsr_carefully(MSR_IA32_CORE_C3_RESIDENCY, &cl, &ch);
+       c3res = ((uint64_t)ch << 32) | cl;
+       rdmsr_carefully(MSR_IA32_CORE_C6_RESIDENCY, &cl, &ch);
+       c6res = ((uint64_t)ch << 32) | cl;
+       rdmsr_carefully(MSR_IA32_CORE_C7_RESIDENCY, &cl, &ch);
+       c7res = ((uint64_t)ch << 32) | cl;
+#endif
+
+       if (pmInitDone) {
+               /*
+                * Handle the case where ml_set_maxbusdelay() or ml_set_maxintdelay()
+                * was called prior to the CPU PM kext being registered.  We do
+                * this here because idle is where the decisions using these values
+                * are made, so this is the first point at which they are used.
+                */
+               if (earlyMaxBusDelay != DELAY_UNSET) {
+                       ml_set_maxbusdelay((uint32_t)(earlyMaxBusDelay & 0xFFFFFFFF));
+               }
+               if (earlyMaxIntDelay != DELAY_UNSET) {
+                       ml_set_maxintdelay(earlyMaxIntDelay);
+               }
+       }
+
+       if (pmInitDone
+           && pmDispatch != NULL
+           && pmDispatch->MachineIdle != NULL) {
+               (*pmDispatch->MachineIdle)(0x7FFFFFFFFFFFFFFFULL);
+       } else {
+               /*
+                * If no power management, re-enable interrupts and halt.
+                * This will keep the CPU from spinning through the scheduler
+                * and will allow at least some minimal power savings (but it
+                * may cause problems in some MP configurations w.r.t. the APIC
+                * stopping during a GV3 transition).
+                */
+               pal_hlt();
+               /* Once woken, re-disable interrupts. */
+               pal_cli();
+       }
+
+       /*
+        * Mark the CPU as running again.
+        */
+       MARK_CPU_ACTIVE(cnum);
+       DBGLOG(cpu_handle, cnum, MP_UNIDLE);
+       my_cpu->lcpu.state = LCPU_RUN;
+       uint64_t ixtime = my_cpu->cpu_ixtime = mach_absolute_time();
+       itime = ixtime - ctime;
+       my_cpu->cpu_idle_exits++;
+       my_cpu->cpu_itime_total += itime;
+       machine_classify_interval(itime, &my_cpu->cpu_itimes[0], &cpu_itime_bins[0], CPU_ITIME_BINS);
+#if CST_DEMOTION_DEBUG
+       cl = ch = 0;
+       rdmsr_carefully(MSR_IA32_CORE_C3_RESIDENCY, &cl, &ch);
+       c3res = (((uint64_t)ch << 32) | cl) - c3res;
+       rdmsr_carefully(MSR_IA32_CORE_C6_RESIDENCY, &cl, &ch);
+       c6res = (((uint64_t)ch << 32) | cl) - c6res;
+       rdmsr_carefully(MSR_IA32_CORE_C7_RESIDENCY, &cl, &ch);
+       c7res = (((uint64_t)ch << 32) | cl) - c7res;
+
+       uint64_t ndelta = itime - tmrCvt(c3res + c6res + c7res, tscFCvtt2n);
+       KERNEL_DEBUG_CONSTANT(0xcead0000, ndelta, itime, c7res, c6res, c3res);
+       if ((itime > 1000000) && (ndelta > 250000)) {
+               KERNEL_DEBUG_CONSTANT(0xceae0000, ndelta, itime, c7res, c6res, c3res);
+       }
+#endif
+
+machine_idle_exit:
+       /*
+        * Re-enable interrupts.
+        */
+
+       pal_sti();
+
+       if (do_process_pending_timers) {
+               TCOAL_DEBUG(0xBBBB0000 | DBG_FUNC_START, ctime, esdeadline, ehdeadline, idle_pending_timers_processed, 0);
+
+               /* Adjust to reflect that this isn't truly a package idle exit */
+               __sync_fetch_and_sub(&my_cpu->lcpu.package->num_idle, 1);
+               lapic_timer_swi(); /* Trigger software timer interrupt */
+               __sync_fetch_and_add(&my_cpu->lcpu.package->num_idle, 1);
+
+               TCOAL_DEBUG(0xBBBB0000 | DBG_FUNC_END, ctime, esdeadline, idle_pending_timers_processed, 0, 0);
+       }
+#if CST_DEMOTION_DEBUG
+       uint64_t nwakeups = my_cpu->cpu_wakeups_issued_total;
+
+       if ((nwakeups == cwakeups) && (topoParms.nLThreadsPerPackage == my_cpu->lcpu.package->num_idle)) {
+               KERNEL_DEBUG_CONSTANT(0xceaa0000, cwakeups, 0, 0, 0, 0);
+       }
+#endif
 }
 
 /*
- * This code configures the initial step tables.  It should be called after
- * the timebase frequency is initialized.
- *
- * Note that this is not used in normal operation.  It is strictly for
- * debugging/testing purposes.
+ * Called when the CPU is to be halted.  It will choose the best C-State
+ * to be in.
  */
 void
-pmsCPUConf(void)
+pmCPUHalt(uint32_t reason)
 {
+       cpu_data_t  *cpup   = current_cpu_datap();
+
+       switch (reason) {
+       case PM_HALT_DEBUG:
+               cpup->lcpu.state = LCPU_PAUSE;
+               pal_stop_cpu(FALSE);
+               break;
+
+       case PM_HALT_PANIC:
+               cpup->lcpu.state = LCPU_PAUSE;
+               pal_stop_cpu(TRUE);
+               break;
+
+       case PM_HALT_NORMAL:
+       case PM_HALT_SLEEP:
+       default:
+               pal_cli();
+
+               if (pmInitDone
+                   && pmDispatch != NULL
+                   && pmDispatch->pmCPUHalt != NULL) {
+                       /*
+                        * Halt the CPU (and put it in a low power state).
+                        */
+                       (*pmDispatch->pmCPUHalt)();
 
-    if (pmDispatch != NULL && pmDispatch->pmsCPUConf != NULL)
-       (*pmDispatch->pmsCPUConf)();
+                       /*
+                        * We've exited halt, so get the CPU schedulable again.
+                        * - by calling the fast init routine for a slave, or
+                        * - by returning if we're the master processor.
+                        */
+                       if (cpup->cpu_number != master_cpu) {
+                               i386_init_slave_fast();
+                               panic("init_slave_fast returned");
+                       }
+               } else {
+                       /*
+                        * If no power management and a processor is taken off-line,
+                        * then invalidate the cache and halt it (it will not be able
+                        * to be brought back on-line without resetting the CPU).
+                        */
+                       __asm__ volatile ("wbinvd");
+                       cpup->lcpu.state = LCPU_HALT;
+                       pal_stop_cpu(FALSE);
+
+                       panic("back from Halt");
+               }
+
+               break;
+       }
+}
+
+void
+pmMarkAllCPUsOff(void)
+{
+       if (pmInitDone
+           && pmDispatch != NULL
+           && pmDispatch->markAllCPUsOff != NULL) {
+               (*pmDispatch->markAllCPUsOff)();
+       }
+}
+
+static void
+pmInitComplete(void)
+{
+       if (earlyTopology
+           && pmDispatch != NULL
+           && pmDispatch->pmCPUStateInit != NULL) {
+               (*pmDispatch->pmCPUStateInit)();
+               earlyTopology = FALSE;
+       }
+       pmInitDone = 1;
+}
+
+x86_lcpu_t *
+pmGetLogicalCPU(int cpu)
+{
+       return cpu_to_lcpu(cpu);
+}
+
+x86_lcpu_t *
+pmGetMyLogicalCPU(void)
+{
+       cpu_data_t  *cpup   = current_cpu_datap();
+
+       return &cpup->lcpu;
+}
+
+static x86_core_t *
+pmGetCore(int cpu)
+{
+       return cpu_to_core(cpu);
+}
+
+static x86_core_t *
+pmGetMyCore(void)
+{
+       cpu_data_t  *cpup   = current_cpu_datap();
+
+       return cpup->lcpu.core;
+}
+
+static x86_die_t *
+pmGetDie(int cpu)
+{
+       return cpu_to_die(cpu);
+}
+
+static x86_die_t *
+pmGetMyDie(void)
+{
+       cpu_data_t  *cpup   = current_cpu_datap();
+
+       return cpup->lcpu.die;
+}
+
+static x86_pkg_t *
+pmGetPackage(int cpu)
+{
+       return cpu_to_package(cpu);
+}
+
+static x86_pkg_t *
+pmGetMyPackage(void)
+{
+       cpu_data_t  *cpup   = current_cpu_datap();
+
+       return cpup->lcpu.package;
+}
+
+static void
+pmLockCPUTopology(int lock)
+{
+       if (lock) {
+               mp_safe_spin_lock(&x86_topo_lock);
+       } else {
+               simple_unlock(&x86_topo_lock);
+       }
 }
 
 /*
- * Machine-dependent initialization.
+ * Called to get the next deadline that has been set by the
+ * power management code.
+ * Note: a return of 0 from AICPM and this routine signifies
+ * that no deadline is set.
  */
-void
-pmsCPUMachineInit(void)
+uint64_t
+pmCPUGetDeadline(cpu_data_t *cpu)
 {
-    /*
-     * Initialize some of the initial state to "uninitialized" until
-     * it gets set with something more useful.  This allows the KEXT
-     * to determine if the initial value was actually set to something.
-     */
-    pmInitState.PState = -1;
-    pmInitState.PLimit = -1;
+       uint64_t    deadline        = 0;
+
+       if (pmInitDone
+           && pmDispatch != NULL
+           && pmDispatch->GetDeadline != NULL) {
+               deadline = (*pmDispatch->GetDeadline)(&cpu->lcpu);
+       }
 
-    if (pmDispatch != NULL && pmDispatch->pmsCPUMachineInit != NULL)
-       (*pmDispatch->pmsCPUMachineInit)();
+       return deadline;
 }
 
 /*
- * This function should be called once for each processor to force the
- * processor to the correct initial voltage and frequency.
+ * Called to determine if the supplied deadline or the power management
+ * deadline is sooner.  Returns whichever one is first.
  */
-void
-pmsCPUInit(void)
+
+uint64_t
+pmCPUSetDeadline(cpu_data_t *cpu, uint64_t deadline)
 {
-    pmsCPUMachineInit();
-    if (pmDispatch != NULL && pmDispatch->pmsCPUInit != NULL)
-       (*pmDispatch->pmsCPUInit)();
+       if (pmInitDone
+           && pmDispatch != NULL
+           && pmDispatch->SetDeadline != NULL) {
+               deadline = (*pmDispatch->SetDeadline)(&cpu->lcpu, deadline);
+       }
+
+       return deadline;
 }
 
 /*
- * Broadcast a change to all processing including ourselves.
+ * Called when a power management deadline expires.
  */
 void
-pmsCPURun(uint32_t nstep)
+pmCPUDeadline(cpu_data_t *cpu)
 {
-    if (pmDispatch != NULL && pmDispatch->pmsCPURun != NULL)
-       (*pmDispatch->pmsCPURun)(nstep);
+       if (pmInitDone
+           && pmDispatch != NULL
+           && pmDispatch->Deadline != NULL) {
+               (*pmDispatch->Deadline)(&cpu->lcpu);
+       }
 }
 
 /*
- * Return the current state of a core.
+ * Called to get a CPU out of idle.
  */
-uint32_t
-pmsCPUQuery(void)
+boolean_t
+pmCPUExitIdle(cpu_data_t *cpu)
 {
-    if (pmDispatch != NULL && pmDispatch->pmsCPUQuery != NULL)
-       return((*pmDispatch->pmsCPUQuery)());
+       boolean_t           do_ipi;
+
+       if (pmInitDone
+           && pmDispatch != NULL
+           && pmDispatch->exitIdle != NULL) {
+               do_ipi = (*pmDispatch->exitIdle)(&cpu->lcpu);
+       } else {
+               do_ipi = TRUE;
+       }
 
-    /*
-     * Return a non-sense value.
-     */
-    return((~0) << 16);
+       return do_ipi;
 }
 
-/*
- * Return the current state of the package.
- */
-uint32_t
-pmsCPUPackageQuery(void)
+kern_return_t
+pmCPUExitHalt(int cpu)
+{
+       kern_return_t       rc      = KERN_INVALID_ARGUMENT;
+
+       if (pmInitDone
+           && pmDispatch != NULL
+           && pmDispatch->exitHalt != NULL) {
+               rc = pmDispatch->exitHalt(cpu_to_lcpu(cpu));
+       }
+
+       return rc;
+}
+
+kern_return_t
+pmCPUExitHaltToOff(int cpu)
 {
-    if (pmDispatch != NULL && pmDispatch->pmsCPUPackageQuery != NULL)
-       return((*pmDispatch->pmsCPUPackageQuery)());
+       kern_return_t       rc      = KERN_SUCCESS;
+
+       if (pmInitDone
+           && pmDispatch != NULL
+           && pmDispatch->exitHaltToOff != NULL) {
+               rc = pmDispatch->exitHaltToOff(cpu_to_lcpu(cpu));
+       }
 
-    /*
-     * Return a non-sense value.
-     */
-    return((~0) << 16);
+       return rc;
 }
 
 /*
- * Force the CPU package to the lowest power level.  This is a low-level
- * interface meant to be called from the panic or debugger code to bring
- * the CPU to a safe power level for unmanaged operation.
- *
- * Note that while this will bring an entire package to a safe level, it
- * cannot affect other packages.  As a general rule, this should be run on
- * every code as part of entering the debugger or on the panic path.
+ * Called to initialize the power management structures for the CPUs.
+ *
+ * If the KEXT has already registered a pmCPUStateInit hook, it is invoked
+ * directly.  Otherwise earlyTopology is set, which makes pmInitComplete()
+ * invoke the hook later (provided one has been registered by then).
  */
 void
-pmsCPUYellowFlag(void)
+pmCPUStateInit(void)
 {
-    if (pmDispatch != NULL && pmDispatch->pmsCPUYellowFlag != NULL)
-       (*pmDispatch->pmsCPUYellowFlag)();
+       if (pmDispatch != NULL && pmDispatch->pmCPUStateInit != NULL) {
+               (*pmDispatch->pmCPUStateInit)();
+       } else {
+               earlyTopology = TRUE;
+       }
 }
 
 /*
- * Restore the CPU to the power state it was in before a yellow flag.
+ * Called when a CPU is being restarted after being powered off (as in S3).
+ *
+ * When power management is initialized and a markCPURunning hook is
+ * registered, the hook is handed the logical CPU of "cpu".  Otherwise the
+ * logical CPU state is simply set to LCPU_RUN.
+ *
+ * NOTE(review): the fallback path updates the lcpu of the CPU we are
+ * currently running on (current_cpu_datap()), not the lcpu of the "cpu"
+ * argument - confirm that callers only ever pass the current CPU.
  */
 void
-pmsCPUGreenFlag(void)
+pmCPUMarkRunning(cpu_data_t *cpu)
 {
-    if (pmDispatch != NULL && pmDispatch->pmsCPUGreenFlag != NULL)
-       (*pmDispatch->pmsCPUGreenFlag)();
+       cpu_data_t  *cpup   = current_cpu_datap();
+
+       if (pmInitDone
+           && pmDispatch != NULL
+           && pmDispatch->markCPURunning != NULL) {
+               (*pmDispatch->markCPURunning)(&cpu->lcpu);
+       } else {
+               cpup->lcpu.state = LCPU_RUN;
+       }
 }
 
 /*
- * Load a new ratio/VID table.
- *
- * Note that this interface is specific to the Intel SpeedStep implementation.
- * It is expected that this will only be called once to override the default
- * ratio/VID table when the platform starts.
- *
- * Normally, the table will need to be replaced at the same time that the
- * stepper program proper is replaced, as the PState indices from an old
- * program may no longer be valid.  When replacing the default program this
- * should not be a problem as any new table will have at least two PState
- * entries and the default program only references P0 and P1.
+ * Called to get/set CPU power management state.
  */
-kern_return_t
-pmsCPULoadVIDTable(uint16_t *tablep, int nstates)
+int
+pmCPUControl(uint32_t cmd, void *datap)
 {
-    if (pmDispatch != NULL && pmDispatch->pmsCPULoadVIDTable != NULL)
-       return((*pmDispatch->pmsCPULoadVIDTable)(tablep, nstates));
-    else {
-       int     i;
+       int         rc      = -1;
 
-       if (nstates > MAX_PSTATES)
-           return(KERN_FAILURE);
+       if (pmDispatch != NULL
+           && pmDispatch->pmCPUControl != NULL) {
+               rc = (*pmDispatch->pmCPUControl)(cmd, datap);
+       }
 
-       for (i = 0; i < nstates; i += 1)
-           pmInitState.VIDTable[i] = tablep[i];
-    }
-    return(KERN_SUCCESS);
+       return rc;
 }
 
 /*
- * Set the (global) PState limit.  CPUs will not be permitted to run at
- * a lower (more performant) PState than this.
+ * Called to save the timer state used by power management prior
+ * to "sleeping".
  */
-kern_return_t
-pmsCPUSetPStateLimit(uint32_t limit)
+void
+pmTimerSave(void)
 {
-    if (pmDispatch != NULL && pmDispatch->pmsCPUSetPStateLimit != NULL)
-       return((*pmDispatch->pmsCPUSetPStateLimit)(limit));
+       if (pmDispatch != NULL
+           && pmDispatch->pmTimerStateSave != NULL) {
+               (*pmDispatch->pmTimerStateSave)();
+       }
+}
 
-    pmInitState.PLimit = limit;
-    return(KERN_SUCCESS);
+/*
+ * Called to restore the timer state used by power management after
+ * waking from "sleep".
+ */
+void
+pmTimerRestore(void)
+{
+       if (pmDispatch != NULL
+           && pmDispatch->pmTimerStateRestore != NULL) {
+               (*pmDispatch->pmTimerStateRestore)();
+       }
 }
 
 /*
- * Initialize the Cstate change code.
+ * Set the worst-case time for the C4 to C2 transition.
+ * No longer does anything.
  */
 void
-power_management_init(void)
+ml_set_maxsnoop(__unused uint32_t maxdelay)
 {
-    uint32_t   cpuModel;
-    uint32_t   cpuFamily;
-    uint32_t   xcpuid[4];
+}
 
-    /*
-     * Initialize the lock for the KEXT initialization.
-     */
-    simple_lock_init(&pm_init_lock, 0);
 
-    /*
-     * XXX
-     *
-     * The following is a hack to disable power management on some systems
-     * until the KEXT is done.  This is strictly temporary!!!
-     */
-    do_cpuid(1, xcpuid);
-    cpuFamily = (xcpuid[eax] >> 8) & 0xf;
-    cpuModel  = (xcpuid[eax] >> 4) & 0xf;
+/*
+ * Get the worst-case time for the C4 to C2 transition.  Returns nanoseconds.
+ */
+unsigned
+ml_get_maxsnoop(void)
+{
+       uint64_t    max_snoop       = 0;
 
-    if (cpuFamily != 0x6 || cpuModel < 0xe)
-       pmDispatch = NULL;
+       if (pmInitDone
+           && pmDispatch != NULL
+           && pmDispatch->getMaxSnoop != NULL) {
+               max_snoop = pmDispatch->getMaxSnoop();
+       }
 
-    if (pmDispatch != NULL && pmDispatch->cstateInit != NULL)
-       (*pmDispatch->cstateInit)();
+       return (unsigned)(max_snoop & 0xffffffff);
+}
+
+
+uint32_t
+ml_get_maxbusdelay(void)
+{
+       uint64_t    max_delay       = 0;
+
+       if (pmInitDone
+           && pmDispatch != NULL
+           && pmDispatch->getMaxBusDelay != NULL) {
+               max_delay = pmDispatch->getMaxBusDelay();
+       }
+
+       return (uint32_t)(max_delay & 0xffffffff);
 }
 
 /*
- * This function will update the system nap policy.  It should be called
- * whenever conditions change: when the system is ready to being napping
- * and if something changes the rules (e.g. a sysctl altering the policy
- * for debugging).
+ * Advertise a memory access latency tolerance of "mdelay" ns
  */
 void
-machine_nap_policy(void)
+ml_set_maxbusdelay(uint32_t mdelay)
 {
-    if (pmDispatch != NULL && pmDispatch->cstateNapPolicy != NULL)
-       napCtl = (*pmDispatch->cstateNapPolicy)(forcenap, napCtl);
+       uint64_t    maxdelay        = mdelay;
+
+       if (pmDispatch != NULL
+           && pmDispatch->setMaxBusDelay != NULL) {
+               earlyMaxBusDelay = DELAY_UNSET;
+               pmDispatch->setMaxBusDelay(maxdelay);
+       } else {
+               earlyMaxBusDelay = maxdelay;
+       }
+}
+
+uint64_t
+ml_get_maxintdelay(void)
+{
+       uint64_t    max_delay       = 0;
+
+       if (pmDispatch != NULL
+           && pmDispatch->getMaxIntDelay != NULL) {
+               max_delay = pmDispatch->getMaxIntDelay();
+       }
+
+       return max_delay;
 }
 
 /*
- * ACPI calls the following routine to set/update mwait hints.  A table
- * (possibly null) specifies the available Cstates and their hints, all
- * other states are assumed to be invalid.  ACPI may update available
- * states to change the nap policy (for example, while AC power is
- * available).
+ * Set the maximum delay allowed for an interrupt.
  */
-kern_return_t
-Cstate_table_set(Cstate_hint_t *tablep, unsigned int nstates)
+void
+ml_set_maxintdelay(uint64_t mdelay)
 {
-    if (forcenap)
-       return(KERN_SUCCESS);
+       if (pmDispatch != NULL
+           && pmDispatch->setMaxIntDelay != NULL) {
+               earlyMaxIntDelay = DELAY_UNSET;
+               pmDispatch->setMaxIntDelay(mdelay);
+       } else {
+               earlyMaxIntDelay = mdelay;
+       }
+}
 
-    if (pmDispatch != NULL && pmDispatch->cstateTableSet != NULL)
-       return((*pmDispatch->cstateTableSet)(tablep, nstates));
-    else {
-       unsigned int    i;
+boolean_t
+ml_get_interrupt_prewake_applicable()
+{
+       boolean_t applicable = FALSE;
 
-       for (i = 0; i < nstates; i += 1) {
-           pmInitState.CStates[i].number = tablep[i].number;
-           pmInitState.CStates[i].hint   = tablep[i].hint;
+       if (pmInitDone
+           && pmDispatch != NULL
+           && pmDispatch->pmInterruptPrewakeApplicable != NULL) {
+               applicable = pmDispatch->pmInterruptPrewakeApplicable();
        }
 
-       pmInitState.CStatesCount = nstates;
-    }
-    return(KERN_SUCCESS);
-}
-
-static inline void
-sti(void) {
-       __asm__ volatile ( "sti" : : : "memory");
+       return applicable;
 }
 
 /*
- * Called when the CPU is idle.  It will choose the best C state to
- * be in.
+ * Put a CPU into "safe" mode with respect to power.
+ *
+ * Some systems cannot operate at a continuous "normal" speed without
+ * exceeding the thermal design.  This is called per-CPU to place the
+ * CPUs into a "safe" operating mode.
  */
 void
-machine_idle_cstate(void)
+pmSafeMode(x86_lcpu_t *lcpu, uint32_t flags)
 {
-    if (pmDispatch != NULL && pmDispatch->cstateMachineIdle != NULL)
-       (*pmDispatch->cstateMachineIdle)(napCtl);
-    else {
-       sti();
-    }
+       if (pmDispatch != NULL
+           && pmDispatch->pmCPUSafeMode != NULL) {
+               pmDispatch->pmCPUSafeMode(lcpu, flags);
+       } else {
+               /*
+                * Do something reasonable if the KEXT isn't present.
+                *
+                * We only look at the PAUSE and RESUME flags.  The other flag(s)
+                * will not make any sense without the KEXT, so just ignore them.
+                *
+                * We set the CPU's state to indicate that it's halted.  If this
+                * is the CPU we're currently running on, then spin until the
+                * state becomes non-halted.
+                */
+               if (flags & PM_SAFE_FL_PAUSE) {
+                       lcpu->state = LCPU_PAUSE;
+                       if (lcpu == x86_lcpu()) {
+                               while (lcpu->state == LCPU_PAUSE) {
+                                       cpu_pause();
+                               }
+                       }
+               }
+
+               /*
+                * Clear the halted flag for the specified CPU, that will
+                * get it out of its spin loop.
+                */
+               if (flags & PM_SAFE_FL_RESUME) {
+                       lcpu->state = LCPU_RUN;
+               }
+       }
 }
 
-static pmStats_t *
-pmsCPUStats(void)
+static uint32_t         saved_run_count = 0;
+
+void
+machine_run_count(uint32_t count)
 {
-    cpu_data_t *pp;
+       if (pmDispatch != NULL
+           && pmDispatch->pmSetRunCount != NULL) {
+               pmDispatch->pmSetRunCount(count);
+       } else {
+               saved_run_count = count;
+       }
+}
+
+processor_t
+machine_choose_processor(processor_set_t pset,
+    processor_t preferred)
+{
+       int         startCPU;
+       int         endCPU;
+       int         preferredCPU;
+       int         chosenCPU;
+
+       if (!pmInitDone) {
+               return preferred;
+       }
+
+       if (pset == NULL) {
+               startCPU = -1;
+               endCPU = -1;
+       } else {
+               startCPU = pset->cpu_set_low;
+               endCPU = pset->cpu_set_hi;
+       }
+
+       if (preferred == NULL) {
+               preferredCPU = -1;
+       } else {
+               preferredCPU = preferred->cpu_id;
+       }
+
+       if (pmDispatch != NULL
+           && pmDispatch->pmChooseCPU != NULL) {
+               chosenCPU = pmDispatch->pmChooseCPU(startCPU, endCPU, preferredCPU);
 
-    pp = current_cpu_datap();
-    return(&pp->cpu_pmStats);
+               if (chosenCPU == -1) {
+                       return NULL;
+               }
+               return cpu_datap(chosenCPU)->cpu_processor;
+       }
+
+       return preferred;
 }
 
-static pmsd *
-pmsCPUStepperData(void)
+static int
+pmThreadGetUrgency(uint64_t *rt_period, uint64_t *rt_deadline)
 {
-    cpu_data_t *pp;
+       thread_urgency_t urgency;
+       uint64_t        arg1, arg2;
+
+       urgency = thread_get_urgency(THREAD_NULL, &arg1, &arg2);
 
-    pp = current_cpu_datap();
-    return(&pp->pms);
+       if (urgency == THREAD_URGENCY_REAL_TIME) {
+               if (rt_period != NULL) {
+                       *rt_period = arg1;
+               }
+
+               if (rt_deadline != NULL) {
+                       *rt_deadline = arg2;
+               }
+       }
+
+       return (int)urgency;
 }
 
-static uint64_t *
-CPUHPETAddr(void)
+#if     DEBUG
+uint32_t        urgency_stats[64][THREAD_URGENCY_MAX];
+#endif
+
+#define         URGENCY_NOTIFICATION_ASSERT_NS (5 * 1000 * 1000)
+uint64_t        urgency_notification_assert_abstime_threshold, urgency_notification_max_recorded;
+
+void
+thread_tell_urgency(thread_urgency_t urgency,
+    uint64_t rt_period,
+    uint64_t rt_deadline,
+    uint64_t sched_latency,
+    thread_t nthread) /*
+ * Pass a thread-urgency notification to the power management kext through
+ * pmDispatch->pmThreadTellUrgency, optionally timing the callout.
+ *
+ * urgency, rt_period, rt_deadline: forwarded unchanged to the kext hook.
+ * sched_latency: only emitted in the DBG_FUNC_START trace record.
+ * nthread: stored in this CPU's cpu_nthread just before the callout.
+ *
+ * The caller must have preemption disabled or interrupts off (asserted).
+ */
 {
-    cpu_data_t *pp;
-    pp = current_cpu_datap();
-    return(pp->cpu_pmHpet);
+       uint64_t        urgency_notification_time_start = 0, delta;
+       boolean_t       urgency_assert = (urgency_notification_assert_abstime_threshold != 0); /* timing is enabled only by a non-zero threshold */
+       assert(get_preemption_level() > 0 || ml_get_interrupts_enabled() == FALSE);
+#if     DEBUG
+       urgency_stats[cpu_number() % 64][urgency]++; /* counted even when no kext hook is registered (that check comes next) */
+#endif
+       if (!pmInitDone
+           || pmDispatch == NULL
+           || pmDispatch->pmThreadTellUrgency == NULL) {
+               return; /* pmInitDone unset or no hook registered: nothing to notify, no trace emitted */
+       }
+
+       SCHED_DEBUG_PLATFORM_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_URGENCY) | DBG_FUNC_START, urgency, rt_period, rt_deadline, sched_latency, 0);
+
+       if (__improbable((urgency_assert == TRUE))) {
+               urgency_notification_time_start = mach_absolute_time(); /* timestamp taken only when timing is enabled */
+       }
+
+       current_cpu_datap()->cpu_nthread = nthread;
+       pmDispatch->pmThreadTellUrgency(urgency, rt_period, rt_deadline);
+
+       if (__improbable((urgency_assert == TRUE))) {
+               delta = mach_absolute_time() - urgency_notification_time_start; /* callout duration in mach_absolute_time() units */
+
+               if (__improbable(delta > urgency_notification_max_recorded)) {
+                       /* This is not synchronized, but it doesn't matter
+                        * if we (rarely) miss an event, as it is statistically
+                        * unlikely that it will never recur.
+                        *
+                        * The threshold check below is nested in this branch,
+                        * so it is only evaluated when this callout sets a new
+                        * maximum; it is also skipped while
+                        * machine_timeout_suspended() returns true.
+                        */
+                       urgency_notification_max_recorded = delta;
+
+                       if (__improbable((delta > urgency_notification_assert_abstime_threshold) && !machine_timeout_suspended())) {
+                               panic("Urgency notification callout %p exceeded threshold, 0x%llx abstime units", pmDispatch->pmThreadTellUrgency, delta);
+                       }
+               }
+       }
+
+       SCHED_DEBUG_PLATFORM_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_URGENCY) | DBG_FUNC_END, urgency, rt_period, rt_deadline, 0, 0);
 }
 
-/*
- * Called by the power management kext to register itself and to get the
- * callbacks it might need into other power management functions.
- */
 void
-pmRegister(pmDispatch_t *cpuFuncs, pmCallBacks_t *callbacks)
-{
-    if (callbacks != NULL) {
-       callbacks->Park        = pmsPark;
-       callbacks->Run         = pmsRun;
-       callbacks->RunLocal    = pmsRunLocal;
-       callbacks->SetStep     = pmsSetStep;
-       callbacks->NapPolicy   = machine_nap_policy;
-       callbacks->Build       = pmsBuild;
-       callbacks->Stats       = pmsCPUStats;
-       callbacks->StepperData = pmsCPUStepperData;
-       callbacks->HPETAddr    = CPUHPETAddr;
-       callbacks->InitState   = &pmInitState;
-       callbacks->resetPop    = resetPop;
-    }
-
-    if (cpuFuncs != NULL)
-       pmDispatch = cpuFuncs;
+machine_thread_going_on_core(__unused thread_t      new_thread,
+    __unused thread_urgency_t           urgency,
+    __unused uint64_t      sched_latency,
+    __unused uint64_t      same_pri_latency,
+    __unused uint64_t      dispatch_time) /* No-op in this file: every parameter is marked __unused and the body is empty. */
+{
 }
 
-/*
- * Unregisters the power management functions from the kext.
- */
 void
-pmUnRegister(pmDispatch_t *cpuFuncs)
+machine_thread_going_off_core(thread_t old_thread, boolean_t thread_terminating,
+    uint64_t last_dispatch, boolean_t thread_runnable) /*
+ * Relay an "off core" event to the power management kext: all four
+ * arguments are passed unchanged to pmDispatch->pmThreadGoingOffCore.
+ * Does nothing when that hook is unavailable.
+ */
 {
-    if (cpuFuncs != NULL && pmDispatch == cpuFuncs)
-       pmDispatch = NULL;
+       if (!pmInitDone
+           || pmDispatch == NULL
+           || pmDispatch->pmThreadGoingOffCore == NULL) {
+               return; /* pmInitDone unset, no dispatch table, or hook not provided */
+       }
+
+       pmDispatch->pmThreadGoingOffCore(old_thread, thread_terminating,
+           last_dispatch, thread_runnable);
 }
 
-#if MACH_KDB
-/*
- * XXX stubs for now
- */
 void
-db_cfg(__unused db_expr_t addr,
-       __unused int have_addr,
-       __unused db_expr_t count,
-       __unused char *modif)
+machine_max_runnable_latency(__unused uint64_t bg_max_latency,
+    __unused uint64_t default_max_latency,
+    __unused uint64_t realtime_max_latency) /* No-op in this file: all three latency arguments are marked __unused and the body is empty. */
 {
-    return;
 }
 
 void
-db_display_iokit(__unused db_expr_t addr,
-                __unused int have_addr,
-                __unused db_expr_t count,
-                __unused char *modif)
+machine_work_interval_notify(__unused thread_t thread,
+    __unused struct kern_work_interval_args* kwi_args) /* No-op in this file: both arguments are marked __unused and the body is empty. */
 {
-    return;
 }
 
+
 void
-db_dtimers(__unused db_expr_t addr,
-          __unused int have_addr,
-          __unused db_expr_t count,
-          __unused char *modif)
+machine_switch_perfcontrol_context(__unused perfcontrol_event event,
+    __unused uint64_t timestamp,
+    __unused uint32_t flags,
+    __unused uint64_t new_thread_same_pri_latency,
+    __unused thread_t old,
+    __unused thread_t new) /* No-op in this file: every parameter is marked __unused and the body is empty. */
 {
-    return;
 }
 
 void
-db_intcnt(__unused db_expr_t addr,
-         __unused int have_addr,
-         __unused db_expr_t count,
-         __unused char *modif)
+machine_switch_perfcontrol_state_update(__unused perfcontrol_event event,
+    __unused uint64_t timestamp,
+    __unused uint32_t flags,
+    __unused thread_t thread) /* No-op in this file: every parameter is marked __unused and the body is empty. */
 {
-    return;
 }
 
 void
-db_nap(__unused db_expr_t addr,
-       __unused int have_addr,
-       __unused db_expr_t count,
-       __unused char *modif)
+active_rt_threads(boolean_t active) /* Forward `active` unchanged to pmDispatch->pmActiveRTThreads; does nothing when that hook is unavailable. */
+{
+       if (!pmInitDone
+           || pmDispatch == NULL
+           || pmDispatch->pmActiveRTThreads == NULL) {
+               return; /* pmInitDone unset, no dispatch table, or hook not provided */
+       }
+
+       pmDispatch->pmActiveRTThreads(active);
+}
+
+static uint32_t
+pmGetSavedRunCount(void) /* Accessor installed as callbacks->GetSavedRunCount by pmKextRegister(). */
+{
+       return saved_run_count; /* current value of saved_run_count, read without any locking here */
+}
+
+/*
+ * Returns the root of the package tree.
+ * This is simply the current value of x86_pkgs; the function is installed
+ * as callbacks->GetPkgRoot by pmKextRegister().
+ */
+x86_pkg_t *
+pmGetPkgRoot(void)
+{
+       return x86_pkgs;
+}
+
+static boolean_t
+pmCPUGetHibernate(int cpu) /* Accessor installed as callbacks->GetHibernate by pmKextRegister(). */
+{
+       return cpu_datap(cpu)->cpu_hibernate; /* `cpu` is not range-checked here; assumes cpu_datap(cpu) is valid -- TODO confirm against callers */
+}
+
+processor_t
+pmLCPUtoProcessor(int lcpu) /* Map a CPU number to the cpu_processor recorded in its per-CPU data; installed as callbacks->LCPUtoProcessor. */
+{
+       return cpu_datap(lcpu)->cpu_processor; /* `lcpu` is not range-checked here; assumes cpu_datap(lcpu) is valid -- TODO confirm against callers */
+}
+
+static void
+pmReSyncDeadlines(int cpu) /*
+ * Get timer_resync_deadlines() run for `cpu`: called directly when `cpu`
+ * is the calling CPU, otherwise requested through cpu_PM_interrupt(cpu)
+ * (presumably a cross-CPU PM interrupt -- confirm against its definition).
+ * Installed as callbacks->resyncDeadlines by pmKextRegister().
+ */
+{
+       static boolean_t    registered      = FALSE; /* one-shot guard; read and written without any locking in this function */
+
+       if (!registered) {
+               PM_interrupt_register(&timer_resync_deadlines); /* done on first call only; presumably makes timer_resync_deadlines the PM-interrupt handler */
+               registered = TRUE;
+       }
+
+       if ((uint32_t)cpu == current_cpu_datap()->lcpu.cpu_num) {
+               timer_resync_deadlines(); /* target is the calling CPU: resync in place */
+       } else {
+               cpu_PM_interrupt(cpu);
+       }
+}
+
+static void
+pmSendIPI(int cpu) /* Thin wrapper installed as callbacks->pmSendIPI by pmKextRegister(). */
 {
-    return;
+       lapic_send_ipi(cpu, LAPIC_PM_INTERRUPT); /* always uses the LAPIC_PM_INTERRUPT vector; `cpu` is passed through unchecked */
 }
 
+static void
+pmGetNanotimeInfo(pm_rtc_nanotime_t *rtc_nanotime) /* Copy the fields of pal_rtc_nanotime_info into *rtc_nanotime (must be non-NULL; not checked). */
+{
+       /*
+        * Make sure that nanotime didn't change while we were reading it.
+        * The copy is repeated while the generation captured first differs
+        * from the live one, unless the live generation reads as 0.
+        * NOTE(review): a live generation of 0 ends the loop even though the
+        * captured generation may differ -- confirm what 0 means to the
+        * writer of pal_rtc_nanotime_info.
+        */
+       do {
+               rtc_nanotime->generation = pal_rtc_nanotime_info.generation; /* must be first */
+               rtc_nanotime->tsc_base = pal_rtc_nanotime_info.tsc_base;
+               rtc_nanotime->ns_base = pal_rtc_nanotime_info.ns_base;
+               rtc_nanotime->scale = pal_rtc_nanotime_info.scale;
+               rtc_nanotime->shift = pal_rtc_nanotime_info.shift;
+       } while (pal_rtc_nanotime_info.generation != 0
+           && rtc_nanotime->generation != pal_rtc_nanotime_info.generation);
+}
+
+uint32_t
+pmTimerQueueMigrate(int target_cpu) /* Installed as callbacks->timerQueueMigrate by pmKextRegister(). */
+{
+       /* Call the etimer code to do this.  A request that targets the calling CPU itself is not forwarded and yields 0; otherwise the result of timer_queue_migrate_cpu() is returned as-is. */
+       return (target_cpu != cpu_number())
+              ? timer_queue_migrate_cpu(target_cpu)
+              : 0;
+}
+
+
+/*
+ * Called by the power management kext to register itself and to get the
+ * callbacks it might need into other kernel functions.  This interface
+ * is versioned to allow for slight mis-matches between the kext and the
+ * kernel.
+ *
+ * version:   must equal PM_DISPATCH_VERSION.
+ * cpuFuncs:  the kext's dispatch table; when non-NULL it becomes pmDispatch.
+ * callbacks: table that is filled in with kernel entry points for the kext.
+ *
+ * Panics when callbacks is NULL or the version differs (both cases report
+ * a version mis-match), and when a dispatch table is already registered.
+ */
 void
-db_pmgr(__unused db_expr_t addr,
-       __unused int have_addr,
-       __unused db_expr_t count,
-       __unused char *modif)
+pmKextRegister(uint32_t version, pmDispatch_t *cpuFuncs,
+    pmCallBacks_t *callbacks)
 {
-    return;
+       if (callbacks != NULL && version == PM_DISPATCH_VERSION) {
+               callbacks->setRTCPop            = setPop;
+               callbacks->resyncDeadlines      = pmReSyncDeadlines;
+               callbacks->initComplete         = pmInitComplete;
+               callbacks->GetLCPU              = pmGetLogicalCPU;
+               callbacks->GetCore              = pmGetCore;
+               callbacks->GetDie               = pmGetDie;
+               callbacks->GetPackage           = pmGetPackage;
+               callbacks->GetMyLCPU            = pmGetMyLogicalCPU;
+               callbacks->GetMyCore            = pmGetMyCore;
+               callbacks->GetMyDie             = pmGetMyDie;
+               callbacks->GetMyPackage         = pmGetMyPackage;
+               callbacks->GetPkgRoot           = pmGetPkgRoot;
+               callbacks->LockCPUTopology      = pmLockCPUTopology;
+               callbacks->GetHibernate         = pmCPUGetHibernate;
+               callbacks->LCPUtoProcessor      = pmLCPUtoProcessor;
+               callbacks->ThreadBind           = thread_bind;
+               callbacks->GetSavedRunCount     = pmGetSavedRunCount;
+               callbacks->GetNanotimeInfo      = pmGetNanotimeInfo;
+               callbacks->ThreadGetUrgency     = pmThreadGetUrgency;
+               callbacks->RTCClockAdjust       = rtc_clock_adjust;
+               callbacks->timerQueueMigrate    = pmTimerQueueMigrate;
+               callbacks->topoParms            = &topoParms; /* a pointer to the kernel's topoParms object, not a function */
+               callbacks->pmSendIPI            = pmSendIPI;
+               callbacks->InterruptPending     = lapic_is_interrupt_pending;
+               callbacks->IsInterrupting       = lapic_is_interrupting;
+               callbacks->InterruptStats       = lapic_interrupt_counts;
+               callbacks->DisableApicTimer     = lapic_disable_timer;
+       } else {
+               panic("Version mis-match between Kernel and CPU PM"); /* NOTE(review): also reached when callbacks == NULL, even if the version matches */
+       }
+
+       if (cpuFuncs != NULL) {
+               if (pmDispatch) {
+                       panic("Attempt to re-register power management interface--AICPM present in xcpm mode? %p->%p", pmDispatch, cpuFuncs);
+               }
+
+               pmDispatch = cpuFuncs;
+
+               if (earlyTopology
+                   && pmDispatch->pmCPUStateInit != NULL) {
+                       (*pmDispatch->pmCPUStateInit)(); /* earlyTopology was set and the kext supplies the hook: run it now, then clear the flag */
+                       earlyTopology = FALSE;
+               }
+
+               if (pmDispatch->pmIPIHandler != NULL) {
+                       lapic_set_pm_func((i386_intr_func_t)pmDispatch->pmIPIHandler); /* hand the kext's IPI handler to the LAPIC code */
+               }
+       }
 }
 
+/*
+ * Unregisters the power management functions from the kext.
+ * pmDispatch is cleared only when cpuFuncs is non-NULL and is the table
+ * currently registered; any other argument leaves it untouched.  Nothing
+ * else is undone in this function.
+ */
 void
-db_test(__unused db_expr_t addr,
-       __unused int have_addr,
-       __unused db_expr_t count,
-       __unused char *modif)
+pmUnRegister(pmDispatch_t *cpuFuncs)
 {
-    return;
+       if (cpuFuncs != NULL && pmDispatch == cpuFuncs) {
+               pmDispatch = NULL;
+       }
 }
 
 void
-db_getpmgr(__unused pmData_t *pmj)
+machine_track_platform_idle(boolean_t entry) /*
+ * Maintain the idle count of the calling CPU's package: a true `entry`
+ * atomically increments num_idle, a false one atomically decrements it and
+ * counts a package idle exit when num_idle stood at
+ * topoParms.nLThreadsPerPackage before the decrement.
+ */
 {
+       cpu_data_t              *my_cpu         = current_cpu_datap();
+
+       if (entry) {
+               (void)__sync_fetch_and_add(&my_cpu->lcpu.package->num_idle, 1);
+       } else {
+               uint32_t nidle = __sync_fetch_and_sub(&my_cpu->lcpu.package->num_idle, 1); /* the builtin returns the value held before the subtraction */
+               if (nidle == topoParms.nLThreadsPerPackage) {
+                       my_cpu->lcpu.package->package_idle_exits++; /* plain (non-atomic) increment, unlike the num_idle updates */
+               }
+       }
 }
-#endif