X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/2d21ac55c334faf3a56e5634905ed6987fc787d4..4d15aeb193b2c68f1d38666c317f8d3734f5f083:/osfmk/i386/pmCPU.c

diff --git a/osfmk/i386/pmCPU.c b/osfmk/i386/pmCPU.c
index 1f12073fb..5791823d7 100644
--- a/osfmk/i386/pmCPU.c
+++ b/osfmk/i386/pmCPU.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2004-2011 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -31,286 +31,196 @@
  *
  * Implements the "wrappers" to the KEXT.
  */
-#include <i386/machine_routines.h>
-#include <i386/machine_cpu.h>
-#include <i386/misc_protos.h>
-#include <i386/pmap.h>
 #include <i386/asm.h>
+#include <i386/machine_cpu.h>
 #include <i386/mp.h>
+#include <i386/machine_routines.h>
 #include <i386/proc_reg.h>
+#include <i386/pmap.h>
+#include <i386/misc_protos.h>
+#include <kern/machine.h>
 #include <kern/pms.h>
 #include <kern/processor.h>
+#include <kern/timer_queue.h>
 #include <i386/cpu_threads.h>
 #include <i386/pmCPU.h>
 #include <i386/cpuid.h>
-#include <i386/rtclock.h>
+#include <i386/rtclock_protos.h>
+#include <kern/sched_prim.h>
+#include <i386/lapic.h>
+#include <i386/pal_routines.h>
+#include <sys/kdebug.h>
+#include <i386/tsc.h>
 
 extern int disableConsoleOutput;
 
-decl_simple_lock_data(,pm_init_lock);
+#define DELAY_UNSET		0xFFFFFFFFFFFFFFFFULL
+
+uint64_t cpu_itime_bins[CPU_ITIME_BINS] = {16* NSEC_PER_USEC, 32* NSEC_PER_USEC, 64* NSEC_PER_USEC, 128* NSEC_PER_USEC, 256* NSEC_PER_USEC, 512* NSEC_PER_USEC, 1024* NSEC_PER_USEC, 2048* NSEC_PER_USEC, 4096* NSEC_PER_USEC, 8192* NSEC_PER_USEC, 16384* NSEC_PER_USEC, 32768* NSEC_PER_USEC};
+uint64_t *cpu_rtime_bins = &cpu_itime_bins[0];
 
 /*
  * The following is set when the KEXT loads and initializes.
  */
 pmDispatch_t	*pmDispatch	= NULL;
 
-/*
- * Current power management states (for use until KEXT is loaded).
- */
-static pmInitState_t	pmInitState;
-
-static uint32_t		pmInitDone	= 0;
+uint32_t		pmInitDone		= 0;
+static boolean_t	earlyTopology		= FALSE;
+static uint64_t		earlyMaxBusDelay	= DELAY_UNSET;
+static uint64_t		earlyMaxIntDelay	= DELAY_UNSET;
 
 /*
- * Nap control variables:
- */
-uint32_t forcenap = 0;			/* Force nap (fn) boot-arg controls */
-
-/*
- * Do any initialization needed
- */
-void
-pmsInit(void)
-{
-    static int		initialized	= 0;
-
-    /*
-     * Initialize some of the initial state to "uninitialized" until
-     * it gets set with something more useful.  This allows the KEXT
-     * to determine if the initial value was actually set to something.
-     */
-    if (!initialized) {
-	pmInitState.PState = -1;
-	pmInitState.PLimit = -1;
-	pmInitState.maxBusDelay = -1;
-	initialized = 1;
-    }
-
-    if (pmDispatch != NULL && pmDispatch->pmsInit != NULL)
-	(*pmDispatch->pmsInit)();
-}
-
-/*
- * Start the power management stepper on all processors
- *
- * All processors must be parked.  This should be called when the hardware
- * is ready to step.  Probably only at boot and after wake from sleep.
- *
+ * Initialize the Cstate change code.
  */
 void
-pmsStart(void)
+power_management_init(void)
 {
-    if (pmDispatch != NULL && pmDispatch->pmsStart != NULL)
-	(*pmDispatch->pmsStart)();
+    if (pmDispatch != NULL && pmDispatch->cstateInit != NULL)
+	(*pmDispatch->cstateInit)();
 }
 
-/*
- * Park the stepper execution.  This will force the stepper on this
- * processor to abandon its current step and stop.  No changes to the
- * hardware state is made and any previous step is lost.
- *	
- * This is used as the initial state at startup and when the step table
- * is being changed.
- *
- */
-void
-pmsPark(void)
-{
-    if (pmDispatch != NULL && pmDispatch->pmsPark != NULL)
-	(*pmDispatch->pmsPark)();
+static inline void machine_classify_interval(uint64_t interval, uint64_t *bins, uint64_t *binvals, uint32_t nbins) {
+	uint32_t i;
+ 	for (i = 0; i < nbins; i++) {
+ 		if (interval < binvals[i]) {
+ 			bins[i]++;
+ 			break;
+ 		}
+ 	}
 }
 
-/*
- * Control the Power Management Stepper.
- * Called from user state by the superuser.
- * Interrupts disabled.
- *
- * This interface is deprecated and is now a no-op.
- */
-kern_return_t
-pmsControl(__unused uint32_t request, __unused user_addr_t reqaddr,
-	   __unused uint32_t reqsize)
-{
-    return(KERN_SUCCESS);
-}
+uint64_t	idle_pending_timers_processed;
+uint32_t	idle_entry_timer_processing_hdeadline_threshold = 5000000;
 
 /*
- * Broadcast a change to all processors including ourselves.
- *
- * Interrupts disabled.
+ * Called when the CPU is idle.  It calls into the power management kext
+ * to determine the best way to idle the CPU.
  */
 void
-pmsRun(uint32_t nstep)
-{
-    if (pmDispatch != NULL && pmDispatch->pmsRun != NULL)
-	(*pmDispatch->pmsRun)(nstep);
-}
-
-/*
- * Build the tables needed for the stepper.  This includes both the step
- * definitions and the step control table.
- *
- * We most absolutely need to be parked before this happens because we're
- * going to change the table.  We also have to be complte about checking
- * for errors.  A copy is always made because we don't want to be crippled
- * by not being able to change the table or description formats.
- *
- * We pass in a table of external functions and the new stepper def uses
- * the corresponding indexes rather than actual function addresses.  This
- * is done so that a proper table can be built with the control syscall.
- * It can't supply addresses, so the index has to do.  We internalize the
- * table so our caller does not need to keep it.  Note that passing in a 0
- * will use the current function table.  Also note that entry 0 is reserved
- * and must be 0, we will check and fail the build.
- *
- * The platformData parameter is a 32-bit word of data that is passed unaltered
- * to the set function.
- *
- * The queryFunc parameter is the address of a function that will return the
- * current state of the platform. The format of the data returned is the same
- * as the platform specific portions of pmsSetCmd, i.e., pmsXClk, pmsVoltage,
- * and any part of pmsPowerID that is maintained by the platform hardware
- * (an example would be the values of the gpios that correspond to pmsPowerID).
- * The value should be constructed by querying hardware rather than returning
- * a value cached by software. One of the intents of this function is to help
- * recover lost or determine initial power states.
- *
- */
-kern_return_t
-pmsBuild(pmsDef *pd, uint32_t pdsize, pmsSetFunc_t *functab,
-	 uint32_t platformData, pmsQueryFunc_t queryFunc)
-{
-    kern_return_t	rc	= 0;
-
-    if (pmDispatch != NULL && pmDispatch->pmsBuild != NULL)
-	rc = (*pmDispatch->pmsBuild)(pd, pdsize, functab,
-				     platformData, queryFunc);
-
-    return(rc);
-}
-
-
-/*
- * Load a new ratio/VID table.
- *
- * Note that this interface is specific to the Intel SpeedStep implementation.
- * It is expected that this will only be called once to override the default
- * ratio/VID table when the platform starts.
- *
- * Normally, the table will need to be replaced at the same time that the
- * stepper program proper is replaced, as the PState indices from an old
- * program may no longer be valid.  When replacing the default program this
- * should not be a problem as any new table will have at least two PState
- * entries and the default program only references P0 and P1.
- */
-kern_return_t
-pmsCPULoadVIDTable(uint16_t *tablep, int nstates)
-{
-    if (pmDispatch != NULL && pmDispatch->pmsCPULoadVIDTable != NULL)
-	return((*pmDispatch->pmsCPULoadVIDTable)(tablep, nstates));
-    else {
-	int	i;
-
-	if (nstates > MAX_PSTATES)
-	    return(KERN_FAILURE);
-
-	for (i = 0; i < nstates; i += 1)
-	    pmInitState.VIDTable[i] = tablep[i];
-    }
-    return(KERN_SUCCESS);
-}
-
-/*
- * Set the (global) PState limit.  CPUs will not be permitted to run at
- * a lower (more performant) PState than this.
- */
-kern_return_t
-pmsCPUSetPStateLimit(uint32_t limit)
-{
-    if (pmDispatch != NULL && pmDispatch->pmsCPUSetPStateLimit != NULL)
-	return((*pmDispatch->pmsCPUSetPStateLimit)(limit));
-
-    pmInitState.PLimit = limit;
-    return(KERN_SUCCESS);
-}
+machine_idle(void)
+{
+	cpu_data_t		*my_cpu		= current_cpu_datap();
+	__unused uint32_t	cnum = my_cpu->cpu_number;
+	uint64_t		ctime, rtime, itime;
+#if CST_DEMOTION_DEBUG
+	processor_t		cproc = my_cpu->cpu_processor;
+	uint64_t		cwakeups = PROCESSOR_DATA(cproc, wakeups_issued_total);
+#endif /* CST_DEMOTION_DEBUG */
+	uint64_t esdeadline, ehdeadline;
+	boolean_t do_process_pending_timers = FALSE;
+
+	ctime = mach_absolute_time();
+	esdeadline = my_cpu->rtclock_timer.queue.earliest_soft_deadline;
+	ehdeadline = my_cpu->rtclock_timer.deadline;
+/* Soft deadline passed and hard deadline imminent: process pending timers */
+	if ((ctime >= esdeadline) && (ctime < ehdeadline) &&
+	    ((ehdeadline - ctime) < idle_entry_timer_processing_hdeadline_threshold)) {
+		idle_pending_timers_processed++;
+		do_process_pending_timers = TRUE;
+		goto machine_idle_exit;
+	} else {
+		TCOAL_DEBUG(0xCCCC0000, ctime, my_cpu->rtclock_timer.queue.earliest_soft_deadline, my_cpu->rtclock_timer.deadline, idle_pending_timers_processed, 0);
+	}
+    
+	my_cpu->lcpu.state = LCPU_IDLE;
+	DBGLOG(cpu_handle, cpu_number(), MP_IDLE);
+	MARK_CPU_IDLE(cnum);
+
+	rtime = ctime - my_cpu->cpu_ixtime;
+
+	my_cpu->cpu_rtime_total += rtime;
+	machine_classify_interval(rtime, &my_cpu->cpu_rtimes[0], &cpu_rtime_bins[0], CPU_RTIME_BINS);
+#if CST_DEMOTION_DEBUG
+	uint32_t cl = 0, ch = 0;
+	uint64_t c3res, c6res, c7res;
+	rdmsr_carefully(MSR_IA32_CORE_C3_RESIDENCY, &cl, &ch);
+	c3res = ((uint64_t)ch << 32) | cl;
+	rdmsr_carefully(MSR_IA32_CORE_C6_RESIDENCY, &cl, &ch);
+	c6res = ((uint64_t)ch << 32) | cl;
+	rdmsr_carefully(MSR_IA32_CORE_C7_RESIDENCY, &cl, &ch);
+	c7res = ((uint64_t)ch << 32) | cl;
+#endif
+
+	if (pmInitDone) {
+		/*
+		 * Handle case where ml_set_maxbusdelay() or ml_set_maxintdelay()
+		 * were called prior to the CPU PM kext being registered.  We do
+		 * this here because idle is where these values are first used:
+		 * it is where the decisions that depend on them are made.
+		 */
+		if (earlyMaxBusDelay != DELAY_UNSET)
+			ml_set_maxbusdelay((uint32_t)(earlyMaxBusDelay & 0xFFFFFFFF));
+		if (earlyMaxIntDelay != DELAY_UNSET)
+			ml_set_maxintdelay(earlyMaxIntDelay);
+	}
 
-/*
- * Initialize the Cstate change code.
- */
-void
-power_management_init(void)
-{
-    static boolean_t	initialized	= FALSE;
+	if (pmInitDone
+	    && pmDispatch != NULL
+	    && pmDispatch->MachineIdle != NULL)
+		(*pmDispatch->MachineIdle)(0x7FFFFFFFFFFFFFFFULL);
+	else {
+		/*
+		 * If no power management, re-enable interrupts and halt.
+		 * This will keep the CPU from spinning through the scheduler
+		 * and will allow at least some minimal power savings (but it
+		 * cause problems in some MP configurations w.r.t. the APIC
+		 * stopping during a GV3 transition).
+		 */
+		pal_hlt();
+		/* Once woken, re-disable interrupts. */
+		pal_cli();
+	}
 
-    /*
-     * Initialize the lock for the KEXT initialization.
-     */
-    if (!initialized) {
-	simple_lock_init(&pm_init_lock, 0);
-	initialized = TRUE;
-    }
+	/*
+	 * Mark the CPU as running again.
+	 */
+	MARK_CPU_ACTIVE(cnum);
+	DBGLOG(cpu_handle, cnum, MP_UNIDLE);
+	my_cpu->lcpu.state = LCPU_RUN;
+	uint64_t ixtime = my_cpu->cpu_ixtime = mach_absolute_time();
+	itime = ixtime - ctime;
+	my_cpu->cpu_idle_exits++;
+        my_cpu->cpu_itime_total += itime;
+    	machine_classify_interval(itime, &my_cpu->cpu_itimes[0], &cpu_itime_bins[0], CPU_ITIME_BINS);
+#if CST_DEMOTION_DEBUG
+	cl = ch = 0;
+	rdmsr_carefully(MSR_IA32_CORE_C3_RESIDENCY, &cl, &ch);
+	c3res = (((uint64_t)ch << 32) | cl) - c3res;
+	rdmsr_carefully(MSR_IA32_CORE_C6_RESIDENCY, &cl, &ch);
+	c6res = (((uint64_t)ch << 32) | cl) - c6res;
+	rdmsr_carefully(MSR_IA32_CORE_C7_RESIDENCY, &cl, &ch);
+	c7res = (((uint64_t)ch << 32) | cl) - c7res;
+
+	uint64_t ndelta = itime - tmrCvt(c3res + c6res + c7res, tscFCvtt2n);
+	KERNEL_DEBUG_CONSTANT(0xcead0000, ndelta, itime, c7res, c6res, c3res);
+	if ((itime > 1000000) && (ndelta > 250000))
+		KERNEL_DEBUG_CONSTANT(0xceae0000, ndelta, itime, c7res, c6res, c3res);
+#endif
+
+	machine_idle_exit:
+	/*
+	 * Re-enable interrupts.
+	 */
 
-    if (pmDispatch != NULL && pmDispatch->cstateInit != NULL)
-	(*pmDispatch->cstateInit)();
-}
+	pal_sti();
 
-/*
- * ACPI calls the following routine to set/update mwait hints.  A table
- * (possibly null) specifies the available Cstates and their hints, all
- * other states are assumed to be invalid.  ACPI may update available
- * states to change the nap policy (for example, while AC power is
- * available).
- */
-kern_return_t
-Cstate_table_set(Cstate_hint_t *tablep, unsigned int nstates)
-{
-    if (forcenap)
-	return(KERN_SUCCESS);
+	if (do_process_pending_timers) {
+		TCOAL_DEBUG(0xBBBB0000 | DBG_FUNC_START, ctime, esdeadline, ehdeadline, idle_pending_timers_processed, 0);
 
-    if (pmDispatch != NULL && pmDispatch->cstateTableSet != NULL)
-	return((*pmDispatch->cstateTableSet)(tablep, nstates));
-    else {
-	unsigned int	i;
+		/* Adjust to reflect that this isn't truly a package idle exit */
+		__sync_fetch_and_sub(&my_cpu->lcpu.package->num_idle, 1);
+		lapic_timer_swi(); /* Trigger software timer interrupt */
+		__sync_fetch_and_add(&my_cpu->lcpu.package->num_idle, 1);
 
-	for (i = 0; i < nstates; i += 1) {
-	    pmInitState.CStates[i].number = tablep[i].number;
-	    pmInitState.CStates[i].hint   = tablep[i].hint;
+		TCOAL_DEBUG(0xBBBB0000 | DBG_FUNC_END, ctime, esdeadline, idle_pending_timers_processed, 0, 0);
 	}
+#if CST_DEMOTION_DEBUG
+	uint64_t nwakeups = PROCESSOR_DATA(cproc, wakeups_issued_total);
 
-	pmInitState.CStatesCount = nstates;
-    }
-    return(KERN_SUCCESS);
-}
-
-/*
- * Called when the CPU is idle.  It will choose the best C state to
- * be in.
- */
-void
-machine_idle_cstate(boolean_t halted)
-{
-	if (pmInitDone
-	    && pmDispatch != NULL
-	    && pmDispatch->cstateMachineIdle != NULL)
-		(*pmDispatch->cstateMachineIdle)(!halted ?
-						 0x7FFFFFFFFFFFFFFFULL : 0ULL);
-	else if (halted) {
-	    /*
-	     * If no power managment and a processor is taken off-line,
-	     * then invalidate the cache and halt it (it will not be able
-	     * to be brought back on-line without resetting the CPU).
-	     */
-	    __asm__ volatile ( "wbinvd; hlt" );
-	} else {
-	    /*
-	     * If no power management, re-enable interrupts and halt.
-	     * This will keep the CPU from spinning through the scheduler
-	     * and will allow at least some minimal power savings (but it
-	     * may cause problems in some MP configurations w.r.t to the
-	     * APIC stopping during a P-State transition).
-	     */
-	    __asm__ volatile ( "sti; hlt" );
+	if ((nwakeups == cwakeups) && (topoParms.nLThreadsPerPackage == my_cpu->lcpu.package->num_idle)) {
+		KERNEL_DEBUG_CONSTANT(0xceaa0000, cwakeups, 0, 0, 0, 0);
 	}
+#endif    
 }
 
 /*
@@ -320,63 +230,87 @@ machine_idle_cstate(boolean_t halted)
 void
 pmCPUHalt(uint32_t reason)
 {
+    cpu_data_t	*cpup	= current_cpu_datap();
 
     switch (reason) {
     case PM_HALT_DEBUG:
-	__asm__ volatile ("wbinvd; hlt");
+	cpup->lcpu.state = LCPU_PAUSE;
+	pal_stop_cpu(FALSE);
 	break;
 
     case PM_HALT_PANIC:
-	__asm__ volatile ("cli; wbinvd; hlt");
+	cpup->lcpu.state = LCPU_PAUSE;
+	pal_stop_cpu(TRUE);
 	break;
 
     case PM_HALT_NORMAL:
+    case PM_HALT_SLEEP:
     default:
-	__asm__ volatile ("cli");
+        pal_cli();
 
 	if (pmInitDone
 	    && pmDispatch != NULL
 	    && pmDispatch->pmCPUHalt != NULL) {
+	    /*
+	     * Halt the CPU (and put it in a low power state).
+	     */
 	    (*pmDispatch->pmCPUHalt)();
-	} else {
-	    cpu_data_t	*cpup	= current_cpu_datap();
 
+	    /*
+	     * We've exited halt, so get the CPU schedulable again.
+	     * - by calling the fast init routine for a slave, or
+	     * - by returning if we're the master processor.
+	     */
+	    if (cpup->cpu_number != master_cpu) {
+		i386_init_slave_fast();
+		panic("init_slave_fast returned");
+	    }
+	} else
+	{
 	    /*
 	     * If no power managment and a processor is taken off-line,
 	     * then invalidate the cache and halt it (it will not be able
 	     * to be brought back on-line without resetting the CPU).
 	     */
 	    __asm__ volatile ("wbinvd");
-	    cpup->lcpu.halted = TRUE;
-	    __asm__ volatile ( "wbinvd; hlt" );
+	    cpup->lcpu.state = LCPU_HALT;
+	    pal_stop_cpu(FALSE);
+
+	    panic("back from Halt");
 	}
+
 	break;
     }
 }
 
-/*
- * Called to initialize the power management structures for the CPUs.
- */
 void
-pmCPUStateInit(void)
+pmMarkAllCPUsOff(void)
 {
-    if (pmDispatch != NULL && pmDispatch->pmCPUStateInit != NULL)
-	(*pmDispatch->pmCPUStateInit)();
+    if (pmInitDone
+	&& pmDispatch != NULL
+	&& pmDispatch->markAllCPUsOff != NULL)
+	(*pmDispatch->markAllCPUsOff)();
 }
 
 static void
 pmInitComplete(void)
 {
+    if (earlyTopology
+	&& pmDispatch != NULL
+	&& pmDispatch->pmCPUStateInit != NULL) {
+	(*pmDispatch->pmCPUStateInit)();
+	earlyTopology = FALSE;
+    }
     pmInitDone = 1;
 }
 
-static x86_lcpu_t *
+x86_lcpu_t *
 pmGetLogicalCPU(int cpu)
 {
     return(cpu_to_lcpu(cpu));
 }
 
-static x86_lcpu_t *
+x86_lcpu_t *
 pmGetMyLogicalCPU(void)
 {
     cpu_data_t	*cpup	= current_cpu_datap();
@@ -398,6 +332,20 @@ pmGetMyCore(void)
     return(cpup->lcpu.core);
 }
 
+static x86_die_t *
+pmGetDie(int cpu)
+{
+    return(cpu_to_die(cpu));
+}
+
+static x86_die_t *
+pmGetMyDie(void)
+{
+    cpu_data_t	*cpup	= current_cpu_datap();
+
+    return(cpup->lcpu.die);
+}
+
 static x86_pkg_t *
 pmGetPackage(int cpu)
 {
@@ -409,7 +357,7 @@ pmGetMyPackage(void)
 {
     cpu_data_t	*cpup	= current_cpu_datap();
 
-    return(cpup->lcpu.core->package);
+    return(cpup->lcpu.package);
 }
 
 static void
@@ -425,11 +373,13 @@ pmLockCPUTopology(int lock)
 /*
  * Called to get the next deadline that has been set by the
  * power management code.
+ * Note: a return of 0 from AICPM and this routine signifies
+ * that no deadline is set.
  */
 uint64_t
 pmCPUGetDeadline(cpu_data_t *cpu)
 {
-    uint64_t	deadline	= EndOfAllTime;
+    uint64_t	deadline	= 0;
 
     if (pmInitDone
 	&& pmDispatch != NULL
@@ -443,10 +393,11 @@ pmCPUGetDeadline(cpu_data_t *cpu)
  * Called to determine if the supplied deadline or the power management
  * deadline is sooner.  Returns which ever one is first.
  */
+
 uint64_t
 pmCPUSetDeadline(cpu_data_t *cpu, uint64_t deadline)
 {
-    if (pmInitDone
+   if (pmInitDone
 	&& pmDispatch != NULL
 	&& pmDispatch->SetDeadline != NULL)
 	deadline = (*pmDispatch->SetDeadline)(&cpu->lcpu, deadline);
@@ -484,29 +435,58 @@ pmCPUExitIdle(cpu_data_t *cpu)
     return(do_ipi);
 }
 
+kern_return_t
+pmCPUExitHalt(int cpu)
+{
+    kern_return_t	rc	= KERN_INVALID_ARGUMENT;
+
+    if (pmInitDone
+	&& pmDispatch != NULL
+	&& pmDispatch->exitHalt != NULL)
+	rc = pmDispatch->exitHalt(cpu_to_lcpu(cpu));
+
+    return(rc);
+}
+
+kern_return_t
+pmCPUExitHaltToOff(int cpu)
+{
+    kern_return_t	rc	= KERN_SUCCESS;
+
+    if (pmInitDone
+	&& pmDispatch != NULL
+	&& pmDispatch->exitHaltToOff != NULL)
+	rc = pmDispatch->exitHaltToOff(cpu_to_lcpu(cpu));
+
+    return(rc);
+}
+
 /*
- * Called when a CPU is being restarted after being powered off (as in S3).
+ * Called to initialize the power management structures for the CPUs.
  */
 void
-pmCPUMarkRunning(cpu_data_t *cpu)
+pmCPUStateInit(void)
 {
-    if (pmInitDone
-	&& pmDispatch != NULL
-	&& pmDispatch->markCPURunning != NULL)
-	(*pmDispatch->markCPURunning)(&cpu->lcpu);
+    if (pmDispatch != NULL && pmDispatch->pmCPUStateInit != NULL)
+	(*pmDispatch->pmCPUStateInit)();
+    else
+	earlyTopology = TRUE;
 }
 
 /*
- * Called from the HPET interrupt handler to perform the
- * necessary power management work.
+ * Called when a CPU is being restarted after being powered off (as in S3).
  */
 void
-pmHPETInterrupt(void)
+pmCPUMarkRunning(cpu_data_t *cpu)
 {
+    cpu_data_t	*cpup	= current_cpu_datap();
+
     if (pmInitDone
 	&& pmDispatch != NULL
-	&& pmDispatch->HPETInterrupt != NULL)
-	(*pmDispatch->HPETInterrupt)();
+	&& pmDispatch->markCPURunning != NULL)
+	(*pmDispatch->markCPURunning)(&cpu->lcpu);
+    else
+	cpup->lcpu.state = LCPU_RUN;
 }
 
 /*
@@ -524,6 +504,30 @@ pmCPUControl(uint32_t cmd, void *datap)
     return(rc);
 }
 
+/*
+ * Called to save the timer state used by power management prior
+ * to "sleeping".
+ */
+void
+pmTimerSave(void)
+{
+    if (pmDispatch != NULL
+	&& pmDispatch->pmTimerStateSave != NULL)
+	(*pmDispatch->pmTimerStateSave)();
+}
+
+/*
+ * Called to restore the timer state used by power management after
+ * waking from "sleep".
+ */
+void
+pmTimerRestore(void)
+{
+    if (pmDispatch != NULL
+	&& pmDispatch->pmTimerStateRestore != NULL)
+	(*pmDispatch->pmTimerStateRestore)();
+}
+
 /*
  * Set the worst-case time for the C4 to C2 transition.
  * No longer does anything.
@@ -542,7 +546,8 @@ ml_get_maxsnoop(void)
 {
     uint64_t	max_snoop	= 0;
 
-    if (pmDispatch != NULL
+    if (pmInitDone
+	&& pmDispatch != NULL
 	&& pmDispatch->getMaxSnoop != NULL)
 	max_snoop = pmDispatch->getMaxSnoop();
 
@@ -555,7 +560,8 @@ ml_get_maxbusdelay(void)
 {
     uint64_t	max_delay	= 0;
 
-    if (pmDispatch != NULL
+    if (pmInitDone
+	&& pmDispatch != NULL
 	&& pmDispatch->getMaxBusDelay != NULL)
 	max_delay = pmDispatch->getMaxBusDelay();
 
@@ -563,12 +569,7 @@ ml_get_maxbusdelay(void)
 }
 
 /*
- * Set the maximum delay time allowed for snoop on the bus.
- *
- * Note that this value will be compared to the amount of time that it takes
- * to transition from a non-snooping power state (C4) to a snooping state (C2).
- * If maxBusDelay is less than C4C2SnoopDelay,
- * we will not enter the lowest power state.
+ * Advertise a memory access latency tolerance of "mdelay" ns
  */
 void
 ml_set_maxbusdelay(uint32_t mdelay)
@@ -576,10 +577,50 @@ ml_set_maxbusdelay(uint32_t mdelay)
     uint64_t	maxdelay	= mdelay;
 
     if (pmDispatch != NULL
-	&& pmDispatch->setMaxBusDelay != NULL)
+	&& pmDispatch->setMaxBusDelay != NULL) {
+	earlyMaxBusDelay = DELAY_UNSET;
 	pmDispatch->setMaxBusDelay(maxdelay);
-    else
-	pmInitState.maxBusDelay = maxdelay;
+    } else
+	earlyMaxBusDelay = maxdelay;
+}
+
+uint64_t
+ml_get_maxintdelay(void)
+{
+    uint64_t	max_delay	= 0;
+
+    if (pmDispatch != NULL
+	&& pmDispatch->getMaxIntDelay != NULL)
+	max_delay = pmDispatch->getMaxIntDelay();
+
+    return(max_delay);
+}
+
+/*
+ * Set the maximum delay allowed for an interrupt.
+ */
+void
+ml_set_maxintdelay(uint64_t mdelay)
+{
+    if (pmDispatch != NULL
+	&& pmDispatch->setMaxIntDelay != NULL) {
+	earlyMaxIntDelay = DELAY_UNSET;
+	pmDispatch->setMaxIntDelay(mdelay);
+    } else
+	earlyMaxIntDelay = mdelay;
+}
+
+boolean_t
+ml_get_interrupt_prewake_applicable()
+{
+    boolean_t applicable = FALSE;
+
+    if (pmInitDone 
+	&& pmDispatch != NULL
+	&& pmDispatch->pmInterruptPrewakeApplicable != NULL)
+	applicable = pmDispatch->pmInterruptPrewakeApplicable();
+
+    return applicable;
 }
 
 /*
@@ -602,15 +643,14 @@ pmSafeMode(x86_lcpu_t *lcpu, uint32_t flags)
 	 * We only look at the PAUSE and RESUME flags.  The other flag(s)
 	 * will not make any sense without the KEXT, so just ignore them.
 	 *
-	 * We set the halted flag in the LCPU structure to indicate
-	 * that this CPU isn't to do anything.  If it's the CPU we're
-	 * currently running on, then spin until the halted flag is
-	 * reset.
+	 * We set the CPU's state to indicate that it's halted.  If this
+	 * is the CPU we're currently running on, then spin until the
+	 * state becomes non-halted.
 	 */
 	if (flags & PM_SAFE_FL_PAUSE) {
-	    lcpu->halted = TRUE;
+	    lcpu->state = LCPU_PAUSE;
 	    if (lcpu == x86_lcpu()) {
-		while (lcpu->halted)
+		while (lcpu->state == LCPU_PAUSE)
 		    cpu_pause();
 	    }
 	}
@@ -620,15 +660,183 @@ pmSafeMode(x86_lcpu_t *lcpu, uint32_t flags)
 	 * get it out of it's spin loop.
 	 */
 	if (flags & PM_SAFE_FL_RESUME) {
-	    lcpu->halted = FALSE;
+	    lcpu->state = LCPU_RUN;
 	}
     }
 }
 
+static uint32_t		saved_run_count = 0;
+
+void
+machine_run_count(uint32_t count)
+{
+    if (pmDispatch != NULL
+	&& pmDispatch->pmSetRunCount != NULL)
+	pmDispatch->pmSetRunCount(count);
+    else
+	saved_run_count = count;
+}
+
+processor_t
+machine_choose_processor(processor_set_t pset,
+			 processor_t preferred)
+{
+    int		startCPU;
+    int		endCPU;
+    int		preferredCPU;
+    int		chosenCPU;
+
+    if (!pmInitDone)
+	return(preferred);
+
+    if (pset == NULL) {
+	startCPU = -1;
+	endCPU = -1;
+    } else {
+	startCPU = pset->cpu_set_low;
+	endCPU = pset->cpu_set_hi;
+    }
+
+    if (preferred == NULL)
+	preferredCPU = -1;
+    else
+	preferredCPU = preferred->cpu_id;
+
+    if (pmDispatch != NULL
+	&& pmDispatch->pmChooseCPU != NULL) {
+	chosenCPU = pmDispatch->pmChooseCPU(startCPU, endCPU, preferredCPU);
+
+	if (chosenCPU == -1)
+	    return(NULL);
+	return(cpu_datap(chosenCPU)->cpu_processor);
+    }
+
+    return(preferred);
+}
+
+static int
+pmThreadGetUrgency(uint64_t *rt_period, uint64_t *rt_deadline)
+{
+	int             urgency;
+	uint64_t        arg1, arg2;
+
+	urgency = thread_get_urgency(current_processor()->next_thread, &arg1, &arg2);
+
+	if (urgency == THREAD_URGENCY_REAL_TIME) {
+		if (rt_period != NULL)
+			*rt_period = arg1;
+		
+		if (rt_deadline != NULL)
+			*rt_deadline = arg2;
+	}
+
+	return(urgency);
+}
+
+#if	DEBUG
+uint32_t	urgency_stats[64][THREAD_URGENCY_MAX];
+#endif
+
+#define		URGENCY_NOTIFICATION_ASSERT_NS (5 * 1000 * 1000)
+uint64_t	urgency_notification_assert_abstime_threshold, urgency_notification_max_recorded;
+
+void
+thread_tell_urgency(int urgency,
+    uint64_t rt_period,
+    uint64_t rt_deadline,
+    uint64_t sched_latency,
+    thread_t nthread)
+{
+	uint64_t	urgency_notification_time_start, delta;
+	boolean_t	urgency_assert = (urgency_notification_assert_abstime_threshold != 0);
+	assert(get_preemption_level() > 0 || ml_get_interrupts_enabled() == FALSE);
+#if	DEBUG
+	urgency_stats[cpu_number() % 64][urgency]++;
+#endif
+	if (!pmInitDone
+	    || pmDispatch == NULL
+	    || pmDispatch->pmThreadTellUrgency == NULL)
+		return;
+
+	SCHED_DEBUG_PLATFORM_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED,MACH_URGENCY) | DBG_FUNC_START, urgency, rt_period, rt_deadline, sched_latency, 0);
+
+	if (__improbable((urgency_assert == TRUE)))
+		urgency_notification_time_start = mach_absolute_time();
+
+	current_cpu_datap()->cpu_nthread = nthread;
+	pmDispatch->pmThreadTellUrgency(urgency, rt_period, rt_deadline);
+
+	if (__improbable((urgency_assert == TRUE))) {
+		delta = mach_absolute_time() - urgency_notification_time_start;
+
+		if (__improbable(delta > urgency_notification_max_recorded)) {
+			/* This is not synchronized, but it doesn't matter
+			 * if we (rarely) miss an event, as it is statistically
+			 * unlikely that it will never recur.
+			 */
+			urgency_notification_max_recorded = delta;
+
+			if (__improbable((delta > urgency_notification_assert_abstime_threshold) && !machine_timeout_suspended()))
+				panic("Urgency notification callout %p exceeded threshold, 0x%llx abstime units", pmDispatch->pmThreadTellUrgency, delta);
+		}
+	}
+
+	SCHED_DEBUG_PLATFORM_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED,MACH_URGENCY) | DBG_FUNC_END, urgency, rt_period, rt_deadline, 0, 0);
+}
+
+void
+machine_thread_going_on_core(__unused thread_t      new_thread,
+							 __unused int           urgency,
+							 __unused uint64_t      sched_latency,
+							 __unused uint64_t      dispatch_time)
+{
+}
+
+void
+machine_thread_going_off_core(__unused thread_t old_thread, __unused boolean_t thread_terminating, __unused uint64_t last_dispatch)
+{
+}
+
+void
+machine_max_runnable_latency(__unused uint64_t bg_max_latency,
+							 __unused uint64_t default_max_latency,
+							 __unused uint64_t realtime_max_latency)
+{
+}
+
+void
+machine_work_interval_notify(__unused thread_t thread,
+							 __unused uint64_t work_interval_id,
+							 __unused uint64_t start_abstime,
+							 __unused uint64_t finish_abstime,
+							 __unused uint64_t deadline_abstime,
+							 __unused uint64_t next_start_abstime,
+							 __unused uint16_t urgency,
+							 __unused uint32_t flags)
+{
+}
+
+void
+active_rt_threads(boolean_t active)
+{
+    if (!pmInitDone
+	|| pmDispatch == NULL
+	|| pmDispatch->pmActiveRTThreads == NULL)
+	return;
+
+    pmDispatch->pmActiveRTThreads(active);
+}
+
+static uint32_t
+pmGetSavedRunCount(void)
+{
+    return(saved_run_count);
+}
+
 /*
  * Returns the root of the package tree.
  */
-static x86_pkg_t *
+x86_pkg_t *
 pmGetPkgRoot(void)
 {
     return(x86_pkgs);
@@ -640,12 +848,60 @@ pmCPUGetHibernate(int cpu)
     return(cpu_datap(cpu)->cpu_hibernate);
 }
 
-static processor_t
+processor_t
 pmLCPUtoProcessor(int lcpu)
 {
     return(cpu_datap(lcpu)->cpu_processor);
 }
 
+static void
+pmReSyncDeadlines(int cpu)
+{
+    static boolean_t	registered	= FALSE;
+
+    if (!registered) {
+	PM_interrupt_register(&timer_resync_deadlines);
+	registered = TRUE;
+    }
+
+    if ((uint32_t)cpu == current_cpu_datap()->lcpu.cpu_num)
+	timer_resync_deadlines();
+    else
+	cpu_PM_interrupt(cpu);
+}
+
+static void
+pmSendIPI(int cpu)
+{
+    lapic_send_ipi(cpu, LAPIC_PM_INTERRUPT);
+}
+
+static void
+pmGetNanotimeInfo(pm_rtc_nanotime_t *rtc_nanotime)
+{
+	/*
+	 * Make sure that nanotime didn't change while we were reading it.
+	 */
+	do {
+		rtc_nanotime->generation = pal_rtc_nanotime_info.generation; /* must be first */
+		rtc_nanotime->tsc_base = pal_rtc_nanotime_info.tsc_base;
+		rtc_nanotime->ns_base = pal_rtc_nanotime_info.ns_base;
+		rtc_nanotime->scale = pal_rtc_nanotime_info.scale;
+		rtc_nanotime->shift = pal_rtc_nanotime_info.shift;
+	} while(pal_rtc_nanotime_info.generation != 0
+		&& rtc_nanotime->generation != pal_rtc_nanotime_info.generation);
+}
+
+uint32_t
+pmTimerQueueMigrate(int target_cpu)
+{
+    /* Let the timer queue code do this; targeting the current CPU is a no-op. */
+    return (target_cpu != cpu_number())
+		? timer_queue_migrate_cpu(target_cpu)
+		: 0;
+}
+
+
 /*
  * Called by the power management kext to register itself and to get the
  * callbacks it might need into other kernel functions.  This interface
@@ -654,29 +910,57 @@ pmLCPUtoProcessor(int lcpu)
  */
 void
 pmKextRegister(uint32_t version, pmDispatch_t *cpuFuncs,
-	       pmCallBacks_t *callbacks)
-{
-    if (callbacks != NULL && version == PM_DISPATCH_VERSION) {
-	callbacks->InitState   = &pmInitState;
-	callbacks->setRTCPop   = setPop;
-	callbacks->resyncDeadlines = etimer_resync_deadlines;
-	callbacks->initComplete= pmInitComplete;
-	callbacks->GetLCPU     = pmGetLogicalCPU;
-	callbacks->GetCore     = pmGetCore;
-	callbacks->GetPackage  = pmGetPackage;
-	callbacks->GetMyLCPU   = pmGetMyLogicalCPU;
-	callbacks->GetMyCore   = pmGetMyCore;
-	callbacks->GetMyPackage= pmGetMyPackage;
-	callbacks->CoresPerPkg = cpuid_info()->cpuid_cores_per_package;
-	callbacks->GetPkgRoot  = pmGetPkgRoot;
-	callbacks->LockCPUTopology = pmLockCPUTopology;
-	callbacks->GetHibernate    = pmCPUGetHibernate;
-	callbacks->LCPUtoProcessor = pmLCPUtoProcessor;
-    }
+    pmCallBacks_t *callbacks)
+{
+	if (callbacks != NULL && version == PM_DISPATCH_VERSION) {
+		callbacks->setRTCPop            = setPop;
+		callbacks->resyncDeadlines      = pmReSyncDeadlines;
+		callbacks->initComplete         = pmInitComplete;
+		callbacks->GetLCPU              = pmGetLogicalCPU;
+		callbacks->GetCore              = pmGetCore;
+		callbacks->GetDie               = pmGetDie;
+		callbacks->GetPackage           = pmGetPackage;
+		callbacks->GetMyLCPU            = pmGetMyLogicalCPU;
+		callbacks->GetMyCore            = pmGetMyCore;
+		callbacks->GetMyDie             = pmGetMyDie;
+		callbacks->GetMyPackage         = pmGetMyPackage;
+		callbacks->GetPkgRoot           = pmGetPkgRoot;
+		callbacks->LockCPUTopology      = pmLockCPUTopology;
+		callbacks->GetHibernate         = pmCPUGetHibernate;
+		callbacks->LCPUtoProcessor      = pmLCPUtoProcessor;
+		callbacks->ThreadBind           = thread_bind;
+		callbacks->GetSavedRunCount     = pmGetSavedRunCount;
+		callbacks->GetNanotimeInfo	= pmGetNanotimeInfo;
+		callbacks->ThreadGetUrgency	= pmThreadGetUrgency;
+		callbacks->RTCClockAdjust	= rtc_clock_adjust;
+		callbacks->timerQueueMigrate    = pmTimerQueueMigrate;
+		callbacks->topoParms            = &topoParms;
+		callbacks->pmSendIPI		= pmSendIPI;
+		callbacks->InterruptPending	= lapic_is_interrupt_pending;
+		callbacks->IsInterrupting	= lapic_is_interrupting;
+		callbacks->InterruptStats	= lapic_interrupt_counts;
+		callbacks->DisableApicTimer	= lapic_disable_timer;
+	} else {
+		panic("Version mis-match between Kernel and CPU PM");
+	}
 
-    if (cpuFuncs != NULL) {
-	pmDispatch = cpuFuncs;
-    }
+	if (cpuFuncs != NULL) {
+		if (pmDispatch) {
+			panic("Attempt to re-register power management interface--AICPM present in xcpm mode? %p->%p", pmDispatch, cpuFuncs);
+		}
+
+		pmDispatch = cpuFuncs;
+
+		if (earlyTopology
+		    && pmDispatch->pmCPUStateInit != NULL) {
+			(*pmDispatch->pmCPUStateInit)();
+			earlyTopology = FALSE;
+		}
+
+		if (pmDispatch->pmIPIHandler != NULL) {
+			lapic_set_pm_func((i386_intr_func_t)pmDispatch->pmIPIHandler);
+		}
+	}
 }
 
 /*
@@ -690,3 +974,16 @@ pmUnRegister(pmDispatch_t *cpuFuncs)
     }
 }
 
+void machine_track_platform_idle(boolean_t entry) {
+	cpu_data_t		*my_cpu		= current_cpu_datap();
+
+	if (entry) {
+		(void)__sync_fetch_and_add(&my_cpu->lcpu.package->num_idle, 1);
+	}
+ 	else {
+ 		uint32_t nidle = __sync_fetch_and_sub(&my_cpu->lcpu.package->num_idle, 1);
+ 		if (nidle == topoParms.nLThreadsPerPackage) {
+ 			my_cpu->lcpu.package->package_idle_exits++;
+ 		}
+ 	}
+}