X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/2d21ac55c334faf3a56e5634905ed6987fc787d4..4d15aeb193b2c68f1d38666c317f8d3734f5f083:/osfmk/i386/pmCPU.c diff --git a/osfmk/i386/pmCPU.c b/osfmk/i386/pmCPU.c index 1f12073fb..5791823d7 100644 --- a/osfmk/i386/pmCPU.c +++ b/osfmk/i386/pmCPU.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2007 Apple Inc. All rights reserved. + * Copyright (c) 2004-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -31,286 +31,196 @@ * * Implements the "wrappers" to the KEXT. */ -#include -#include -#include -#include #include +#include #include +#include #include +#include +#include +#include #include #include +#include #include #include #include -#include +#include +#include +#include +#include +#include +#include extern int disableConsoleOutput; -decl_simple_lock_data(,pm_init_lock); +#define DELAY_UNSET 0xFFFFFFFFFFFFFFFFULL + +uint64_t cpu_itime_bins[CPU_ITIME_BINS] = {16* NSEC_PER_USEC, 32* NSEC_PER_USEC, 64* NSEC_PER_USEC, 128* NSEC_PER_USEC, 256* NSEC_PER_USEC, 512* NSEC_PER_USEC, 1024* NSEC_PER_USEC, 2048* NSEC_PER_USEC, 4096* NSEC_PER_USEC, 8192* NSEC_PER_USEC, 16384* NSEC_PER_USEC, 32768* NSEC_PER_USEC}; +uint64_t *cpu_rtime_bins = &cpu_itime_bins[0]; /* * The following is set when the KEXT loads and initializes. */ pmDispatch_t *pmDispatch = NULL; -/* - * Current power management states (for use until KEXT is loaded). - */ -static pmInitState_t pmInitState; - -static uint32_t pmInitDone = 0; +uint32_t pmInitDone = 0; +static boolean_t earlyTopology = FALSE; +static uint64_t earlyMaxBusDelay = DELAY_UNSET; +static uint64_t earlyMaxIntDelay = DELAY_UNSET; /* - * Nap control variables: - */ -uint32_t forcenap = 0; /* Force nap (fn) boot-arg controls */ - -/* - * Do any initialization needed - */ -void -pmsInit(void) -{ - static int initialized = 0; - - /* - * Initialize some of the initial state to "uninitialized" until - * it gets set with something more useful. 
This allows the KEXT - * to determine if the initial value was actually set to something. - */ - if (!initialized) { - pmInitState.PState = -1; - pmInitState.PLimit = -1; - pmInitState.maxBusDelay = -1; - initialized = 1; - } - - if (pmDispatch != NULL && pmDispatch->pmsInit != NULL) - (*pmDispatch->pmsInit)(); -} - -/* - * Start the power management stepper on all processors - * - * All processors must be parked. This should be called when the hardware - * is ready to step. Probably only at boot and after wake from sleep. - * + * Initialize the Cstate change code. */ void -pmsStart(void) +power_management_init(void) { - if (pmDispatch != NULL && pmDispatch->pmsStart != NULL) - (*pmDispatch->pmsStart)(); + if (pmDispatch != NULL && pmDispatch->cstateInit != NULL) + (*pmDispatch->cstateInit)(); } -/* - * Park the stepper execution. This will force the stepper on this - * processor to abandon its current step and stop. No changes to the - * hardware state is made and any previous step is lost. - * - * This is used as the initial state at startup and when the step table - * is being changed. - * - */ -void -pmsPark(void) -{ - if (pmDispatch != NULL && pmDispatch->pmsPark != NULL) - (*pmDispatch->pmsPark)(); +static inline void machine_classify_interval(uint64_t interval, uint64_t *bins, uint64_t *binvals, uint32_t nbins) { + uint32_t i; + for (i = 0; i < nbins; i++) { + if (interval < binvals[i]) { + bins[i]++; + break; + } + } } -/* - * Control the Power Management Stepper. - * Called from user state by the superuser. - * Interrupts disabled. - * - * This interface is deprecated and is now a no-op. - */ -kern_return_t -pmsControl(__unused uint32_t request, __unused user_addr_t reqaddr, - __unused uint32_t reqsize) -{ - return(KERN_SUCCESS); -} +uint64_t idle_pending_timers_processed; +uint32_t idle_entry_timer_processing_hdeadline_threshold = 5000000; /* - * Broadcast a change to all processors including ourselves. - * - * Interrupts disabled. 
+ * Called when the CPU is idle. It calls into the power management kext + * to determine the best way to idle the CPU. */ void -pmsRun(uint32_t nstep) -{ - if (pmDispatch != NULL && pmDispatch->pmsRun != NULL) - (*pmDispatch->pmsRun)(nstep); -} - -/* - * Build the tables needed for the stepper. This includes both the step - * definitions and the step control table. - * - * We most absolutely need to be parked before this happens because we're - * going to change the table. We also have to be complte about checking - * for errors. A copy is always made because we don't want to be crippled - * by not being able to change the table or description formats. - * - * We pass in a table of external functions and the new stepper def uses - * the corresponding indexes rather than actual function addresses. This - * is done so that a proper table can be built with the control syscall. - * It can't supply addresses, so the index has to do. We internalize the - * table so our caller does not need to keep it. Note that passing in a 0 - * will use the current function table. Also note that entry 0 is reserved - * and must be 0, we will check and fail the build. - * - * The platformData parameter is a 32-bit word of data that is passed unaltered - * to the set function. - * - * The queryFunc parameter is the address of a function that will return the - * current state of the platform. The format of the data returned is the same - * as the platform specific portions of pmsSetCmd, i.e., pmsXClk, pmsVoltage, - * and any part of pmsPowerID that is maintained by the platform hardware - * (an example would be the values of the gpios that correspond to pmsPowerID). - * The value should be constructed by querying hardware rather than returning - * a value cached by software. One of the intents of this function is to help - * recover lost or determine initial power states. 
- * - */ -kern_return_t -pmsBuild(pmsDef *pd, uint32_t pdsize, pmsSetFunc_t *functab, - uint32_t platformData, pmsQueryFunc_t queryFunc) -{ - kern_return_t rc = 0; - - if (pmDispatch != NULL && pmDispatch->pmsBuild != NULL) - rc = (*pmDispatch->pmsBuild)(pd, pdsize, functab, - platformData, queryFunc); - - return(rc); -} - - -/* - * Load a new ratio/VID table. - * - * Note that this interface is specific to the Intel SpeedStep implementation. - * It is expected that this will only be called once to override the default - * ratio/VID table when the platform starts. - * - * Normally, the table will need to be replaced at the same time that the - * stepper program proper is replaced, as the PState indices from an old - * program may no longer be valid. When replacing the default program this - * should not be a problem as any new table will have at least two PState - * entries and the default program only references P0 and P1. - */ -kern_return_t -pmsCPULoadVIDTable(uint16_t *tablep, int nstates) -{ - if (pmDispatch != NULL && pmDispatch->pmsCPULoadVIDTable != NULL) - return((*pmDispatch->pmsCPULoadVIDTable)(tablep, nstates)); - else { - int i; - - if (nstates > MAX_PSTATES) - return(KERN_FAILURE); - - for (i = 0; i < nstates; i += 1) - pmInitState.VIDTable[i] = tablep[i]; - } - return(KERN_SUCCESS); -} - -/* - * Set the (global) PState limit. CPUs will not be permitted to run at - * a lower (more performant) PState than this. 
- */ -kern_return_t -pmsCPUSetPStateLimit(uint32_t limit) -{ - if (pmDispatch != NULL && pmDispatch->pmsCPUSetPStateLimit != NULL) - return((*pmDispatch->pmsCPUSetPStateLimit)(limit)); - - pmInitState.PLimit = limit; - return(KERN_SUCCESS); -} +machine_idle(void) +{ + cpu_data_t *my_cpu = current_cpu_datap(); + __unused uint32_t cnum = my_cpu->cpu_number; + uint64_t ctime, rtime, itime; +#if CST_DEMOTION_DEBUG + processor_t cproc = my_cpu->cpu_processor; + uint64_t cwakeups = PROCESSOR_DATA(cproc, wakeups_issued_total); +#endif /* CST_DEMOTION_DEBUG */ + uint64_t esdeadline, ehdeadline; + boolean_t do_process_pending_timers = FALSE; + + ctime = mach_absolute_time(); + esdeadline = my_cpu->rtclock_timer.queue.earliest_soft_deadline; + ehdeadline = my_cpu->rtclock_timer.deadline; +/* Determine if pending timers exist */ + if ((ctime >= esdeadline) && (ctime < ehdeadline) && + ((ehdeadline - ctime) < idle_entry_timer_processing_hdeadline_threshold)) { + idle_pending_timers_processed++; + do_process_pending_timers = TRUE; + goto machine_idle_exit; + } else { + TCOAL_DEBUG(0xCCCC0000, ctime, my_cpu->rtclock_timer.queue.earliest_soft_deadline, my_cpu->rtclock_timer.deadline, idle_pending_timers_processed, 0); + } + + my_cpu->lcpu.state = LCPU_IDLE; + DBGLOG(cpu_handle, cpu_number(), MP_IDLE); + MARK_CPU_IDLE(cnum); + + rtime = ctime - my_cpu->cpu_ixtime; + + my_cpu->cpu_rtime_total += rtime; + machine_classify_interval(rtime, &my_cpu->cpu_rtimes[0], &cpu_rtime_bins[0], CPU_RTIME_BINS); +#if CST_DEMOTION_DEBUG + uint32_t cl = 0, ch = 0; + uint64_t c3res, c6res, c7res; + rdmsr_carefully(MSR_IA32_CORE_C3_RESIDENCY, &cl, &ch); + c3res = ((uint64_t)ch << 32) | cl; + rdmsr_carefully(MSR_IA32_CORE_C6_RESIDENCY, &cl, &ch); + c6res = ((uint64_t)ch << 32) | cl; + rdmsr_carefully(MSR_IA32_CORE_C7_RESIDENCY, &cl, &ch); + c7res = ((uint64_t)ch << 32) | cl; +#endif + + if (pmInitDone) { + /* + * Handle case where ml_set_maxbusdelay() or ml_set_maxintdelay() + * were called prior to 
the CPU PM kext being registered. We do + * this here since we know at this point the values will be first + * used since idle is where the decisions using these values are made. + */ + if (earlyMaxBusDelay != DELAY_UNSET) + ml_set_maxbusdelay((uint32_t)(earlyMaxBusDelay & 0xFFFFFFFF)); + if (earlyMaxIntDelay != DELAY_UNSET) + ml_set_maxintdelay(earlyMaxIntDelay); + } -/* - * Initialize the Cstate change code. - */ -void -power_management_init(void) -{ - static boolean_t initialized = FALSE; + if (pmInitDone + && pmDispatch != NULL + && pmDispatch->MachineIdle != NULL) + (*pmDispatch->MachineIdle)(0x7FFFFFFFFFFFFFFFULL); + else { + /* + * If no power management, re-enable interrupts and halt. + * This will keep the CPU from spinning through the scheduler + * and will allow at least some minimal power savings (but it + * may cause problems in some MP configurations w.r.t. the APIC + * stopping during a GV3 transition). + */ + pal_hlt(); + /* Once woken, re-disable interrupts. */ + pal_cli(); + } - /* - * Initialize the lock for the KEXT initialization. - */ - if (!initialized) { - simple_lock_init(&pm_init_lock, 0); - initialized = TRUE; - } + /* + * Mark the CPU as running again.
+ */ + MARK_CPU_ACTIVE(cnum); + DBGLOG(cpu_handle, cnum, MP_UNIDLE); + my_cpu->lcpu.state = LCPU_RUN; + uint64_t ixtime = my_cpu->cpu_ixtime = mach_absolute_time(); + itime = ixtime - ctime; + my_cpu->cpu_idle_exits++; + my_cpu->cpu_itime_total += itime; + machine_classify_interval(itime, &my_cpu->cpu_itimes[0], &cpu_itime_bins[0], CPU_ITIME_BINS); +#if CST_DEMOTION_DEBUG + cl = ch = 0; + rdmsr_carefully(MSR_IA32_CORE_C3_RESIDENCY, &cl, &ch); + c3res = (((uint64_t)ch << 32) | cl) - c3res; + rdmsr_carefully(MSR_IA32_CORE_C6_RESIDENCY, &cl, &ch); + c6res = (((uint64_t)ch << 32) | cl) - c6res; + rdmsr_carefully(MSR_IA32_CORE_C7_RESIDENCY, &cl, &ch); + c7res = (((uint64_t)ch << 32) | cl) - c7res; + + uint64_t ndelta = itime - tmrCvt(c3res + c6res + c7res, tscFCvtt2n); + KERNEL_DEBUG_CONSTANT(0xcead0000, ndelta, itime, c7res, c6res, c3res); + if ((itime > 1000000) && (ndelta > 250000)) + KERNEL_DEBUG_CONSTANT(0xceae0000, ndelta, itime, c7res, c6res, c3res); +#endif + + machine_idle_exit: + /* + * Re-enable interrupts. + */ - if (pmDispatch != NULL && pmDispatch->cstateInit != NULL) - (*pmDispatch->cstateInit)(); -} + pal_sti(); -/* - * ACPI calls the following routine to set/update mwait hints. A table - * (possibly null) specifies the available Cstates and their hints, all - * other states are assumed to be invalid. ACPI may update available - * states to change the nap policy (for example, while AC power is - * available). 
- */ -kern_return_t -Cstate_table_set(Cstate_hint_t *tablep, unsigned int nstates) -{ - if (forcenap) - return(KERN_SUCCESS); + if (do_process_pending_timers) { + TCOAL_DEBUG(0xBBBB0000 | DBG_FUNC_START, ctime, esdeadline, ehdeadline, idle_pending_timers_processed, 0); - if (pmDispatch != NULL && pmDispatch->cstateTableSet != NULL) - return((*pmDispatch->cstateTableSet)(tablep, nstates)); - else { - unsigned int i; + /* Adjust to reflect that this isn't truly a package idle exit */ + __sync_fetch_and_sub(&my_cpu->lcpu.package->num_idle, 1); + lapic_timer_swi(); /* Trigger software timer interrupt */ + __sync_fetch_and_add(&my_cpu->lcpu.package->num_idle, 1); - for (i = 0; i < nstates; i += 1) { - pmInitState.CStates[i].number = tablep[i].number; - pmInitState.CStates[i].hint = tablep[i].hint; + TCOAL_DEBUG(0xBBBB0000 | DBG_FUNC_END, ctime, esdeadline, idle_pending_timers_processed, 0, 0); } +#if CST_DEMOTION_DEBUG + uint64_t nwakeups = PROCESSOR_DATA(cproc, wakeups_issued_total); - pmInitState.CStatesCount = nstates; - } - return(KERN_SUCCESS); -} - -/* - * Called when the CPU is idle. It will choose the best C state to - * be in. - */ -void -machine_idle_cstate(boolean_t halted) -{ - if (pmInitDone - && pmDispatch != NULL - && pmDispatch->cstateMachineIdle != NULL) - (*pmDispatch->cstateMachineIdle)(!halted ? - 0x7FFFFFFFFFFFFFFFULL : 0ULL); - else if (halted) { - /* - * If no power managment and a processor is taken off-line, - * then invalidate the cache and halt it (it will not be able - * to be brought back on-line without resetting the CPU). - */ - __asm__ volatile ( "wbinvd; hlt" ); - } else { - /* - * If no power management, re-enable interrupts and halt. - * This will keep the CPU from spinning through the scheduler - * and will allow at least some minimal power savings (but it - * may cause problems in some MP configurations w.r.t to the - * APIC stopping during a P-State transition). 
- */ - __asm__ volatile ( "sti; hlt" ); + if ((nwakeups == cwakeups) && (topoParms.nLThreadsPerPackage == my_cpu->lcpu.package->num_idle)) { + KERNEL_DEBUG_CONSTANT(0xceaa0000, cwakeups, 0, 0, 0, 0); } +#endif } /* @@ -320,63 +230,87 @@ machine_idle_cstate(boolean_t halted) void pmCPUHalt(uint32_t reason) { + cpu_data_t *cpup = current_cpu_datap(); switch (reason) { case PM_HALT_DEBUG: - __asm__ volatile ("wbinvd; hlt"); + cpup->lcpu.state = LCPU_PAUSE; + pal_stop_cpu(FALSE); break; case PM_HALT_PANIC: - __asm__ volatile ("cli; wbinvd; hlt"); + cpup->lcpu.state = LCPU_PAUSE; + pal_stop_cpu(TRUE); break; case PM_HALT_NORMAL: + case PM_HALT_SLEEP: default: - __asm__ volatile ("cli"); + pal_cli(); if (pmInitDone && pmDispatch != NULL && pmDispatch->pmCPUHalt != NULL) { + /* + * Halt the CPU (and put it in a low power state). + */ (*pmDispatch->pmCPUHalt)(); - } else { - cpu_data_t *cpup = current_cpu_datap(); + /* + * We've exited halt, so get the CPU schedulable again. + * - by calling the fast init routine for a slave, or + * - by returning if we're the master processor. + */ + if (cpup->cpu_number != master_cpu) { + i386_init_slave_fast(); + panic("init_slave_fast returned"); + } + } else + { /* * If no power management and a processor is taken off-line, * then invalidate the cache and halt it (it will not be able * to be brought back on-line without resetting the CPU). */ __asm__ volatile ("wbinvd"); - cpup->lcpu.halted = TRUE; - __asm__ volatile ( "wbinvd; hlt" ); + cpup->lcpu.state = LCPU_HALT; + pal_stop_cpu(FALSE); + + panic("back from Halt"); } + break; } } -/* - * Called to initialize the power management structures for the CPUs.
- */ void -pmCPUStateInit(void) +pmMarkAllCPUsOff(void) { - if (pmDispatch != NULL && pmDispatch->pmCPUStateInit != NULL) - (*pmDispatch->pmCPUStateInit)(); + if (pmInitDone + && pmDispatch != NULL + && pmDispatch->markAllCPUsOff != NULL) + (*pmDispatch->markAllCPUsOff)(); } static void pmInitComplete(void) { + if (earlyTopology + && pmDispatch != NULL + && pmDispatch->pmCPUStateInit != NULL) { + (*pmDispatch->pmCPUStateInit)(); + earlyTopology = FALSE; + } pmInitDone = 1; } -static x86_lcpu_t * +x86_lcpu_t * pmGetLogicalCPU(int cpu) { return(cpu_to_lcpu(cpu)); } -static x86_lcpu_t * +x86_lcpu_t * pmGetMyLogicalCPU(void) { cpu_data_t *cpup = current_cpu_datap(); @@ -398,6 +332,20 @@ pmGetMyCore(void) return(cpup->lcpu.core); } +static x86_die_t * +pmGetDie(int cpu) +{ + return(cpu_to_die(cpu)); +} + +static x86_die_t * +pmGetMyDie(void) +{ + cpu_data_t *cpup = current_cpu_datap(); + + return(cpup->lcpu.die); +} + static x86_pkg_t * pmGetPackage(int cpu) { @@ -409,7 +357,7 @@ pmGetMyPackage(void) { cpu_data_t *cpup = current_cpu_datap(); - return(cpup->lcpu.core->package); + return(cpup->lcpu.package); } static void @@ -425,11 +373,13 @@ pmLockCPUTopology(int lock) /* * Called to get the next deadline that has been set by the * power management code. + * Note: a return of 0 from AICPM and this routine signifies + * that no deadline is set. */ uint64_t pmCPUGetDeadline(cpu_data_t *cpu) { - uint64_t deadline = EndOfAllTime; + uint64_t deadline = 0; if (pmInitDone && pmDispatch != NULL @@ -443,10 +393,11 @@ pmCPUGetDeadline(cpu_data_t *cpu) * Called to determine if the supplied deadline or the power management * deadline is sooner. Returns which ever one is first. 
*/ + uint64_t pmCPUSetDeadline(cpu_data_t *cpu, uint64_t deadline) { - if (pmInitDone + if (pmInitDone && pmDispatch != NULL && pmDispatch->SetDeadline != NULL) deadline = (*pmDispatch->SetDeadline)(&cpu->lcpu, deadline); @@ -484,29 +435,58 @@ pmCPUExitIdle(cpu_data_t *cpu) return(do_ipi); } +kern_return_t +pmCPUExitHalt(int cpu) +{ + kern_return_t rc = KERN_INVALID_ARGUMENT; + + if (pmInitDone + && pmDispatch != NULL + && pmDispatch->exitHalt != NULL) + rc = pmDispatch->exitHalt(cpu_to_lcpu(cpu)); + + return(rc); +} + +kern_return_t +pmCPUExitHaltToOff(int cpu) +{ + kern_return_t rc = KERN_SUCCESS; + + if (pmInitDone + && pmDispatch != NULL + && pmDispatch->exitHaltToOff != NULL) + rc = pmDispatch->exitHaltToOff(cpu_to_lcpu(cpu)); + + return(rc); +} + /* - * Called when a CPU is being restarted after being powered off (as in S3). + * Called to initialize the power management structures for the CPUs. */ void -pmCPUMarkRunning(cpu_data_t *cpu) +pmCPUStateInit(void) { - if (pmInitDone - && pmDispatch != NULL - && pmDispatch->markCPURunning != NULL) - (*pmDispatch->markCPURunning)(&cpu->lcpu); + if (pmDispatch != NULL && pmDispatch->pmCPUStateInit != NULL) + (*pmDispatch->pmCPUStateInit)(); + else + earlyTopology = TRUE; } /* - * Called from the HPET interrupt handler to perform the - * necessary power management work. + * Called when a CPU is being restarted after being powered off (as in S3). */ void -pmHPETInterrupt(void) +pmCPUMarkRunning(cpu_data_t *cpu) { + cpu_data_t *cpup = current_cpu_datap(); + if (pmInitDone && pmDispatch != NULL - && pmDispatch->HPETInterrupt != NULL) - (*pmDispatch->HPETInterrupt)(); + && pmDispatch->markCPURunning != NULL) + (*pmDispatch->markCPURunning)(&cpu->lcpu); + else + cpup->lcpu.state = LCPU_RUN; } /* @@ -524,6 +504,30 @@ pmCPUControl(uint32_t cmd, void *datap) return(rc); } +/* + * Called to save the timer state used by power management prior + * to "sleeping". 
+ */ +void +pmTimerSave(void) +{ + if (pmDispatch != NULL + && pmDispatch->pmTimerStateSave != NULL) + (*pmDispatch->pmTimerStateSave)(); +} + +/* + * Called to restore the timer state used by power management after + * waking from "sleep". + */ +void +pmTimerRestore(void) +{ + if (pmDispatch != NULL + && pmDispatch->pmTimerStateRestore != NULL) + (*pmDispatch->pmTimerStateRestore)(); +} + /* * Set the worst-case time for the C4 to C2 transition. * No longer does anything. @@ -542,7 +546,8 @@ ml_get_maxsnoop(void) { uint64_t max_snoop = 0; - if (pmDispatch != NULL + if (pmInitDone + && pmDispatch != NULL && pmDispatch->getMaxSnoop != NULL) max_snoop = pmDispatch->getMaxSnoop(); @@ -555,7 +560,8 @@ ml_get_maxbusdelay(void) { uint64_t max_delay = 0; - if (pmDispatch != NULL + if (pmInitDone + && pmDispatch != NULL && pmDispatch->getMaxBusDelay != NULL) max_delay = pmDispatch->getMaxBusDelay(); @@ -563,12 +569,7 @@ ml_get_maxbusdelay(void) } /* - * Set the maximum delay time allowed for snoop on the bus. - * - * Note that this value will be compared to the amount of time that it takes - * to transition from a non-snooping power state (C4) to a snooping state (C2). - * If maxBusDelay is less than C4C2SnoopDelay, - * we will not enter the lowest power state. + * Advertise a memory access latency tolerance of "mdelay" ns */ void ml_set_maxbusdelay(uint32_t mdelay) @@ -576,10 +577,50 @@ ml_set_maxbusdelay(uint32_t mdelay) uint64_t maxdelay = mdelay; if (pmDispatch != NULL - && pmDispatch->setMaxBusDelay != NULL) + && pmDispatch->setMaxBusDelay != NULL) { + earlyMaxBusDelay = DELAY_UNSET; pmDispatch->setMaxBusDelay(maxdelay); - else - pmInitState.maxBusDelay = maxdelay; + } else + earlyMaxBusDelay = maxdelay; +} + +uint64_t +ml_get_maxintdelay(void) +{ + uint64_t max_delay = 0; + + if (pmDispatch != NULL + && pmDispatch->getMaxIntDelay != NULL) + max_delay = pmDispatch->getMaxIntDelay(); + + return(max_delay); +} + +/* + * Set the maximum delay allowed for an interrupt. 
+ */ +void +ml_set_maxintdelay(uint64_t mdelay) +{ + if (pmDispatch != NULL + && pmDispatch->setMaxIntDelay != NULL) { + earlyMaxIntDelay = DELAY_UNSET; + pmDispatch->setMaxIntDelay(mdelay); + } else + earlyMaxIntDelay = mdelay; +} + +boolean_t +ml_get_interrupt_prewake_applicable() +{ + boolean_t applicable = FALSE; + + if (pmInitDone + && pmDispatch != NULL + && pmDispatch->pmInterruptPrewakeApplicable != NULL) + applicable = pmDispatch->pmInterruptPrewakeApplicable(); + + return applicable; } /* @@ -602,15 +643,14 @@ pmSafeMode(x86_lcpu_t *lcpu, uint32_t flags) * We only look at the PAUSE and RESUME flags. The other flag(s) * will not make any sense without the KEXT, so just ignore them. * - * We set the halted flag in the LCPU structure to indicate - * that this CPU isn't to do anything. If it's the CPU we're - * currently running on, then spin until the halted flag is - * reset. + * We set the CPU's state to indicate that it's halted. If this + * is the CPU we're currently running on, then spin until the + * state becomes non-halted. */ if (flags & PM_SAFE_FL_PAUSE) { - lcpu->halted = TRUE; + lcpu->state = LCPU_PAUSE; if (lcpu == x86_lcpu()) { - while (lcpu->halted) + while (lcpu->state == LCPU_PAUSE) cpu_pause(); } } @@ -620,15 +660,183 @@ pmSafeMode(x86_lcpu_t *lcpu, uint32_t flags) * get it out of its spin loop.
*/ if (flags & PM_SAFE_FL_RESUME) { - lcpu->halted = FALSE; + lcpu->state = LCPU_RUN; } } } +static uint32_t saved_run_count = 0; + +void +machine_run_count(uint32_t count) +{ + if (pmDispatch != NULL + && pmDispatch->pmSetRunCount != NULL) + pmDispatch->pmSetRunCount(count); + else + saved_run_count = count; +} + +processor_t +machine_choose_processor(processor_set_t pset, + processor_t preferred) +{ + int startCPU; + int endCPU; + int preferredCPU; + int chosenCPU; + + if (!pmInitDone) + return(preferred); + + if (pset == NULL) { + startCPU = -1; + endCPU = -1; + } else { + startCPU = pset->cpu_set_low; + endCPU = pset->cpu_set_hi; + } + + if (preferred == NULL) + preferredCPU = -1; + else + preferredCPU = preferred->cpu_id; + + if (pmDispatch != NULL + && pmDispatch->pmChooseCPU != NULL) { + chosenCPU = pmDispatch->pmChooseCPU(startCPU, endCPU, preferredCPU); + + if (chosenCPU == -1) + return(NULL); + return(cpu_datap(chosenCPU)->cpu_processor); + } + + return(preferred); +} + +static int +pmThreadGetUrgency(uint64_t *rt_period, uint64_t *rt_deadline) +{ + int urgency; + uint64_t arg1, arg2; + + urgency = thread_get_urgency(current_processor()->next_thread, &arg1, &arg2); + + if (urgency == THREAD_URGENCY_REAL_TIME) { + if (rt_period != NULL) + *rt_period = arg1; + + if (rt_deadline != NULL) + *rt_deadline = arg2; + } + + return(urgency); +} + +#if DEBUG +uint32_t urgency_stats[64][THREAD_URGENCY_MAX]; +#endif + +#define URGENCY_NOTIFICATION_ASSERT_NS (5 * 1000 * 1000) +uint64_t urgency_notification_assert_abstime_threshold, urgency_notification_max_recorded; + +void +thread_tell_urgency(int urgency, + uint64_t rt_period, + uint64_t rt_deadline, + uint64_t sched_latency, + thread_t nthread) +{ + uint64_t urgency_notification_time_start, delta; + boolean_t urgency_assert = (urgency_notification_assert_abstime_threshold != 0); + assert(get_preemption_level() > 0 || ml_get_interrupts_enabled() == FALSE); +#if DEBUG + urgency_stats[cpu_number() % 64][urgency]++; 
+#endif + if (!pmInitDone + || pmDispatch == NULL + || pmDispatch->pmThreadTellUrgency == NULL) + return; + + SCHED_DEBUG_PLATFORM_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED,MACH_URGENCY) | DBG_FUNC_START, urgency, rt_period, rt_deadline, sched_latency, 0); + + if (__improbable((urgency_assert == TRUE))) + urgency_notification_time_start = mach_absolute_time(); + + current_cpu_datap()->cpu_nthread = nthread; + pmDispatch->pmThreadTellUrgency(urgency, rt_period, rt_deadline); + + if (__improbable((urgency_assert == TRUE))) { + delta = mach_absolute_time() - urgency_notification_time_start; + + if (__improbable(delta > urgency_notification_max_recorded)) { + /* This is not synchronized, but it doesn't matter + * if we (rarely) miss an event, as it is statistically + * unlikely that it will never recur. + */ + urgency_notification_max_recorded = delta; + + if (__improbable((delta > urgency_notification_assert_abstime_threshold) && !machine_timeout_suspended())) + panic("Urgency notification callout %p exceeded threshold, 0x%llx abstime units", pmDispatch->pmThreadTellUrgency, delta); + } + } + + SCHED_DEBUG_PLATFORM_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED,MACH_URGENCY) | DBG_FUNC_END, urgency, rt_period, rt_deadline, 0, 0); +} + +void +machine_thread_going_on_core(__unused thread_t new_thread, + __unused int urgency, + __unused uint64_t sched_latency, + __unused uint64_t dispatch_time) +{ +} + +void +machine_thread_going_off_core(__unused thread_t old_thread, __unused boolean_t thread_terminating, __unused uint64_t last_dispatch) +{ +} + +void +machine_max_runnable_latency(__unused uint64_t bg_max_latency, + __unused uint64_t default_max_latency, + __unused uint64_t realtime_max_latency) +{ +} + +void +machine_work_interval_notify(__unused thread_t thread, + __unused uint64_t work_interval_id, + __unused uint64_t start_abstime, + __unused uint64_t finish_abstime, + __unused uint64_t deadline_abstime, + __unused uint64_t next_start_abstime, + __unused 
uint16_t urgency, + __unused uint32_t flags) +{ +} + +void +active_rt_threads(boolean_t active) +{ + if (!pmInitDone + || pmDispatch == NULL + || pmDispatch->pmActiveRTThreads == NULL) + return; + + pmDispatch->pmActiveRTThreads(active); +} + +static uint32_t +pmGetSavedRunCount(void) +{ + return(saved_run_count); +} + /* * Returns the root of the package tree. */ -static x86_pkg_t * +x86_pkg_t * pmGetPkgRoot(void) { return(x86_pkgs); @@ -640,12 +848,60 @@ pmCPUGetHibernate(int cpu) return(cpu_datap(cpu)->cpu_hibernate); } -static processor_t +processor_t pmLCPUtoProcessor(int lcpu) { return(cpu_datap(lcpu)->cpu_processor); } +static void +pmReSyncDeadlines(int cpu) +{ + static boolean_t registered = FALSE; + + if (!registered) { + PM_interrupt_register(&timer_resync_deadlines); + registered = TRUE; + } + + if ((uint32_t)cpu == current_cpu_datap()->lcpu.cpu_num) + timer_resync_deadlines(); + else + cpu_PM_interrupt(cpu); +} + +static void +pmSendIPI(int cpu) +{ + lapic_send_ipi(cpu, LAPIC_PM_INTERRUPT); +} + +static void +pmGetNanotimeInfo(pm_rtc_nanotime_t *rtc_nanotime) +{ + /* + * Make sure that nanotime didn't change while we were reading it. + */ + do { + rtc_nanotime->generation = pal_rtc_nanotime_info.generation; /* must be first */ + rtc_nanotime->tsc_base = pal_rtc_nanotime_info.tsc_base; + rtc_nanotime->ns_base = pal_rtc_nanotime_info.ns_base; + rtc_nanotime->scale = pal_rtc_nanotime_info.scale; + rtc_nanotime->shift = pal_rtc_nanotime_info.shift; + } while(pal_rtc_nanotime_info.generation != 0 + && rtc_nanotime->generation != pal_rtc_nanotime_info.generation); +} + +uint32_t +pmTimerQueueMigrate(int target_cpu) +{ + /* Call the etimer code to do this. */ + return (target_cpu != cpu_number()) + ? timer_queue_migrate_cpu(target_cpu) + : 0; +} + + /* * Called by the power management kext to register itself and to get the * callbacks it might need into other kernel functions. 
This interface @@ -654,29 +910,57 @@ pmLCPUtoProcessor(int lcpu) */ void pmKextRegister(uint32_t version, pmDispatch_t *cpuFuncs, - pmCallBacks_t *callbacks) -{ - if (callbacks != NULL && version == PM_DISPATCH_VERSION) { - callbacks->InitState = &pmInitState; - callbacks->setRTCPop = setPop; - callbacks->resyncDeadlines = etimer_resync_deadlines; - callbacks->initComplete= pmInitComplete; - callbacks->GetLCPU = pmGetLogicalCPU; - callbacks->GetCore = pmGetCore; - callbacks->GetPackage = pmGetPackage; - callbacks->GetMyLCPU = pmGetMyLogicalCPU; - callbacks->GetMyCore = pmGetMyCore; - callbacks->GetMyPackage= pmGetMyPackage; - callbacks->CoresPerPkg = cpuid_info()->cpuid_cores_per_package; - callbacks->GetPkgRoot = pmGetPkgRoot; - callbacks->LockCPUTopology = pmLockCPUTopology; - callbacks->GetHibernate = pmCPUGetHibernate; - callbacks->LCPUtoProcessor = pmLCPUtoProcessor; - } + pmCallBacks_t *callbacks) +{ + if (callbacks != NULL && version == PM_DISPATCH_VERSION) { + callbacks->setRTCPop = setPop; + callbacks->resyncDeadlines = pmReSyncDeadlines; + callbacks->initComplete = pmInitComplete; + callbacks->GetLCPU = pmGetLogicalCPU; + callbacks->GetCore = pmGetCore; + callbacks->GetDie = pmGetDie; + callbacks->GetPackage = pmGetPackage; + callbacks->GetMyLCPU = pmGetMyLogicalCPU; + callbacks->GetMyCore = pmGetMyCore; + callbacks->GetMyDie = pmGetMyDie; + callbacks->GetMyPackage = pmGetMyPackage; + callbacks->GetPkgRoot = pmGetPkgRoot; + callbacks->LockCPUTopology = pmLockCPUTopology; + callbacks->GetHibernate = pmCPUGetHibernate; + callbacks->LCPUtoProcessor = pmLCPUtoProcessor; + callbacks->ThreadBind = thread_bind; + callbacks->GetSavedRunCount = pmGetSavedRunCount; + callbacks->GetNanotimeInfo = pmGetNanotimeInfo; + callbacks->ThreadGetUrgency = pmThreadGetUrgency; + callbacks->RTCClockAdjust = rtc_clock_adjust; + callbacks->timerQueueMigrate = pmTimerQueueMigrate; + callbacks->topoParms = &topoParms; + callbacks->pmSendIPI = pmSendIPI; + 
callbacks->InterruptPending = lapic_is_interrupt_pending; + callbacks->IsInterrupting = lapic_is_interrupting; + callbacks->InterruptStats = lapic_interrupt_counts; + callbacks->DisableApicTimer = lapic_disable_timer; + } else { + panic("Version mis-match between Kernel and CPU PM"); + } - if (cpuFuncs != NULL) { - pmDispatch = cpuFuncs; - } + if (cpuFuncs != NULL) { + if (pmDispatch) { + panic("Attempt to re-register power management interface--AICPM present in xcpm mode? %p->%p", pmDispatch, cpuFuncs); + } + + pmDispatch = cpuFuncs; + + if (earlyTopology + && pmDispatch->pmCPUStateInit != NULL) { + (*pmDispatch->pmCPUStateInit)(); + earlyTopology = FALSE; + } + + if (pmDispatch->pmIPIHandler != NULL) { + lapic_set_pm_func((i386_intr_func_t)pmDispatch->pmIPIHandler); + } + } } /* @@ -690,3 +974,16 @@ pmUnRegister(pmDispatch_t *cpuFuncs) } } +void machine_track_platform_idle(boolean_t entry) { + cpu_data_t *my_cpu = current_cpu_datap(); + + if (entry) { + (void)__sync_fetch_and_add(&my_cpu->lcpu.package->num_idle, 1); + } + else { + uint32_t nidle = __sync_fetch_and_sub(&my_cpu->lcpu.package->num_idle, 1); + if (nidle == topoParms.nLThreadsPerPackage) { + my_cpu->lcpu.package->package_idle_exits++; + } + } +}