/*
 * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * @APPLE_FREE_COPYRIGHT@
 */
/*
 *	File:		timer.c
 *	Purpose:	Routines for handling the machine independent timer.
 */

#include <mach/mach_types.h>

#include <kern/timer_queue.h>
#include <kern/timer_call.h>
#include <kern/clock.h>
#include <kern/thread.h>
#include <kern/processor.h>
#include <kern/macro_help.h>
#include <kern/spl.h>
#include <kern/pms.h>

#include <machine/commpage.h>
#include <machine/machine_routines.h>

#include <sys/kdebug.h>
#include <i386/cpu_data.h>
#include <i386/cpu_topology.h>
#include <i386/cpu_threads.h>

uint32_t spurious_timers;

/*
 * Event timer interrupt.
 *
 * XXX a drawback of this implementation is that events serviced earlier must not set deadlines
 *     that occur before the entire chain completes.
 *
 * XXX a better implementation would use a set of generic callouts and iterate over them
 */
void
timer_intr(int		user_mode,
	   uint64_t	rip)
{
	uint64_t	abstime;
	rtclock_timer_t	*mytimer;
	cpu_data_t	*pp;
	int64_t		latency;
	uint64_t	pmdeadline;
	boolean_t	timer_processed = FALSE;

	pp = current_cpu_datap();

	SCHED_STATS_TIMER_POP(current_processor());

	abstime = mach_absolute_time();		/* Get the time now */

	/* has a pending clock timer expired? */
	mytimer = &pp->rtclock_timer;		/* Point to the event timer */

	if ((timer_processed = ((mytimer->deadline <= abstime) ||
	    (abstime >= (mytimer->queue.earliest_soft_deadline))))) {
		/*
		 * Log interrupt service latency (-ve value expected by tool);
		 * a non-PM event is expected next.
		 * The requested deadline may be earlier than when it was set
		 * - use MAX to avoid reporting bogus latencies.
		 */
		latency = (int64_t) (abstime - MAX(mytimer->deadline,
						   mytimer->when_set));
		/* Log zero timer latencies when opportunistically processing
		 * coalesced timers.
		 */
		if (latency < 0) {
			TCOAL_DEBUG(0xEEEE0000, abstime, mytimer->queue.earliest_soft_deadline, abstime - mytimer->queue.earliest_soft_deadline, 0, 0);
			latency = 0;
		}

		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			DECR_TRAP_LATENCY | DBG_FUNC_NONE,
			-latency,
			((user_mode != 0) ? rip : VM_KERNEL_UNSLIDE(rip)),
			user_mode, 0, 0);

		mytimer->has_expired = TRUE;	/* Remember that we popped */
		mytimer->deadline = timer_queue_expire(&mytimer->queue, abstime);
		mytimer->has_expired = FALSE;

		/* Get the time again since we ran a bit */
		abstime = mach_absolute_time();
		mytimer->when_set = abstime;
	}

	/* is it time for a power management state change? */
	if ((pmdeadline = pmCPUGetDeadline(pp)) && (pmdeadline <= abstime)) {
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			DECR_PM_DEADLINE | DBG_FUNC_START,
			0, 0, 0, 0, 0);
		pmCPUDeadline(pp);
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			DECR_PM_DEADLINE | DBG_FUNC_END,
			0, 0, 0, 0, 0);
		timer_processed = TRUE;
	}

	/* schedule our next deadline */
	x86_lcpu()->rtcDeadline = EndOfAllTime;
	timer_resync_deadlines();

	if (__improbable(timer_processed == FALSE))
		spurious_timers++;
}
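/*
 * Note: the cached rtcDeadline is invalidated (set to EndOfAllTime) before
 * resyncing, presumably so that timer_resync_deadlines() reprograms the
 * hardware pop from scratch rather than comparing against a stale value.
 */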

/*
 * Set the clock deadline.
 */
void timer_set_deadline(uint64_t deadline)
{
	rtclock_timer_t	*mytimer;
	spl_t		s;
	cpu_data_t	*pp;

	s = splclock();				/* no interruptions */
	pp = current_cpu_datap();

	mytimer = &pp->rtclock_timer;		/* Point to the timer itself */
	mytimer->deadline = deadline;		/* Set new expiration time */
	mytimer->when_set = mach_absolute_time();

	timer_resync_deadlines();

	splx(s);
}

/*
 * Re-evaluate the outstanding deadlines and select the most proximate.
 *
 * Should be called at splclock.
 */
void
timer_resync_deadlines(void)
{
	uint64_t	deadline = EndOfAllTime;
	uint64_t	pmdeadline;
	rtclock_timer_t	*mytimer;
	spl_t		s = splclock();
	cpu_data_t	*pp;
	uint32_t	decr;

	pp = current_cpu_datap();
	if (!pp->cpu_running)
		/* There's really nothing to do if this processor is down */
		return;

	/*
	 * If we have a clock timer set, pick that.
	 */
	mytimer = &pp->rtclock_timer;
	if (!mytimer->has_expired &&
	    0 < mytimer->deadline && mytimer->deadline < EndOfAllTime)
		deadline = mytimer->deadline;

	/*
	 * If we have a power management deadline, see if that's earlier.
	 */
	pmdeadline = pmCPUGetDeadline(pp);
	if (0 < pmdeadline && pmdeadline < deadline)
		deadline = pmdeadline;

	/*
	 * Go and set the "pop" event.
	 */
	decr = (uint32_t) setPop(deadline);

	/* Record non-PM deadline for latency tool */
	if (decr != 0 && deadline != pmdeadline) {
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			DECR_SET_DEADLINE | DBG_FUNC_NONE,
			decr, 2,
			deadline,
			mytimer->queue.count, 0);
	}
	splx(s);
}

void
timer_queue_expire_local(
	__unused void		*arg)
{
	rtclock_timer_t	*mytimer;
	uint64_t	abstime;
	cpu_data_t	*pp;

	pp = current_cpu_datap();

	mytimer = &pp->rtclock_timer;
	abstime = mach_absolute_time();

	mytimer->has_expired = TRUE;
	mytimer->deadline = timer_queue_expire(&mytimer->queue, abstime);
	mytimer->has_expired = FALSE;
	mytimer->when_set = mach_absolute_time();

	timer_resync_deadlines();
}

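/*
 * As timer_queue_expire_local(), but passes TRUE to
 * timer_queue_expire_with_options(), presumably requesting a rescan pass
 * that also picks up entries whose soft deadlines have been skewed by
 * coalescing. Must be called with interrupts disabled (see the assert).
 */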
void
timer_queue_expire_rescan(
	__unused void		*arg)
{
	rtclock_timer_t	*mytimer;
	uint64_t	abstime;
	cpu_data_t	*pp;

	assert(ml_get_interrupts_enabled() == FALSE);
	pp = current_cpu_datap();

	mytimer = &pp->rtclock_timer;
	abstime = mach_absolute_time();

	mytimer->has_expired = TRUE;
	mytimer->deadline = timer_queue_expire_with_options(&mytimer->queue, abstime, TRUE);
	mytimer->has_expired = FALSE;
	mytimer->when_set = mach_absolute_time();

	timer_resync_deadlines();
}

/* N.B.: Max leeway values assume a 1GHz timebase */
timer_coalescing_priority_params_t tcoal_prio_params =
{
	/* Deadline scale values for each thread attribute */
	0, -5, 3, 3, 3,
	/* Maximum leeway in abstime for each thread attribute */
	0ULL, 100*NSEC_PER_MSEC, NSEC_PER_MSEC, NSEC_PER_MSEC, NSEC_PER_MSEC,
	/* Deadline scale values for each latency QoS tier */
	{3, 2, 1, -2, -15, -15},
	/* Maximum leeway in abstime for each latency QoS tier */
	{1*NSEC_PER_MSEC, 5*NSEC_PER_MSEC, 20*NSEC_PER_MSEC, 75*NSEC_PER_MSEC,
	 10*NSEC_PER_SEC, 10*NSEC_PER_SEC},
	/* Signifies that the tier requires rate-limiting */
	{FALSE, FALSE, FALSE, FALSE, TRUE, TRUE}
};
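
/*
 * Note on how these parameters are consumed (see timer_call_slop() below):
 * each scale value is a bit shift applied to the time remaining until the
 * deadline, and the corresponding max value caps the resulting leeway, so a
 * negative shift widens the coalescing window. For example, a timer in the
 * first latency QoS tier (scale 3, max 1 ms) that is due in 80 ms may be
 * deferred by min(80 ms >> 3, 1 ms) = 1 ms.
 */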
#define TIMER_RESORT_THRESHOLD_ABSTIME (50 * NSEC_PER_MSEC)

#if TCOAL_PRIO_STATS
int32_t nc_tcl, rt_tcl, bg_tcl, kt_tcl, fp_tcl, ts_tcl, qos_tcl;
#define TCOAL_PRIO_STAT(x) (x++)
#else
#define TCOAL_PRIO_STAT(x)
#endif

/* Select timer coalescing window based on per-task quality-of-service hints */
static boolean_t tcoal_qos_adjust(thread_t t, int32_t *tshift, uint64_t *tmax, boolean_t *pratelimited) {
	uint32_t latency_qos;
	boolean_t adjusted = FALSE;
	task_t ctask = t->task;

	if (ctask) {
		latency_qos = proc_get_effective_task_policy(ctask, TASK_POLICY_LATENCY_QOS);

		assert(latency_qos <= NUM_LATENCY_QOS_TIERS);

		if (latency_qos) {
			*tshift = tcoal_prio_params.latency_qos_scale[latency_qos - 1];
			*tmax = tcoal_prio_params.latency_qos_ns_max[latency_qos - 1];
			*pratelimited = tcoal_prio_params.latency_tier_rate_limited[latency_qos - 1];
			adjusted = TRUE;
		}
	}
	return adjusted;
}
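/*
 * Note: a latency_qos value of zero means "unspecified", so the tier arrays
 * above are indexed with latency_qos - 1; the assert bounds the value at
 * NUM_LATENCY_QOS_TIERS.
 */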

/* Adjust timer deadlines based on the priority of the thread and the
 * urgency value provided at timeout establishment. With this mechanism,
 * timers are no longer necessarily sorted in order of soft deadline
 * on a given timer queue, i.e. they may be differentially skewed.
 * In the current scheme, this could lead to fewer pending timers
 * processed than is technically possible when the HW deadline arrives.
 */
static void
timer_compute_leeway(thread_t cthread, int32_t urgency, int32_t *tshift, uint64_t *tmax, boolean_t *pratelimited) {
	int16_t tpri = cthread->sched_pri;

	if ((urgency & TIMER_CALL_USER_MASK) != 0) {
		if (tpri >= BASEPRI_RTQUEUES ||
		    urgency == TIMER_CALL_USER_CRITICAL) {
			*tshift = tcoal_prio_params.timer_coalesce_rt_shift;
			*tmax = tcoal_prio_params.timer_coalesce_rt_ns_max;
			TCOAL_PRIO_STAT(rt_tcl);
		} else if ((urgency == TIMER_CALL_USER_BACKGROUND) ||
		    proc_get_effective_thread_policy(cthread, TASK_POLICY_DARWIN_BG)) {
			/* Determine if timer should be subjected to a lower QoS */
			if (tcoal_qos_adjust(cthread, tshift, tmax, pratelimited)) {
				if (*tmax > tcoal_prio_params.timer_coalesce_bg_ns_max) {
					return;
				} else {
					*pratelimited = FALSE;
				}
			}
			*tshift = tcoal_prio_params.timer_coalesce_bg_shift;
			*tmax = tcoal_prio_params.timer_coalesce_bg_ns_max;
			TCOAL_PRIO_STAT(bg_tcl);
		} else if (tpri >= MINPRI_KERNEL) {
			*tshift = tcoal_prio_params.timer_coalesce_kt_shift;
			*tmax = tcoal_prio_params.timer_coalesce_kt_ns_max;
			TCOAL_PRIO_STAT(kt_tcl);
		} else if (cthread->sched_mode == TH_MODE_FIXED) {
			*tshift = tcoal_prio_params.timer_coalesce_fp_shift;
			*tmax = tcoal_prio_params.timer_coalesce_fp_ns_max;
			TCOAL_PRIO_STAT(fp_tcl);
		} else if (tcoal_qos_adjust(cthread, tshift, tmax, pratelimited)) {
			TCOAL_PRIO_STAT(qos_tcl);
		} else if (cthread->sched_mode == TH_MODE_TIMESHARE) {
			*tshift = tcoal_prio_params.timer_coalesce_ts_shift;
			*tmax = tcoal_prio_params.timer_coalesce_ts_ns_max;
			TCOAL_PRIO_STAT(ts_tcl);
		} else {
			TCOAL_PRIO_STAT(nc_tcl);
		}
	} else if (urgency == TIMER_CALL_SYS_BACKGROUND) {
		*tshift = tcoal_prio_params.timer_coalesce_bg_shift;
		*tmax = tcoal_prio_params.timer_coalesce_bg_ns_max;
		TCOAL_PRIO_STAT(bg_tcl);
	} else {
		*tshift = tcoal_prio_params.timer_coalesce_kt_shift;
		*tmax = tcoal_prio_params.timer_coalesce_kt_ns_max;
		TCOAL_PRIO_STAT(kt_tcl);
	}
}
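
/*
 * The classification order for user timers above is, roughly: realtime or
 * user-critical, then background (explicit urgency or DARWIN_BG policy),
 * then kernel-priority threads, fixed-priority threads, threads in a
 * latency-QoS-tagged task, timeshare threads, and finally no coalescing.
 * Kernel-originated timers use either the background or the kernel
 * parameters, depending on urgency.
 */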

int timer_user_idle_level;

uint64_t
timer_call_slop(uint64_t deadline, uint64_t now, uint32_t flags, thread_t cthread, boolean_t *pratelimited)
{
	int32_t tcs_shift = 0;
	uint64_t tcs_ns_max = 0;
	uint64_t adjval;
	uint32_t urgency = (flags & TIMER_CALL_URGENCY_MASK);

	if (mach_timer_coalescing_enabled &&
	    (deadline > now) && (urgency != TIMER_CALL_SYS_CRITICAL)) {
		timer_compute_leeway(cthread, urgency, &tcs_shift, &tcs_ns_max, pratelimited);

		if (tcs_shift >= 0)
			adjval = MIN((deadline - now) >> tcs_shift, tcs_ns_max);
		else
			adjval = MIN((deadline - now) << (-tcs_shift), tcs_ns_max);
		/* Apply adjustments derived from "user idle level" heuristic */
		adjval += (adjval * timer_user_idle_level) >> 7;
		return adjval;
	} else {
		return 0;
	}
}
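
/*
 * Illustrative (hypothetical) caller-side sketch; the variable names here
 * are for exposition only and do not appear elsewhere in this file:
 *
 *	boolean_t ratelimited = FALSE;
 *	uint64_t now = mach_absolute_time();
 *	uint64_t leeway = timer_call_slop(deadline, now, flags,
 *	    current_thread(), &ratelimited);
 *
 * The returned leeway is the amount by which the hard deadline may be
 * deferred past the requested deadline to allow coalescing with nearby
 * timers; TIMER_CALL_SYS_CRITICAL timers always get zero leeway.
 */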

boolean_t
timer_resort_threshold(uint64_t skew) {
	if (skew >= TIMER_RESORT_THRESHOLD_ABSTIME)
		return TRUE;
	else
		return FALSE;
}

int
ml_timer_get_user_idle_level(void) {
	return timer_user_idle_level;
}

kern_return_t ml_timer_set_user_idle_level(int ilevel) {
	boolean_t do_reeval = FALSE;

	if ((ilevel < 0) || (ilevel > 128))
		return KERN_INVALID_ARGUMENT;

	if (ilevel < timer_user_idle_level) {
		do_reeval = TRUE;
	}

	timer_user_idle_level = ilevel;

	if (do_reeval)
		ml_timer_evaluate();

	return KERN_SUCCESS;
}
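
/*
 * The user idle level ranges from 0 to 128 and scales the coalescing leeway
 * in timer_call_slop(): adjval += (adjval * level) >> 7, so level 128
 * roughly doubles the window. Lowering the level triggers
 * ml_timer_evaluate(), presumably so that timers coalesced under the
 * previous, more permissive setting are re-examined promptly.
 */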

/*
 * Return the local timer queue for a running processor;
 * otherwise return the boot processor's timer queue.
 */
mpqueue_head_t *
timer_queue_assign(
	uint64_t	deadline)
{
	cpu_data_t	*cdp = current_cpu_datap();
	mpqueue_head_t	*queue;

	if (cdp->cpu_running) {
		queue = &cdp->rtclock_timer.queue;

		if (deadline < cdp->rtclock_timer.deadline)
			timer_set_deadline(deadline);
	}
	else
		queue = &cpu_datap(master_cpu)->rtclock_timer.queue;

	return (queue);
}

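/*
 * Presumably called when a timer is removed from a queue: if the queue is
 * this CPU's local queue and the cancelled deadline was earlier than the
 * new earliest deadline, the programmed pop is pushed out accordingly.
 */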
void
timer_queue_cancel(
	mpqueue_head_t	*queue,
	uint64_t	deadline,
	uint64_t	new_deadline)
{
	if (queue == &current_cpu_datap()->rtclock_timer.queue) {
		if (deadline < new_deadline)
			timer_set_deadline(new_deadline);
	}
}

/*
 * timer_queue_migrate_cpu() is called from the Power-Management kext
 * when a logical processor goes idle (in a deep C-state) with a distant
 * deadline so that its timer queue can be moved to another processor.
 * This target processor should be the least idle (most busy) --
 * currently this is the primary processor for the calling thread's package.
 * Locking restrictions demand that the target cpu must be the boot cpu.
 */
uint32_t
timer_queue_migrate_cpu(int target_cpu)
{
	cpu_data_t	*target_cdp = cpu_datap(target_cpu);
	cpu_data_t	*cdp = current_cpu_datap();
	int		ntimers_moved;

	assert(!ml_get_interrupts_enabled());
	assert(target_cpu != cdp->cpu_number);
	assert(target_cpu == master_cpu);

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		DECR_TIMER_MIGRATE | DBG_FUNC_START,
		target_cpu,
		cdp->rtclock_timer.deadline, (cdp->rtclock_timer.deadline >> 32),
		0, 0);

	/*
	 * Move timer requests from the local queue to the target processor's.
	 * The return value is the number of requests moved. If this is 0,
	 * it indicates that the first (i.e. earliest) timer is earlier than
	 * the earliest for the target processor. Since this would force a
	 * resync, the move of this and all later requests is aborted.
	 */
	ntimers_moved = timer_queue_migrate(&cdp->rtclock_timer.queue,
					    &target_cdp->rtclock_timer.queue);

	/*
	 * Assuming we moved stuff, clear the local deadline.
	 */
	if (ntimers_moved > 0) {
		cdp->rtclock_timer.deadline = EndOfAllTime;
		setPop(EndOfAllTime);
	}

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		DECR_TIMER_MIGRATE | DBG_FUNC_END,
		target_cpu, ntimers_moved, 0, 0, 0);

	return ntimers_moved;
}

mpqueue_head_t *
timer_queue_cpu(int cpu)
{
	return &cpu_datap(cpu)->rtclock_timer.queue;
}

void
timer_call_cpu(int cpu, void (*fn)(void *), void *arg)
{
	mp_cpus_call(cpu_to_cpumask(cpu), SYNC, fn, arg);
}

void
timer_call_nosync_cpu(int cpu, void (*fn)(void *), void *arg)
{
	/* XXX Needs error checking and retry */
	mp_cpus_call(cpu_to_cpumask(cpu), NOSYNC, fn, arg);
}