1 | /* | |
2 | * Copyright (c) 1993-2008 Apple Inc. All rights reserved. | |
3 | * | |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ | |
5 | * | |
6 | * This file contains Original Code and/or Modifications of Original Code | |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. The rights granted to you under the License | |
10 | * may not be used to create, or enable the creation or redistribution of, | |
11 | * unlawful or unlicensed copies of an Apple operating system, or to | |
12 | * circumvent, violate, or enable the circumvention or violation of, any | |
13 | * terms of an Apple operating system software license agreement. | |
14 | * | |
15 | * Please obtain a copy of the License at | |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. | |
17 | * | |
18 | * The Original Code and all software distributed under the License are | |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. | |
23 | * Please see the License for the specific language governing rights and | |
24 | * limitations under the License. | |
25 | * | |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ | |
27 | */ | |
28 | /* | |
29 | * Timer interrupt callout module. | |
30 | */ | |
31 | ||
32 | #include <mach/mach_types.h> | |
33 | ||
34 | #include <kern/clock.h> | |
35 | #include <kern/smp.h> | |
36 | #include <kern/processor.h> | |
37 | #include <kern/timer_call.h> | |
38 | #include <kern/timer_queue.h> | |
39 | #include <kern/call_entry.h> | |
40 | #include <kern/thread.h> | |
41 | #include <kern/policy_internal.h> | |
42 | ||
43 | #include <sys/kdebug.h> | |
44 | ||
45 | #if CONFIG_DTRACE | |
46 | #include <mach/sdt.h> | |
47 | #endif | |
48 | ||
49 | ||
50 | #if DEBUG | |
51 | #define TIMER_ASSERT 1 | |
52 | #endif | |
53 | ||
54 | //#define TIMER_ASSERT 1 | |
55 | //#define TIMER_DBG 1 | |
56 | ||
57 | #if TIMER_DBG | |
58 | #define DBG(x...) kprintf("DBG: " x); | |
59 | #else | |
60 | #define DBG(x...) | |
61 | #endif | |
62 | ||
63 | #if TIMER_TRACE | |
64 | #define TIMER_KDEBUG_TRACE KERNEL_DEBUG_CONSTANT_IST | |
65 | #else | |
66 | #define TIMER_KDEBUG_TRACE(x...) | |
67 | #endif | |
68 | ||
69 | ||
70 | lck_grp_t timer_call_lck_grp; | |
71 | lck_attr_t timer_call_lck_attr; | |
72 | lck_grp_attr_t timer_call_lck_grp_attr; | |
73 | ||
74 | lck_grp_t timer_longterm_lck_grp; | |
75 | lck_attr_t timer_longterm_lck_attr; | |
76 | lck_grp_attr_t timer_longterm_lck_grp_attr; | |
77 | ||
78 | /* Timer queue lock must be acquired with interrupts disabled (under splclock()) */ | |
79 | #if __SMP__ | |
80 | #define timer_queue_lock_spin(queue) \ | |
81 | lck_mtx_lock_spin_always(&queue->lock_data) | |
82 | ||
83 | #define timer_queue_unlock(queue) \ | |
84 | lck_mtx_unlock_always(&queue->lock_data) | |
85 | #else | |
86 | #define timer_queue_lock_spin(queue) (void)1 | |
87 | #define timer_queue_unlock(queue) (void)1 | |
88 | #endif | |
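/*
 * Illustrative sketch (not part of the original source): the canonical way
 * to honor the requirement above is to raise to splclock() before taking a
 * timer queue lock, so that interrupts stay masked for the whole critical
 * section.  The queue pointer and the work done under the lock are
 * placeholders.
 *
 *	spl_t s = splclock();
 *	timer_queue_lock_spin(queue);
 *	... examine or modify the queue ...
 *	timer_queue_unlock(queue);
 *	splx(s);
 */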
89 | ||
90 | #define QUEUE(x) ((queue_t)(x)) | |
91 | #define MPQUEUE(x) ((mpqueue_head_t *)(x)) | |
92 | #define TIMER_CALL(x) ((timer_call_t)(x)) | |
93 | #define TCE(x) (&(x->call_entry)) | |
94 | /* | |
95 | * The longterm timer object is a global structure holding all timers | |
96 | * beyond the short-term, local timer queue threshold. The boot processor | |
97 | * is responsible for moving each timer to its local timer queue | |
98 | * if and when that timer becomes due within the threshold. | |
99 | */ | |
100 | ||
101 | /* Sentinel for "no time set": */ | |
102 | #define TIMER_LONGTERM_NONE EndOfAllTime | |
103 | /* The default threshold is the delta above which a timer is "long-term" */ | |
104 | #if defined(__x86_64__) | |
105 | #define TIMER_LONGTERM_THRESHOLD (1ULL * NSEC_PER_SEC) /* 1 sec */ | |
106 | #else | |
107 | #define TIMER_LONGTERM_THRESHOLD TIMER_LONGTERM_NONE /* disabled */ | |
108 | #endif | |
109 | ||
110 | /* | |
111 | * The scan_limit throttles processing of the longterm queue. | |
112 | * If the scan time exceeds this limit, we terminate, unlock | |
113 | * and defer for scan_interval. This prevents unbounded holding of | |
114 | * timer queue locks with interrupts masked. | |
115 | */ | |
116 | #define TIMER_LONGTERM_SCAN_LIMIT (100ULL * NSEC_PER_USEC) /* 100 us */ | |
117 | #define TIMER_LONGTERM_SCAN_INTERVAL (100ULL * NSEC_PER_USEC) /* 100 us */ | |
118 | /* Sentinel for "scan limit exceeded": */ | |
119 | #define TIMER_LONGTERM_SCAN_AGAIN 0 | |
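/*
 * Illustrative sketch (not part of the original source): a callout is
 * treated as longterm by timer_longterm_enqueue_unlocked() below only when
 * it is not CPU-local, the longterm mechanism is enabled, and its deadline
 * lies beyond the threshold interval from now:
 *
 *	boolean_t is_longterm =
 *	    (callout_flags & TIMER_CALL_LOCAL) == 0 &&
 *	    tlp->threshold.interval != TIMER_LONGTERM_NONE &&
 *	    deadline > now + tlp->threshold.interval;
 */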
120 | ||
121 | typedef struct { | |
122 | uint64_t interval; /* longterm timer interval */ | |
123 | uint64_t margin; /* fudge factor (10% of interval) */ | |
124 | uint64_t deadline; /* first/soonest longterm deadline */ | |
125 | uint64_t preempted; /* sooner timer has pre-empted */ | |
126 | timer_call_t call; /* first/soonest longterm timer call */ | |
127 | uint64_t deadline_set; /* next timer set */ | |
128 | timer_call_data_t timer; /* timer used by threshold management */ | |
129 | /* Stats: */ | |
130 | uint64_t scans; /* num threshold timer scans */ | |
131 | uint64_t preempts; /* num threshold reductions */ | |
132 | uint64_t latency; /* average threshold latency */ | |
133 | uint64_t latency_min; /* minimum threshold latency */ | |
134 | uint64_t latency_max; /* maximum threshold latency */ | |
135 | } threshold_t; | |
136 | ||
137 | typedef struct { | |
138 | mpqueue_head_t queue; /* longterm timer list */ | |
139 | uint64_t enqueues; /* num timers queued */ | |
140 | uint64_t dequeues; /* num timers dequeued */ | |
141 | uint64_t escalates; /* num timers becoming shortterm */ | |
142 | uint64_t scan_time; /* last time the list was scanned */ | |
143 | threshold_t threshold; /* longterm timer threshold */ | |
144 | uint64_t scan_limit; /* maximum scan time */ | |
145 | uint64_t scan_interval; /* interval between LT "escalation" scans */ | |
146 | uint64_t scan_pauses; /* num scans exceeding time limit */ | |
147 | } timer_longterm_t; | |
148 | ||
149 | timer_longterm_t timer_longterm = { | |
150 | .scan_limit = TIMER_LONGTERM_SCAN_LIMIT, | |
151 | .scan_interval = TIMER_LONGTERM_SCAN_INTERVAL, | |
152 | }; | |
153 | ||
154 | static mpqueue_head_t *timer_longterm_queue = NULL; | |
155 | ||
156 | static void timer_longterm_init(void); | |
157 | static void timer_longterm_callout( | |
158 | timer_call_param_t p0, | |
159 | timer_call_param_t p1); | |
160 | extern void timer_longterm_scan( | |
161 | timer_longterm_t *tlp, | |
162 | uint64_t now); | |
163 | static void timer_longterm_update( | |
164 | timer_longterm_t *tlp); | |
165 | static void timer_longterm_update_locked( | |
166 | timer_longterm_t *tlp); | |
167 | static mpqueue_head_t * timer_longterm_enqueue_unlocked( | |
168 | timer_call_t call, | |
169 | uint64_t now, | |
170 | uint64_t deadline, | |
171 | mpqueue_head_t ** old_queue, | |
172 | uint64_t soft_deadline, | |
173 | uint64_t ttd, | |
174 | timer_call_param_t param1, | |
175 | uint32_t callout_flags); | |
176 | static void timer_longterm_dequeued_locked( | |
177 | timer_call_t call); | |
178 | ||
179 | uint64_t past_deadline_timers; | |
180 | uint64_t past_deadline_deltas; | |
181 | uint64_t past_deadline_longest; | |
182 | uint64_t past_deadline_shortest = ~0ULL; | |
183 | enum {PAST_DEADLINE_TIMER_ADJUSTMENT_NS = 10 * 1000}; | |
184 | ||
185 | uint64_t past_deadline_timer_adjustment; | |
186 | ||
187 | static boolean_t timer_call_enter_internal(timer_call_t call, timer_call_param_t param1, uint64_t deadline, uint64_t leeway, uint32_t flags, boolean_t ratelimited); | |
188 | boolean_t mach_timer_coalescing_enabled = TRUE; | |
189 | ||
190 | mpqueue_head_t *timer_call_enqueue_deadline_unlocked( | |
191 | timer_call_t call, | |
192 | mpqueue_head_t *queue, | |
193 | uint64_t deadline, | |
194 | uint64_t soft_deadline, | |
195 | uint64_t ttd, | |
196 | timer_call_param_t param1, | |
197 | uint32_t flags); | |
198 | ||
199 | mpqueue_head_t *timer_call_dequeue_unlocked( | |
200 | timer_call_t call); | |
201 | ||
202 | timer_coalescing_priority_params_t tcoal_prio_params; | |
203 | ||
204 | #if TCOAL_PRIO_STATS | |
205 | int32_t nc_tcl, rt_tcl, bg_tcl, kt_tcl, fp_tcl, ts_tcl, qos_tcl; | |
206 | #define TCOAL_PRIO_STAT(x) (x++) | |
207 | #else | |
208 | #define TCOAL_PRIO_STAT(x) | |
209 | #endif | |
210 | ||
211 | static void | |
212 | timer_call_init_abstime(void) | |
213 | { | |
214 | int i; | |
215 | uint64_t result; | |
216 | timer_coalescing_priority_params_ns_t * tcoal_prio_params_init = timer_call_get_priority_params(); | |
217 | nanoseconds_to_absolutetime(PAST_DEADLINE_TIMER_ADJUSTMENT_NS, &past_deadline_timer_adjustment); | |
218 | nanoseconds_to_absolutetime(tcoal_prio_params_init->idle_entry_timer_processing_hdeadline_threshold_ns, &result); | |
219 | tcoal_prio_params.idle_entry_timer_processing_hdeadline_threshold_abstime = (uint32_t)result; | |
220 | nanoseconds_to_absolutetime(tcoal_prio_params_init->interrupt_timer_coalescing_ilat_threshold_ns, &result); | |
221 | tcoal_prio_params.interrupt_timer_coalescing_ilat_threshold_abstime = (uint32_t)result; | |
222 | nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_resort_threshold_ns, &result); | |
223 | tcoal_prio_params.timer_resort_threshold_abstime = (uint32_t)result; | |
224 | tcoal_prio_params.timer_coalesce_rt_shift = tcoal_prio_params_init->timer_coalesce_rt_shift; | |
225 | tcoal_prio_params.timer_coalesce_bg_shift = tcoal_prio_params_init->timer_coalesce_bg_shift; | |
226 | tcoal_prio_params.timer_coalesce_kt_shift = tcoal_prio_params_init->timer_coalesce_kt_shift; | |
227 | tcoal_prio_params.timer_coalesce_fp_shift = tcoal_prio_params_init->timer_coalesce_fp_shift; | |
228 | tcoal_prio_params.timer_coalesce_ts_shift = tcoal_prio_params_init->timer_coalesce_ts_shift; | |
229 | ||
230 | nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_rt_ns_max, | |
231 | &tcoal_prio_params.timer_coalesce_rt_abstime_max); | |
232 | nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_bg_ns_max, | |
233 | &tcoal_prio_params.timer_coalesce_bg_abstime_max); | |
234 | nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_kt_ns_max, | |
235 | &tcoal_prio_params.timer_coalesce_kt_abstime_max); | |
236 | nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_fp_ns_max, | |
237 | &tcoal_prio_params.timer_coalesce_fp_abstime_max); | |
238 | nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_ts_ns_max, | |
239 | &tcoal_prio_params.timer_coalesce_ts_abstime_max); | |
240 | ||
241 | for (i = 0; i < NUM_LATENCY_QOS_TIERS; i++) { | |
242 | tcoal_prio_params.latency_qos_scale[i] = tcoal_prio_params_init->latency_qos_scale[i]; | |
243 | nanoseconds_to_absolutetime(tcoal_prio_params_init->latency_qos_ns_max[i], | |
244 | &tcoal_prio_params.latency_qos_abstime_max[i]); | |
245 | tcoal_prio_params.latency_tier_rate_limited[i] = tcoal_prio_params_init->latency_tier_rate_limited[i]; | |
246 | } | |
247 | } | |
248 | ||
249 | ||
250 | void | |
251 | timer_call_init(void) | |
252 | { | |
253 | lck_attr_setdefault(&timer_call_lck_attr); | |
254 | lck_grp_attr_setdefault(&timer_call_lck_grp_attr); | |
255 | lck_grp_init(&timer_call_lck_grp, "timer_call", &timer_call_lck_grp_attr); | |
256 | ||
257 | timer_longterm_init(); | |
258 | timer_call_init_abstime(); | |
259 | } | |
260 | ||
261 | ||
262 | void | |
263 | timer_call_queue_init(mpqueue_head_t *queue) | |
264 | { | |
265 | DBG("timer_call_queue_init(%p)\n", queue); | |
266 | mpqueue_init(queue, &timer_call_lck_grp, &timer_call_lck_attr); | |
267 | } | |
268 | ||
269 | ||
270 | void | |
271 | timer_call_setup( | |
272 | timer_call_t call, | |
273 | timer_call_func_t func, | |
274 | timer_call_param_t param0) | |
275 | { | |
276 | DBG("timer_call_setup(%p,%p,%p)\n", call, func, param0); | |
277 | call_entry_setup(TCE(call), func, param0); | |
278 | simple_lock_init(&(call)->lock, 0); | |
279 | call->async_dequeue = FALSE; | |
280 | } | |
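/*
 * Illustrative sketch (not part of the original source): a typical client
 * statically allocates a timer_call_data_t, initializes it once with
 * timer_call_setup(), then arms and cancels it as needed.  The callout
 * function my_callout and the 10 ms interval are hypothetical.
 *
 *	static timer_call_data_t my_timer;
 *
 *	static void
 *	my_callout(timer_call_param_t p0, timer_call_param_t p1)
 *	{
 *		... invoked when the deadline expires ...
 *	}
 *
 *	timer_call_setup(&my_timer, my_callout, NULL);
 *
 *	uint64_t deadline;
 *	clock_interval_to_deadline(10, NSEC_PER_MSEC, &deadline);
 *	timer_call_enter1(&my_timer, NULL, deadline, TIMER_CALL_SYS_NORMAL);
 *	...
 *	timer_call_cancel(&my_timer);
 */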
281 | #if TIMER_ASSERT | |
282 | static __inline__ mpqueue_head_t * | |
283 | timer_call_entry_dequeue( | |
284 | timer_call_t entry) | |
285 | { | |
286 | mpqueue_head_t *old_queue = MPQUEUE(TCE(entry)->queue); | |
287 | ||
288 | if (!hw_lock_held((hw_lock_t)&entry->lock)) { | |
289 | panic("_call_entry_dequeue() " | |
290 | "entry %p is not locked\n", entry); | |
291 | } | |
292 | /* | |
293 | * XXX The queue lock is actually a mutex in spin mode | |
294 | * but there's no way to test for it being held | |
295 | * so we pretend it's a spinlock! | |
296 | */ | |
297 | if (!hw_lock_held((hw_lock_t)&old_queue->lock_data)) { | |
298 | panic("_call_entry_dequeue() " | |
299 | "queue %p is not locked\n", old_queue); | |
300 | } | |
301 | ||
302 | call_entry_dequeue(TCE(entry)); | |
303 | old_queue->count--; | |
304 | ||
305 | return old_queue; | |
306 | } | |
307 | ||
308 | static __inline__ mpqueue_head_t * | |
309 | timer_call_entry_enqueue_deadline( | |
310 | timer_call_t entry, | |
311 | mpqueue_head_t *queue, | |
312 | uint64_t deadline) | |
313 | { | |
314 | mpqueue_head_t *old_queue = MPQUEUE(TCE(entry)->queue); | |
315 | ||
316 | if (!hw_lock_held((hw_lock_t)&entry->lock)) { | |
317 | panic("_call_entry_enqueue_deadline() " | |
318 | "entry %p is not locked\n", entry); | |
319 | } | |
320 | /* XXX More lock pretense: */ | |
321 | if (!hw_lock_held((hw_lock_t)&queue->lock_data)) { | |
322 | panic("_call_entry_enqueue_deadline() " | |
323 | "queue %p is not locked\n", queue); | |
324 | } | |
325 | if (old_queue != NULL && old_queue != queue) { | |
326 | panic("_call_entry_enqueue_deadline() " | |
327 | "old_queue %p != queue", old_queue); | |
328 | } | |
329 | ||
330 | call_entry_enqueue_deadline(TCE(entry), QUEUE(queue), deadline); | |
331 | ||
332 | /* For efficiency, track the earliest soft deadline on the queue, so that | |
333 | * fuzzy decisions can be made without lock acquisitions. | |
334 | */ | |
335 | timer_call_t thead = (timer_call_t)queue_first(&queue->head); | |
336 | ||
337 | queue->earliest_soft_deadline = thead->flags & TIMER_CALL_RATELIMITED ? TCE(thead)->deadline : thead->soft_deadline; | |
338 | ||
339 | if (old_queue) { | |
340 | old_queue->count--; | |
341 | } | |
342 | queue->count++; | |
343 | ||
344 | return old_queue; | |
345 | } | |
346 | ||
347 | #else | |
348 | ||
349 | static __inline__ mpqueue_head_t * | |
350 | timer_call_entry_dequeue( | |
351 | timer_call_t entry) | |
352 | { | |
353 | mpqueue_head_t *old_queue = MPQUEUE(TCE(entry)->queue); | |
354 | ||
355 | call_entry_dequeue(TCE(entry)); | |
356 | old_queue->count--; | |
357 | ||
358 | return old_queue; | |
359 | } | |
360 | ||
361 | static __inline__ mpqueue_head_t * | |
362 | timer_call_entry_enqueue_deadline( | |
363 | timer_call_t entry, | |
364 | mpqueue_head_t *queue, | |
365 | uint64_t deadline) | |
366 | { | |
367 | mpqueue_head_t *old_queue = MPQUEUE(TCE(entry)->queue); | |
368 | ||
369 | call_entry_enqueue_deadline(TCE(entry), QUEUE(queue), deadline); | |
370 | ||
371 | /* For efficiency, track the earliest soft deadline on the queue, | |
372 | * so that fuzzy decisions can be made without lock acquisitions. | |
373 | */ | |
374 | ||
375 | timer_call_t thead = (timer_call_t)queue_first(&queue->head); | |
376 | queue->earliest_soft_deadline = thead->flags & TIMER_CALL_RATELIMITED ? TCE(thead)->deadline : thead->soft_deadline; | |
377 | ||
378 | if (old_queue) { | |
379 | old_queue->count--; | |
380 | } | |
381 | queue->count++; | |
382 | ||
383 | return old_queue; | |
384 | } | |
385 | ||
386 | #endif | |
387 | ||
388 | static __inline__ void | |
389 | timer_call_entry_enqueue_tail( | |
390 | timer_call_t entry, | |
391 | mpqueue_head_t *queue) | |
392 | { | |
393 | call_entry_enqueue_tail(TCE(entry), QUEUE(queue)); | |
394 | queue->count++; | |
395 | return; | |
396 | } | |
397 | ||
398 | /* | |
399 | * Remove timer entry from its queue but don't change the queue pointer | |
400 | * and set the async_dequeue flag. This is locking case 2b. | |
401 | */ | |
402 | static __inline__ void | |
403 | timer_call_entry_dequeue_async( | |
404 | timer_call_t entry) | |
405 | { | |
406 | mpqueue_head_t *old_queue = MPQUEUE(TCE(entry)->queue); | |
407 | if (old_queue) { | |
408 | old_queue->count--; | |
409 | (void) remque(qe(entry)); | |
410 | entry->async_dequeue = TRUE; | |
411 | } | |
412 | return; | |
413 | } | |
414 | ||
415 | #if TIMER_ASSERT | |
416 | unsigned timer_call_enqueue_deadline_unlocked_async1; | |
417 | unsigned timer_call_enqueue_deadline_unlocked_async2; | |
418 | #endif | |
419 | /* | |
420 | * Assumes call_entry and queues unlocked, interrupts disabled. | |
421 | */ | |
422 | __inline__ mpqueue_head_t * | |
423 | timer_call_enqueue_deadline_unlocked( | |
424 | timer_call_t call, | |
425 | mpqueue_head_t *queue, | |
426 | uint64_t deadline, | |
427 | uint64_t soft_deadline, | |
428 | uint64_t ttd, | |
429 | timer_call_param_t param1, | |
430 | uint32_t callout_flags) | |
431 | { | |
432 | call_entry_t entry = TCE(call); | |
433 | mpqueue_head_t *old_queue; | |
434 | ||
435 | DBG("timer_call_enqueue_deadline_unlocked(%p,%p,)\n", call, queue); | |
436 | ||
437 | simple_lock(&call->lock, LCK_GRP_NULL); | |
438 | ||
439 | old_queue = MPQUEUE(entry->queue); | |
440 | ||
441 | if (old_queue != NULL) { | |
442 | timer_queue_lock_spin(old_queue); | |
443 | if (call->async_dequeue) { | |
444 | /* collision (1c): timer already dequeued, clear flag */ | |
445 | #if TIMER_ASSERT | |
446 | TIMER_KDEBUG_TRACE(KDEBUG_TRACE, | |
447 | DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE, | |
448 | VM_KERNEL_UNSLIDE_OR_PERM(call), | |
449 | call->async_dequeue, | |
450 | VM_KERNEL_UNSLIDE_OR_PERM(TCE(call)->queue), | |
451 | 0x1c, 0); | |
452 | timer_call_enqueue_deadline_unlocked_async1++; | |
453 | #endif | |
454 | call->async_dequeue = FALSE; | |
455 | entry->queue = NULL; | |
456 | } else if (old_queue != queue) { | |
457 | timer_call_entry_dequeue(call); | |
458 | #if TIMER_ASSERT | |
459 | timer_call_enqueue_deadline_unlocked_async2++; | |
460 | #endif | |
461 | } | |
462 | if (old_queue == timer_longterm_queue) { | |
463 | timer_longterm_dequeued_locked(call); | |
464 | } | |
465 | if (old_queue != queue) { | |
466 | timer_queue_unlock(old_queue); | |
467 | timer_queue_lock_spin(queue); | |
468 | } | |
469 | } else { | |
470 | timer_queue_lock_spin(queue); | |
471 | } | |
472 | ||
473 | call->soft_deadline = soft_deadline; | |
474 | call->flags = callout_flags; | |
475 | TCE(call)->param1 = param1; | |
476 | call->ttd = ttd; | |
477 | ||
478 | timer_call_entry_enqueue_deadline(call, queue, deadline); | |
479 | timer_queue_unlock(queue); | |
480 | simple_unlock(&call->lock); | |
481 | ||
482 | return old_queue; | |
483 | } | |
484 | ||
485 | #if TIMER_ASSERT | |
486 | unsigned timer_call_dequeue_unlocked_async1; | |
487 | unsigned timer_call_dequeue_unlocked_async2; | |
488 | #endif | |
489 | mpqueue_head_t * | |
490 | timer_call_dequeue_unlocked( | |
491 | timer_call_t call) | |
492 | { | |
493 | call_entry_t entry = TCE(call); | |
494 | mpqueue_head_t *old_queue; | |
495 | ||
496 | DBG("timer_call_dequeue_unlocked(%p)\n", call); | |
497 | ||
498 | simple_lock(&call->lock, LCK_GRP_NULL); | |
499 | old_queue = MPQUEUE(entry->queue); | |
500 | #if TIMER_ASSERT | |
501 | TIMER_KDEBUG_TRACE(KDEBUG_TRACE, | |
502 | DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE, | |
503 | VM_KERNEL_UNSLIDE_OR_PERM(call), | |
504 | call->async_dequeue, | |
505 | VM_KERNEL_UNSLIDE_OR_PERM(TCE(call)->queue), | |
506 | 0, 0); | |
507 | #endif | |
508 | if (old_queue != NULL) { | |
509 | timer_queue_lock_spin(old_queue); | |
510 | if (call->async_dequeue) { | |
511 | /* collision (1c): timer already dequeued, clear flag */ | |
512 | #if TIMER_ASSERT | |
513 | TIMER_KDEBUG_TRACE(KDEBUG_TRACE, | |
514 | DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE, | |
515 | VM_KERNEL_UNSLIDE_OR_PERM(call), | |
516 | call->async_dequeue, | |
517 | VM_KERNEL_UNSLIDE_OR_PERM(TCE(call)->queue), | |
518 | 0x1c, 0); | |
519 | timer_call_dequeue_unlocked_async1++; | |
520 | #endif | |
521 | call->async_dequeue = FALSE; | |
522 | entry->queue = NULL; | |
523 | } else { | |
524 | timer_call_entry_dequeue(call); | |
525 | } | |
526 | if (old_queue == timer_longterm_queue) { | |
527 | timer_longterm_dequeued_locked(call); | |
528 | } | |
529 | timer_queue_unlock(old_queue); | |
530 | } | |
531 | simple_unlock(&call->lock); | |
532 | return old_queue; | |
533 | } | |
534 | ||
535 | static uint64_t | |
536 | past_deadline_timer_handle(uint64_t deadline, uint64_t ctime) | |
537 | { | |
538 | uint64_t delta = (ctime - deadline); | |
539 | ||
540 | past_deadline_timers++; | |
541 | past_deadline_deltas += delta; | |
542 | if (delta > past_deadline_longest) { | |
543 | past_deadline_longest = delta; | |
544 | } | |
545 | if (delta < past_deadline_shortest) { | |
546 | past_deadline_shortest = delta; | |
547 | } | |
548 | ||
549 | return ctime + past_deadline_timer_adjustment; | |
550 | } | |
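/*
 * Illustrative sketch (not part of the original source): a deadline that has
 * already passed is replaced by the current time plus a small fixed
 * adjustment (PAST_DEADLINE_TIMER_ADJUSTMENT_NS, converted to absolute time
 * units in timer_call_init_abstime()).  With hypothetical abstime values,
 * ctime = 1000 and an adjustment of 50:
 *
 *	new_deadline = past_deadline_timer_handle(900, 1000);	// yields 1050
 */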
551 | ||
552 | /* | |
553 | * Timer call entry locking model | |
554 | * ============================== | |
555 | * | |
556 | * Timer call entries are linked on per-cpu timer queues which are protected | |
557 | * by the queue lock and the call entry lock. The locking protocol is: | |
558 | * | |
559 | * 0) The canonical locking order is timer call entry followed by queue. | |
560 | * | |
561 | * 1) With only the entry lock held, entry.queue is valid: | |
562 | * 1a) NULL: the entry is not queued, or | |
563 | * 1b) non-NULL: this queue must be locked before the entry is modified. | |
564 | * After locking the queue, the call.async_dequeue flag must be checked: | |
565 | * 1c) TRUE: the entry was removed from the queue by another thread | |
566 | * and we must NULL the entry.queue and reset this flag, or | |
567 | * 1d) FALSE: (ie. queued), the entry can be manipulated. | |
568 | * | |
569 | * 2) If a queue lock is obtained first, the queue is stable: | |
570 | * 2a) If a try-lock of a queued entry succeeds, the call can be operated on | |
571 | * and dequeued. | |
572 | * 2b) If a try-lock fails, it indicates that another thread is attempting | |
573 | * to change the entry and move it to a different position in this queue | |
574 | * or to a different queue. The entry can be dequeued but it should not be | |
575 | * operated upon since it is being changed. Furthermore, we don't null | |
576 | * the entry.queue pointer (protected by the entry lock we don't own). | |
577 | * Instead, we set the async_dequeue flag -- see (1c). | |
578 | * 2c) Same as 2b but occurring when a longterm timer is matured. | |
579 | * 3) A callout's parameters (deadline, flags, parameters, soft deadline &c.) | |
580 | * should be manipulated with the appropriate timer queue lock held, | |
581 | * to prevent queue traversals from observing inconsistent | |
582 | * updates to an in-flight callout. | |
583 | */ | |
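/*
 * Illustrative sketch (not part of the original source) of cases (2a)/(2b)
 * above, modelled on timer_queue_shutdown() and timer_queue_expire() below:
 * with the queue locked, a failed try-lock of an entry means another thread
 * holds the entry lock, so the entry is unlinked asynchronously rather than
 * otherwise manipulated.
 *
 *	timer_queue_lock_spin(queue);
 *	call = TIMER_CALL(queue_first(&queue->head));
 *	if (!simple_lock_try(&call->lock, LCK_GRP_NULL)) {
 *		timer_call_entry_dequeue_async(call);	// case (2b)
 *	} else {
 *		timer_call_entry_dequeue(call);		// case (2a)
 *		simple_unlock(&call->lock);
 *	}
 *	timer_queue_unlock(queue);
 */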
584 | ||
585 | /* | |
586 | * Inlines timer_call_entry_dequeue() and timer_call_entry_enqueue_deadline() | |
587 | * cast between pointer types (mpqueue_head_t *) and (queue_t) so that | |
588 | * we can use the call_entry_dequeue() and call_entry_enqueue_deadline() | |
589 | * methods to operate on timer_call structs as if they are call_entry structs. | |
590 | * These structures are identical except for their queue head pointer fields. | |
591 | * | |
592 | * In the debug case, we assert that the timer call locking protocol | |
593 | * is being obeyed. | |
594 | */ | |
595 | ||
596 | static boolean_t | |
597 | timer_call_enter_internal( | |
598 | timer_call_t call, | |
599 | timer_call_param_t param1, | |
600 | uint64_t deadline, | |
601 | uint64_t leeway, | |
602 | uint32_t flags, | |
603 | boolean_t ratelimited) | |
604 | { | |
605 | mpqueue_head_t *queue = NULL; | |
606 | mpqueue_head_t *old_queue; | |
607 | spl_t s; | |
608 | uint64_t slop; | |
609 | uint32_t urgency; | |
610 | uint64_t sdeadline, ttd; | |
611 | ||
612 | assert(call->call_entry.func != NULL); | |
613 | s = splclock(); | |
614 | ||
615 | sdeadline = deadline; | |
616 | uint64_t ctime = mach_absolute_time(); | |
617 | ||
618 | TIMER_KDEBUG_TRACE(KDEBUG_TRACE, | |
619 | DECR_TIMER_ENTER | DBG_FUNC_START, | |
620 | VM_KERNEL_UNSLIDE_OR_PERM(call), | |
621 | VM_KERNEL_ADDRHIDE(param1), deadline, flags, 0); | |
622 | ||
623 | urgency = (flags & TIMER_CALL_URGENCY_MASK); | |
624 | ||
625 | boolean_t slop_ratelimited = FALSE; | |
626 | slop = timer_call_slop(deadline, ctime, urgency, current_thread(), &slop_ratelimited); | |
627 | ||
628 | if ((flags & TIMER_CALL_LEEWAY) != 0 && leeway > slop) { | |
629 | slop = leeway; | |
630 | } | |
631 | ||
632 | if (UINT64_MAX - deadline <= slop) { | |
633 | deadline = UINT64_MAX; | |
634 | } else { | |
635 | deadline += slop; | |
636 | } | |
637 | ||
638 | if (__improbable(deadline < ctime)) { | |
639 | deadline = past_deadline_timer_handle(deadline, ctime); | |
640 | sdeadline = deadline; | |
641 | } | |
642 | ||
643 | if (ratelimited || slop_ratelimited) { | |
644 | flags |= TIMER_CALL_RATELIMITED; | |
645 | } else { | |
646 | flags &= ~TIMER_CALL_RATELIMITED; | |
647 | } | |
648 | ||
649 | ttd = sdeadline - ctime; | |
650 | #if CONFIG_DTRACE | |
651 | DTRACE_TMR7(callout__create, timer_call_func_t, TCE(call)->func, | |
652 | timer_call_param_t, TCE(call)->param0, uint32_t, flags, | |
653 | (deadline - sdeadline), | |
654 | (ttd >> 32), (unsigned) (ttd & 0xFFFFFFFF), call); | |
655 | #endif | |
656 | ||
657 | /* Program timer callout parameters under the appropriate per-CPU or | |
658 | * longterm queue lock. The callout may have been previously enqueued | |
659 | * and in-flight on this or another timer queue. | |
660 | */ | |
661 | if (!ratelimited && !slop_ratelimited) { | |
662 | queue = timer_longterm_enqueue_unlocked(call, ctime, deadline, &old_queue, sdeadline, ttd, param1, flags); | |
663 | } | |
664 | ||
665 | if (queue == NULL) { | |
666 | queue = timer_queue_assign(deadline); | |
667 | old_queue = timer_call_enqueue_deadline_unlocked(call, queue, deadline, sdeadline, ttd, param1, flags); | |
668 | } | |
669 | ||
670 | #if TIMER_TRACE | |
671 | TCE(call)->entry_time = ctime; | |
672 | #endif | |
673 | ||
674 | TIMER_KDEBUG_TRACE(KDEBUG_TRACE, | |
675 | DECR_TIMER_ENTER | DBG_FUNC_END, | |
676 | VM_KERNEL_UNSLIDE_OR_PERM(call), | |
677 | (old_queue != NULL), deadline, queue->count, 0); | |
678 | ||
679 | splx(s); | |
680 | ||
681 | return old_queue != NULL; | |
682 | } | |
683 | ||
684 | /* | |
685 | * timer_call_*() | |
686 | * return boolean indicating whether the call was previously queued. | |
687 | */ | |
688 | boolean_t | |
689 | timer_call_enter( | |
690 | timer_call_t call, | |
691 | uint64_t deadline, | |
692 | uint32_t flags) | |
693 | { | |
694 | return timer_call_enter_internal(call, NULL, deadline, 0, flags, FALSE); | |
695 | } | |
696 | ||
697 | boolean_t | |
698 | timer_call_enter1( | |
699 | timer_call_t call, | |
700 | timer_call_param_t param1, | |
701 | uint64_t deadline, | |
702 | uint32_t flags) | |
703 | { | |
704 | return timer_call_enter_internal(call, param1, deadline, 0, flags, FALSE); | |
705 | } | |
706 | ||
707 | boolean_t | |
708 | timer_call_enter_with_leeway( | |
709 | timer_call_t call, | |
710 | timer_call_param_t param1, | |
711 | uint64_t deadline, | |
712 | uint64_t leeway, | |
713 | uint32_t flags, | |
714 | boolean_t ratelimited) | |
715 | { | |
716 | return timer_call_enter_internal(call, param1, deadline, leeway, flags, ratelimited); | |
717 | } | |
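/*
 * Illustrative sketch (not part of the original source): an explicit leeway
 * is honored only when TIMER_CALL_LEEWAY is set and the value exceeds the
 * slop computed by timer_call_slop() (see timer_call_enter_internal()
 * above).  The my_timer callout and the 100 ms / 5 ms values are
 * hypothetical.
 *
 *	uint64_t deadline, leeway;
 *	clock_interval_to_deadline(100, NSEC_PER_MSEC, &deadline);
 *	nanoseconds_to_absolutetime(5 * NSEC_PER_MSEC, &leeway);
 *	timer_call_enter_with_leeway(&my_timer, NULL, deadline, leeway,
 *	    TIMER_CALL_SYS_NORMAL | TIMER_CALL_LEEWAY, FALSE);
 */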
718 | ||
719 | boolean_t | |
720 | timer_call_quantum_timer_enter( | |
721 | timer_call_t call, | |
722 | timer_call_param_t param1, | |
723 | uint64_t deadline, | |
724 | uint64_t ctime) | |
725 | { | |
726 | assert(call->call_entry.func != NULL); | |
727 | assert(ml_get_interrupts_enabled() == FALSE); | |
728 | ||
729 | uint32_t flags = TIMER_CALL_SYS_CRITICAL | TIMER_CALL_LOCAL; | |
730 | ||
731 | TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_ENTER | DBG_FUNC_START, | |
732 | VM_KERNEL_UNSLIDE_OR_PERM(call), | |
733 | VM_KERNEL_ADDRHIDE(param1), deadline, | |
734 | flags, 0); | |
735 | ||
736 | if (__improbable(deadline < ctime)) { | |
737 | deadline = past_deadline_timer_handle(deadline, ctime); | |
738 | } | |
739 | ||
740 | uint64_t ttd = deadline - ctime; | |
741 | #if CONFIG_DTRACE | |
742 | DTRACE_TMR7(callout__create, timer_call_func_t, TCE(call)->func, | |
743 | timer_call_param_t, TCE(call)->param0, uint32_t, flags, 0, | |
744 | (ttd >> 32), (unsigned) (ttd & 0xFFFFFFFF), call); | |
745 | #endif | |
746 | ||
747 | quantum_timer_set_deadline(deadline); | |
748 | TCE(call)->deadline = deadline; | |
749 | TCE(call)->param1 = param1; | |
750 | call->ttd = ttd; | |
751 | call->flags = flags; | |
752 | ||
753 | #if TIMER_TRACE | |
754 | TCE(call)->entry_time = ctime; | |
755 | #endif | |
756 | ||
757 | TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_ENTER | DBG_FUNC_END, | |
758 | VM_KERNEL_UNSLIDE_OR_PERM(call), | |
759 | 1, deadline, 0, 0); | |
760 | ||
761 | return true; | |
762 | } | |
763 | ||
764 | ||
765 | boolean_t | |
766 | timer_call_quantum_timer_cancel( | |
767 | timer_call_t call) | |
768 | { | |
769 | assert(ml_get_interrupts_enabled() == FALSE); | |
770 | ||
771 | TIMER_KDEBUG_TRACE(KDEBUG_TRACE, | |
772 | DECR_TIMER_CANCEL | DBG_FUNC_START, | |
773 | VM_KERNEL_UNSLIDE_OR_PERM(call), TCE(call)->deadline, | |
774 | 0, call->flags, 0); | |
775 | ||
776 | TCE(call)->deadline = 0; | |
777 | quantum_timer_set_deadline(0); | |
778 | ||
779 | TIMER_KDEBUG_TRACE(KDEBUG_TRACE, | |
780 | DECR_TIMER_CANCEL | DBG_FUNC_END, | |
781 | VM_KERNEL_UNSLIDE_OR_PERM(call), 0, | |
782 | TCE(call)->deadline - mach_absolute_time(), | |
783 | TCE(call)->deadline - TCE(call)->entry_time, 0); | |
784 | ||
785 | #if CONFIG_DTRACE | |
786 | DTRACE_TMR6(callout__cancel, timer_call_func_t, TCE(call)->func, | |
787 | timer_call_param_t, TCE(call)->param0, uint32_t, call->flags, 0, | |
788 | (call->ttd >> 32), (unsigned) (call->ttd & 0xFFFFFFFF)); | |
789 | #endif | |
790 | ||
791 | return true; | |
792 | } | |
793 | ||
794 | boolean_t | |
795 | timer_call_cancel( | |
796 | timer_call_t call) | |
797 | { | |
798 | mpqueue_head_t *old_queue; | |
799 | spl_t s; | |
800 | ||
801 | s = splclock(); | |
802 | ||
803 | TIMER_KDEBUG_TRACE(KDEBUG_TRACE, | |
804 | DECR_TIMER_CANCEL | DBG_FUNC_START, | |
805 | VM_KERNEL_UNSLIDE_OR_PERM(call), | |
806 | TCE(call)->deadline, call->soft_deadline, call->flags, 0); | |
807 | ||
808 | old_queue = timer_call_dequeue_unlocked(call); | |
809 | ||
810 | if (old_queue != NULL) { | |
811 | timer_queue_lock_spin(old_queue); | |
812 | if (!queue_empty(&old_queue->head)) { | |
813 | timer_queue_cancel(old_queue, TCE(call)->deadline, CE(queue_first(&old_queue->head))->deadline); | |
814 | timer_call_t thead = (timer_call_t)queue_first(&old_queue->head); | |
815 | old_queue->earliest_soft_deadline = thead->flags & TIMER_CALL_RATELIMITED ? TCE(thead)->deadline : thead->soft_deadline; | |
816 | } else { | |
817 | timer_queue_cancel(old_queue, TCE(call)->deadline, UINT64_MAX); | |
818 | old_queue->earliest_soft_deadline = UINT64_MAX; | |
819 | } | |
820 | timer_queue_unlock(old_queue); | |
821 | } | |
822 | TIMER_KDEBUG_TRACE(KDEBUG_TRACE, | |
823 | DECR_TIMER_CANCEL | DBG_FUNC_END, | |
824 | VM_KERNEL_UNSLIDE_OR_PERM(call), | |
825 | VM_KERNEL_UNSLIDE_OR_PERM(old_queue), | |
826 | TCE(call)->deadline - mach_absolute_time(), | |
827 | TCE(call)->deadline - TCE(call)->entry_time, 0); | |
828 | splx(s); | |
829 | ||
830 | #if CONFIG_DTRACE | |
831 | DTRACE_TMR6(callout__cancel, timer_call_func_t, TCE(call)->func, | |
832 | timer_call_param_t, TCE(call)->param0, uint32_t, call->flags, 0, | |
833 | (call->ttd >> 32), (unsigned) (call->ttd & 0xFFFFFFFF)); | |
834 | #endif | |
835 | ||
836 | return old_queue != NULL; | |
837 | } | |
838 | ||
839 | static uint32_t timer_queue_shutdown_lock_skips; | |
840 | static uint32_t timer_queue_shutdown_discarded; | |
841 | ||
842 | void | |
843 | timer_queue_shutdown( | |
844 | mpqueue_head_t *queue) | |
845 | { | |
846 | timer_call_t call; | |
847 | mpqueue_head_t *new_queue; | |
848 | spl_t s; | |
849 | ||
850 | ||
851 | DBG("timer_queue_shutdown(%p)\n", queue); | |
852 | ||
853 | s = splclock(); | |
854 | ||
855 | /* Note comma operator in while expression re-locking each iteration */ | |
856 | while ((void)timer_queue_lock_spin(queue), !queue_empty(&queue->head)) { | |
857 | call = TIMER_CALL(queue_first(&queue->head)); | |
858 | ||
859 | if (!simple_lock_try(&call->lock, LCK_GRP_NULL)) { | |
860 | /* | |
861 | * case (2b) lock order inversion, dequeue and skip | |
862 | * Don't change the call_entry queue back-pointer | |
863 | * but set the async_dequeue field. | |
864 | */ | |
865 | timer_queue_shutdown_lock_skips++; | |
866 | timer_call_entry_dequeue_async(call); | |
867 | #if TIMER_ASSERT | |
868 | TIMER_KDEBUG_TRACE(KDEBUG_TRACE, | |
869 | DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE, | |
870 | VM_KERNEL_UNSLIDE_OR_PERM(call), | |
871 | call->async_dequeue, | |
872 | VM_KERNEL_UNSLIDE_OR_PERM(TCE(call)->queue), | |
873 | 0x2b, 0); | |
874 | #endif | |
875 | timer_queue_unlock(queue); | |
876 | continue; | |
877 | } | |
878 | ||
879 | boolean_t call_local = ((call->flags & TIMER_CALL_LOCAL) != 0); | |
880 | ||
881 | /* remove entry from old queue */ | |
882 | timer_call_entry_dequeue(call); | |
883 | timer_queue_unlock(queue); | |
884 | ||
885 | if (call_local == FALSE) { | |
886 | /* and queue it on new, discarding LOCAL timers */ | |
887 | new_queue = timer_queue_assign(TCE(call)->deadline); | |
888 | timer_queue_lock_spin(new_queue); | |
889 | timer_call_entry_enqueue_deadline( | |
890 | call, new_queue, TCE(call)->deadline); | |
891 | timer_queue_unlock(new_queue); | |
892 | } else { | |
893 | timer_queue_shutdown_discarded++; | |
894 | } | |
895 | ||
896 | assert(call_local == FALSE); | |
897 | simple_unlock(&call->lock); | |
898 | } | |
899 | ||
900 | timer_queue_unlock(queue); | |
901 | splx(s); | |
902 | } | |
903 | ||
904 | ||
905 | void | |
906 | quantum_timer_expire( | |
907 | uint64_t deadline) | |
908 | { | |
909 | processor_t processor = current_processor(); | |
910 | timer_call_t call = TIMER_CALL(&(processor->quantum_timer)); | |
911 | ||
912 | if (__improbable(TCE(call)->deadline > deadline)) { | |
913 | panic("CPU quantum timer deadlin out of sync with timer call deadline"); | |
914 | } | |
915 | ||
916 | TIMER_KDEBUG_TRACE(KDEBUG_TRACE, | |
917 | DECR_TIMER_EXPIRE | DBG_FUNC_NONE, | |
918 | VM_KERNEL_UNSLIDE_OR_PERM(call), | |
919 | TCE(call)->deadline, | |
920 | TCE(call)->deadline, | |
921 | TCE(call)->entry_time, 0); | |
922 | ||
923 | timer_call_func_t func = TCE(call)->func; | |
924 | timer_call_param_t param0 = TCE(call)->param0; | |
925 | timer_call_param_t param1 = TCE(call)->param1; | |
926 | ||
927 | TIMER_KDEBUG_TRACE(KDEBUG_TRACE, | |
928 | DECR_TIMER_CALLOUT | DBG_FUNC_START, | |
929 | VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(func), | |
930 | VM_KERNEL_ADDRHIDE(param0), | |
931 | VM_KERNEL_ADDRHIDE(param1), | |
932 | 0); | |
933 | ||
934 | #if CONFIG_DTRACE | |
935 | DTRACE_TMR7(callout__start, timer_call_func_t, func, | |
936 | timer_call_param_t, param0, unsigned, call->flags, | |
937 | 0, (call->ttd >> 32), | |
938 | (unsigned) (call->ttd & 0xFFFFFFFF), call); | |
939 | #endif | |
940 | (*func)(param0, param1); | |
941 | ||
942 | TIMER_KDEBUG_TRACE(KDEBUG_TRACE, | |
943 | DECR_TIMER_CALLOUT | DBG_FUNC_END, | |
944 | VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(func), | |
945 | VM_KERNEL_ADDRHIDE(param0), | |
946 | VM_KERNEL_ADDRHIDE(param1), | |
947 | 0); | |
948 | } | |
949 | ||
950 | static uint32_t timer_queue_expire_lock_skips; | |
951 | uint64_t | |
952 | timer_queue_expire_with_options( | |
953 | mpqueue_head_t *queue, | |
954 | uint64_t deadline, | |
955 | boolean_t rescan) | |
956 | { | |
957 | timer_call_t call = NULL; | |
958 | uint32_t tc_iterations = 0; | |
959 | DBG("timer_queue_expire(%p,)\n", queue); | |
960 | ||
961 | uint64_t cur_deadline = deadline; | |
962 | timer_queue_lock_spin(queue); | |
963 | ||
964 | while (!queue_empty(&queue->head)) { | |
965 | /* Upon processing one or more timer calls, refresh the | |
966 | * deadline to account for time elapsed in the callout | |
967 | */ | |
968 | if (++tc_iterations > 1) { | |
969 | cur_deadline = mach_absolute_time(); | |
970 | } | |
971 | ||
972 | if (call == NULL) { | |
973 | call = TIMER_CALL(queue_first(&queue->head)); | |
974 | } | |
975 | ||
976 | if (call->soft_deadline <= cur_deadline) { | |
977 | timer_call_func_t func; | |
978 | timer_call_param_t param0, param1; | |
979 | ||
980 | TCOAL_DEBUG(0xDDDD0000, queue->earliest_soft_deadline, call->soft_deadline, 0, 0, 0); | |
981 | TIMER_KDEBUG_TRACE(KDEBUG_TRACE, | |
982 | DECR_TIMER_EXPIRE | DBG_FUNC_NONE, | |
983 | VM_KERNEL_UNSLIDE_OR_PERM(call), | |
984 | call->soft_deadline, | |
985 | TCE(call)->deadline, | |
986 | TCE(call)->entry_time, 0); | |
987 | ||
988 | if ((call->flags & TIMER_CALL_RATELIMITED) && | |
989 | (TCE(call)->deadline > cur_deadline)) { | |
990 | if (rescan == FALSE) { | |
991 | break; | |
992 | } | |
993 | } | |
994 | ||
995 | if (!simple_lock_try(&call->lock, LCK_GRP_NULL)) { | |
996 | /* case (2b) lock inversion, dequeue and skip */ | |
997 | timer_queue_expire_lock_skips++; | |
998 | timer_call_entry_dequeue_async(call); | |
999 | call = NULL; | |
1000 | continue; | |
1001 | } | |
1002 | ||
1003 | timer_call_entry_dequeue(call); | |
1004 | ||
1005 | func = TCE(call)->func; | |
1006 | param0 = TCE(call)->param0; | |
1007 | param1 = TCE(call)->param1; | |
1008 | ||
1009 | simple_unlock(&call->lock); | |
1010 | timer_queue_unlock(queue); | |
1011 | ||
1012 | TIMER_KDEBUG_TRACE(KDEBUG_TRACE, | |
1013 | DECR_TIMER_CALLOUT | DBG_FUNC_START, | |
1014 | VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(func), | |
1015 | VM_KERNEL_ADDRHIDE(param0), | |
1016 | VM_KERNEL_ADDRHIDE(param1), | |
1017 | 0); | |
1018 | ||
1019 | #if CONFIG_DTRACE | |
1020 | DTRACE_TMR7(callout__start, timer_call_func_t, func, | |
1021 | timer_call_param_t, param0, unsigned, call->flags, | |
1022 | 0, (call->ttd >> 32), | |
1023 | (unsigned) (call->ttd & 0xFFFFFFFF), call); | |
1024 | #endif | |
1025 | /* Maintain time-to-deadline in per-processor data | |
1026 | * structure for thread wakeup deadline statistics. | |
1027 | */ | |
1028 | uint64_t *ttdp = &(PROCESSOR_DATA(current_processor(), timer_call_ttd)); | |
1029 | *ttdp = call->ttd; | |
1030 | (*func)(param0, param1); | |
1031 | *ttdp = 0; | |
1032 | #if CONFIG_DTRACE | |
1033 | DTRACE_TMR4(callout__end, timer_call_func_t, func, | |
1034 | param0, param1, call); | |
1035 | #endif | |
1036 | ||
1037 | TIMER_KDEBUG_TRACE(KDEBUG_TRACE, | |
1038 | DECR_TIMER_CALLOUT | DBG_FUNC_END, | |
1039 | VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(func), | |
1040 | VM_KERNEL_ADDRHIDE(param0), | |
1041 | VM_KERNEL_ADDRHIDE(param1), | |
1042 | 0); | |
1043 | call = NULL; | |
1044 | timer_queue_lock_spin(queue); | |
1045 | } else { | |
1046 | if (__probable(rescan == FALSE)) { | |
1047 | break; | |
1048 | } else { | |
1049 | int64_t skew = TCE(call)->deadline - call->soft_deadline; | |
1050 | assert(TCE(call)->deadline >= call->soft_deadline); | |
1051 | ||
1052 | /* DRK: On a latency quality-of-service level change, | |
1053 | * re-sort potentially rate-limited timers. The platform | |
1054 | * layer determines which timers require | |
1055 | * this. In the absence of the per-callout | |
1056 | * synchronization requirement, a global resort could | |
1057 | * be more efficient. The re-sort effectively | |
1058 | * annuls all timer adjustments, i.e. the "soft | |
1059 | * deadline" is the sort key. | |
1060 | */ | |
1061 | ||
1062 | if (timer_resort_threshold(skew)) { | |
1063 | if (__probable(simple_lock_try(&call->lock, LCK_GRP_NULL))) { | |
1064 | timer_call_entry_dequeue(call); | |
1065 | timer_call_entry_enqueue_deadline(call, queue, call->soft_deadline); | |
1066 | simple_unlock(&call->lock); | |
1067 | call = NULL; | |
1068 | } | |
1069 | } | |
1070 | if (call) { | |
1071 | call = TIMER_CALL(queue_next(qe(call))); | |
1072 | if (queue_end(&queue->head, qe(call))) { | |
1073 | break; | |
1074 | } | |
1075 | } | |
1076 | } | |
1077 | } | |
1078 | } | |
1079 | ||
1080 | if (!queue_empty(&queue->head)) { | |
1081 | call = TIMER_CALL(queue_first(&queue->head)); | |
1082 | cur_deadline = TCE(call)->deadline; | |
1083 | queue->earliest_soft_deadline = (call->flags & TIMER_CALL_RATELIMITED) ? TCE(call)->deadline: call->soft_deadline; | |
1084 | } else { | |
1085 | queue->earliest_soft_deadline = cur_deadline = UINT64_MAX; | |
1086 | } | |
1087 | ||
1088 | timer_queue_unlock(queue); | |
1089 | ||
1090 | return cur_deadline; | |
1091 | } | |
1092 | ||
1093 | uint64_t | |
1094 | timer_queue_expire( | |
1095 | mpqueue_head_t *queue, | |
1096 | uint64_t deadline) | |
1097 | { | |
1098 | return timer_queue_expire_with_options(queue, deadline, FALSE); | |
1099 | } | |
1100 | ||
1101 | extern int serverperfmode; | |
1102 | static uint32_t timer_queue_migrate_lock_skips; | |
1103 | /* | |
1104 | * timer_queue_migrate() is called by timer_queue_migrate_cpu() | |
1105 | * to move timer requests from the local processor (queue_from) | |
1106 | * to a target processor's (queue_to). | |
1107 | */ | |
1108 | int | |
1109 | timer_queue_migrate(mpqueue_head_t *queue_from, mpqueue_head_t *queue_to) | |
1110 | { | |
1111 | timer_call_t call; | |
1112 | timer_call_t head_to; | |
1113 | int timers_migrated = 0; | |
1114 | ||
1115 | DBG("timer_queue_migrate(%p,%p)\n", queue_from, queue_to); | |
1116 | ||
1117 | assert(!ml_get_interrupts_enabled()); | |
1118 | assert(queue_from != queue_to); | |
1119 | ||
1120 | if (serverperfmode) { | |
1121 | /* | |
1122 | * if we're running a high end server | |
1123 | * avoid migrations... they add latency | |
1124 | * and don't save us power under typical | |
1125 | * server workloads | |
1126 | */ | |
1127 | return -4; | |
1128 | } | |
1129 | ||
1130 | /* | |
1131 | * Take both local (from) and target (to) timer queue locks while | |
1132 | * moving the timers from the local queue to the target processor. | |
1133 | * We assume that the target is always the boot processor. | |
1134 | * But only move if all of the following is true: | |
1135 | * - the target queue is non-empty | |
1136 | * - the local queue is non-empty | |
1137 | * - the local queue's first deadline is later than the target's | |
1138 | * - the local queue contains no non-migratable "local" call | |
1139 | * so that we need not have the target resync. | |
1140 | */ | |
1141 | ||
1142 | timer_queue_lock_spin(queue_to); | |
1143 | ||
1144 | head_to = TIMER_CALL(queue_first(&queue_to->head)); | |
1145 | if (queue_empty(&queue_to->head)) { | |
1146 | timers_migrated = -1; | |
1147 | goto abort1; | |
1148 | } | |
1149 | ||
1150 | timer_queue_lock_spin(queue_from); | |
1151 | ||
1152 | if (queue_empty(&queue_from->head)) { | |
1153 | timers_migrated = -2; | |
1154 | goto abort2; | |
1155 | } | |
1156 | ||
1157 | call = TIMER_CALL(queue_first(&queue_from->head)); | |
1158 | if (TCE(call)->deadline < TCE(head_to)->deadline) { | |
1159 | timers_migrated = 0; | |
1160 | goto abort2; | |
1161 | } | |
1162 | ||
1163 | /* perform scan for non-migratable timers */ | |
1164 | do { | |
1165 | if (call->flags & TIMER_CALL_LOCAL) { | |
1166 | timers_migrated = -3; | |
1167 | goto abort2; | |
1168 | } | |
1169 | call = TIMER_CALL(queue_next(qe(call))); | |
1170 | } while (!queue_end(&queue_from->head, qe(call))); | |
1171 | ||
1172 | /* migration loop itself -- both queues are locked */ | |
1173 | while (!queue_empty(&queue_from->head)) { | |
1174 | call = TIMER_CALL(queue_first(&queue_from->head)); | |
1175 | if (!simple_lock_try(&call->lock, LCK_GRP_NULL)) { | |
1176 | /* case (2b) lock order inversion, dequeue only */ | |
1177 | #ifdef TIMER_ASSERT | |
1178 | TIMER_KDEBUG_TRACE(KDEBUG_TRACE, | |
1179 | DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE, | |
1180 | VM_KERNEL_UNSLIDE_OR_PERM(call), | |
1181 | VM_KERNEL_UNSLIDE_OR_PERM(TCE(call)->queue), | |
1182 | VM_KERNEL_UNSLIDE_OR_PERM(call->lock.interlock.lock_data), | |
1183 | 0x2b, 0); | |
1184 | #endif | |
1185 | timer_queue_migrate_lock_skips++; | |
1186 | timer_call_entry_dequeue_async(call); | |
1187 | continue; | |
1188 | } | |
1189 | timer_call_entry_dequeue(call); | |
1190 | timer_call_entry_enqueue_deadline( | |
1191 | call, queue_to, TCE(call)->deadline); | |
1192 | timers_migrated++; | |
1193 | simple_unlock(&call->lock); | |
1194 | } | |
1195 | queue_from->earliest_soft_deadline = UINT64_MAX; | |
1196 | abort2: | |
1197 | timer_queue_unlock(queue_from); | |
1198 | abort1: | |
1199 | timer_queue_unlock(queue_to); | |
1200 | ||
1201 | return timers_migrated; | |
1202 | } | |
1203 | ||
1204 | void | |
1205 | timer_queue_trace_cpu(int ncpu) | |
1206 | { | |
1207 | timer_call_nosync_cpu( | |
1208 | ncpu, | |
1209 | (void (*)(void *))timer_queue_trace, | |
1210 | (void*) timer_queue_cpu(ncpu)); | |
1211 | } | |
1212 | ||
1213 | void | |
1214 | timer_queue_trace( | |
1215 | mpqueue_head_t *queue) | |
1216 | { | |
1217 | timer_call_t call; | |
1218 | spl_t s; | |
1219 | ||
1220 | if (!kdebug_enable) { | |
1221 | return; | |
1222 | } | |
1223 | ||
1224 | s = splclock(); | |
1225 | timer_queue_lock_spin(queue); | |
1226 | ||
1227 | TIMER_KDEBUG_TRACE(KDEBUG_TRACE, | |
1228 | DECR_TIMER_QUEUE | DBG_FUNC_START, | |
1229 | queue->count, mach_absolute_time(), 0, 0, 0); | |
1230 | ||
1231 | if (!queue_empty(&queue->head)) { | |
1232 | call = TIMER_CALL(queue_first(&queue->head)); | |
1233 | do { | |
1234 | TIMER_KDEBUG_TRACE(KDEBUG_TRACE, | |
1235 | DECR_TIMER_QUEUE | DBG_FUNC_NONE, | |
1236 | call->soft_deadline, | |
1237 | TCE(call)->deadline, | |
1238 | TCE(call)->entry_time, | |
1239 | VM_KERNEL_UNSLIDE(TCE(call)->func), | |
1240 | 0); | |
1241 | call = TIMER_CALL(queue_next(qe(call))); | |
1242 | } while (!queue_end(&queue->head, qe(call))); | |
1243 | } | |
1244 | ||
1245 | TIMER_KDEBUG_TRACE(KDEBUG_TRACE, | |
1246 | DECR_TIMER_QUEUE | DBG_FUNC_END, | |
1247 | queue->count, mach_absolute_time(), 0, 0, 0); | |
1248 | ||
1249 | timer_queue_unlock(queue); | |
1250 | splx(s); | |
1251 | } | |
1252 | ||
1253 | void | |
1254 | timer_longterm_dequeued_locked(timer_call_t call) | |
1255 | { | |
1256 | timer_longterm_t *tlp = &timer_longterm; | |
1257 | ||
1258 | tlp->dequeues++; | |
1259 | if (call == tlp->threshold.call) { | |
1260 | tlp->threshold.call = NULL; | |
1261 | } | |
1262 | } | |
1263 | ||
1264 | /* | |
1265 | * Place a timer call in the longterm list | |
1266 | * and adjust the next timer callout deadline if the new timer is first. | |
1267 | */ | |
1268 | mpqueue_head_t * | |
1269 | timer_longterm_enqueue_unlocked(timer_call_t call, | |
1270 | uint64_t now, | |
1271 | uint64_t deadline, | |
1272 | mpqueue_head_t **old_queue, | |
1273 | uint64_t soft_deadline, | |
1274 | uint64_t ttd, | |
1275 | timer_call_param_t param1, | |
1276 | uint32_t callout_flags) | |
1277 | { | |
1278 | timer_longterm_t *tlp = &timer_longterm; | |
1279 | boolean_t update_required = FALSE; | |
1280 | uint64_t longterm_threshold; | |
1281 | ||
1282 | longterm_threshold = now + tlp->threshold.interval; | |
1283 | ||
1284 | /* | |
1285 | * Return NULL without doing anything if: | |
1286 | * - this timer is local, or | |
1287 | * - the longterm mechanism is disabled, or | |
1288 | * - this deadline is too short. | |
1289 | */ | |
1290 | if ((callout_flags & TIMER_CALL_LOCAL) != 0 || | |
1291 | (tlp->threshold.interval == TIMER_LONGTERM_NONE) || | |
1292 | (deadline <= longterm_threshold)) { | |
1293 | return NULL; | |
1294 | } | |
1295 | ||
1296 | /* | |
1297 | * Remove timer from its current queue, if any. | |
1298 | */ | |
1299 | *old_queue = timer_call_dequeue_unlocked(call); | |
1300 | ||
1301 | /* | |
1302 | * Lock the longterm queue, queue timer and determine | |
1303 | * whether an update is necessary. | |
1304 | */ | |
1305 | assert(!ml_get_interrupts_enabled()); | |
1306 | simple_lock(&call->lock, LCK_GRP_NULL); | |
1307 | timer_queue_lock_spin(timer_longterm_queue); | |
1308 | TCE(call)->deadline = deadline; | |
1309 | TCE(call)->param1 = param1; | |
1310 | call->ttd = ttd; | |
1311 | call->soft_deadline = soft_deadline; | |
1312 | call->flags = callout_flags; | |
1313 | timer_call_entry_enqueue_tail(call, timer_longterm_queue); | |
1314 | ||
1315 | tlp->enqueues++; | |
1316 | ||
1317 | /* | |
1318 | * We'll need to update the currently set threshold timer | |
1319 | * if the new deadline is sooner and no sooner update is in flight. | |
1320 | */ | |
1321 | if (deadline < tlp->threshold.deadline && | |
1322 | deadline < tlp->threshold.preempted) { | |
1323 | tlp->threshold.preempted = deadline; | |
1324 | tlp->threshold.call = call; | |
1325 | update_required = TRUE; | |
1326 | } | |
1327 | timer_queue_unlock(timer_longterm_queue); | |
1328 | simple_unlock(&call->lock); | |
1329 | ||
1330 | if (update_required) { | |
1331 | /* | |
1332 | * Note: this call expects that calling the master cpu | |
1333 | * alone does not involve locking the topo lock. | |
1334 | */ | |
1335 | timer_call_nosync_cpu( | |
1336 | master_cpu, | |
1337 | (void (*)(void *))timer_longterm_update, | |
1338 | (void *)tlp); | |
1339 | } | |
1340 | ||
1341 | return timer_longterm_queue; | |
1342 | } | |
1343 | ||
1344 | /* | |
1345 | * Scan for timers below the longterm threshold. | |
1346 | * Move these to the local timer queue (of the boot processor on which the | |
1347 | * calling thread is running). | |
1348 | * Both the local (boot) queue and the longterm queue are locked. | |
1349 | * The scan is similar to the timer migrate sequence but is performed by | |
1350 | * successively examining each timer on the longterm queue: | |
1351 | * - if within the short-term threshold | |
1352 | * - enter on the local queue (unless being deleted), | |
1353 | * - otherwise: | |
1354 | * - if sooner, deadline becomes the next threshold deadline. | |
1355 | * The total scan time is limited to TIMER_LONGTERM_SCAN_LIMIT. Should this be | |
1356 | * exceeded, we abort and reschedule the scan so that we don't lock others out | |
1357 | * of the timer queues for too long. Longterm timers firing late is not critical. | |
1358 | */ | |
1359 | void | |
1360 | timer_longterm_scan(timer_longterm_t *tlp, | |
1361 | uint64_t time_start) | |
1362 | { | |
1363 | queue_entry_t qe; | |
1364 | timer_call_t call; | |
1365 | uint64_t threshold; | |
1366 | uint64_t deadline; | |
1367 | uint64_t time_limit = time_start + tlp->scan_limit; | |
1368 | mpqueue_head_t *timer_master_queue; | |
1369 | ||
1370 | assert(!ml_get_interrupts_enabled()); | |
1371 | assert(cpu_number() == master_cpu); | |
1372 | ||
1373 | if (tlp->threshold.interval != TIMER_LONGTERM_NONE) { | |
1374 | threshold = time_start + tlp->threshold.interval; | |
1375 | } | |
1376 | ||
1377 | tlp->threshold.deadline = TIMER_LONGTERM_NONE; | |
1378 | tlp->threshold.call = NULL; | |
1379 | ||
1380 | if (queue_empty(&timer_longterm_queue->head)) { | |
1381 | return; | |
1382 | } | |
1383 | ||
1384 | timer_master_queue = timer_queue_cpu(master_cpu); | |
1385 | timer_queue_lock_spin(timer_master_queue); | |
1386 | ||
1387 | qe = queue_first(&timer_longterm_queue->head); | |
1388 | while (!queue_end(&timer_longterm_queue->head, qe)) { | |
1389 | call = TIMER_CALL(qe); | |
1390 | deadline = call->soft_deadline; | |
1391 | qe = queue_next(qe); | |
1392 | if (!simple_lock_try(&call->lock, LCK_GRP_NULL)) { | |
1393 | /* case (2c) lock order inversion, dequeue only */ | |
1394 | #ifdef TIMER_ASSERT | |
1395 | TIMER_KDEBUG_TRACE(KDEBUG_TRACE, | |
1396 | DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE, | |
1397 | VM_KERNEL_UNSLIDE_OR_PERM(call), | |
1398 | VM_KERNEL_UNSLIDE_OR_PERM(TCE(call)->queue), | |
1399 | VM_KERNEL_UNSLIDE_OR_PERM(call->lock.interlock.lock_data), | |
1400 | 0x2c, 0); | |
1401 | #endif | |
1402 | timer_call_entry_dequeue_async(call); | |
1403 | continue; | |
1404 | } | |
1405 | if (deadline < threshold) { | |
1406 | /* | |
1407 | * This timer needs moving (escalating) | |
1408 | * to the local (boot) processor's queue. | |
1409 | */ | |
1410 | #ifdef TIMER_ASSERT | |
1411 | if (deadline < time_start) { | |
1412 | TIMER_KDEBUG_TRACE(KDEBUG_TRACE, | |
1413 | DECR_TIMER_OVERDUE | DBG_FUNC_NONE, | |
1414 | VM_KERNEL_UNSLIDE_OR_PERM(call), | |
1415 | deadline, | |
1416 | time_start, | |
1417 | threshold, | |
1418 | 0); | |
1419 | } | |
1420 | #endif | |
1421 | TIMER_KDEBUG_TRACE(KDEBUG_TRACE, | |
1422 | DECR_TIMER_ESCALATE | DBG_FUNC_NONE, | |
1423 | VM_KERNEL_UNSLIDE_OR_PERM(call), | |
1424 | TCE(call)->deadline, | |
1425 | TCE(call)->entry_time, | |
1426 | VM_KERNEL_UNSLIDE(TCE(call)->func), | |
1427 | 0); | |
1428 | tlp->escalates++; | |
1429 | timer_call_entry_dequeue(call); | |
1430 | timer_call_entry_enqueue_deadline( | |
1431 | call, timer_master_queue, TCE(call)->deadline); | |
1432 | /* | |
1433 | * A side-effect of the following call is to update | |
1434 | * the actual hardware deadline if required. | |
1435 | */ | |
1436 | (void) timer_queue_assign(deadline); | |
1437 | } else { | |
1438 | if (deadline < tlp->threshold.deadline) { | |
1439 | tlp->threshold.deadline = deadline; | |
1440 | tlp->threshold.call = call; | |
1441 | } | |
1442 | } | |
1443 | simple_unlock(&call->lock); | |
1444 | ||
1445 | /* Abort scan if we're taking too long. */ | |
1446 | if (mach_absolute_time() > time_limit) { | |
1447 | tlp->threshold.deadline = TIMER_LONGTERM_SCAN_AGAIN; | |
1448 | tlp->scan_pauses++; | |
1449 | DBG("timer_longterm_scan() paused %llu, qlen: %llu\n", | |
1450 | time_limit, tlp->queue.count); | |
1451 | break; | |
1452 | } | |
1453 | } | |
1454 | ||
1455 | timer_queue_unlock(timer_master_queue); | |
1456 | } | |
1457 | ||
1458 | void | |
1459 | timer_longterm_callout(timer_call_param_t p0, __unused timer_call_param_t p1) | |
1460 | { | |
1461 | timer_longterm_t *tlp = (timer_longterm_t *) p0; | |
1462 | ||
1463 | timer_longterm_update(tlp); | |
1464 | } | |
1465 | ||
1466 | void | |
1467 | timer_longterm_update_locked(timer_longterm_t *tlp) | |
1468 | { | |
1469 | uint64_t latency; | |
1470 | ||
1471 | TIMER_KDEBUG_TRACE(KDEBUG_TRACE, | |
1472 | DECR_TIMER_UPDATE | DBG_FUNC_START, | |
1473 | VM_KERNEL_UNSLIDE_OR_PERM(&tlp->queue), | |
1474 | tlp->threshold.deadline, | |
1475 | tlp->threshold.preempted, | |
1476 | tlp->queue.count, 0); | |
1477 | ||
1478 | tlp->scan_time = mach_absolute_time(); | |
1479 | if (tlp->threshold.preempted != TIMER_LONGTERM_NONE) { | |
1480 | tlp->threshold.preempts++; | |
1481 | tlp->threshold.deadline = tlp->threshold.preempted; | |
1482 | tlp->threshold.preempted = TIMER_LONGTERM_NONE; | |
1483 | /* | |
1484 | * Note: in the unlikely event that a pre-empted timer has | |
1485 | * itself been cancelled, we'll simply re-scan later at the | |
1486 | * time of the preempted/cancelled timer. | |
1487 | */ | |
1488 | } else { | |
1489 | tlp->threshold.scans++; | |
1490 | ||
1491 | /* | |
1492 | * Maintain a moving average of our wakeup latency. | |
1493 | * Clamp latency to 0 and ignore values above the threshold interval. | |
1494 | */ | |
1495 | if (tlp->scan_time > tlp->threshold.deadline_set) { | |
1496 | latency = tlp->scan_time - tlp->threshold.deadline_set; | |
1497 | } else { | |
1498 | latency = 0; | |
1499 | } | |
1500 | if (latency < tlp->threshold.interval) { | |
1501 | tlp->threshold.latency_min = | |
1502 | MIN(tlp->threshold.latency_min, latency); | |
1503 | tlp->threshold.latency_max = | |
1504 | MAX(tlp->threshold.latency_max, latency); | |
1505 | tlp->threshold.latency = | |
1506 | (tlp->threshold.latency * 99 + latency) / 100; | |
1507 | } | |
1508 | ||
1509 | timer_longterm_scan(tlp, tlp->scan_time); | |
1510 | } | |
1511 | ||
1512 | tlp->threshold.deadline_set = tlp->threshold.deadline; | |
1513 | /* The next deadline timer to be set is adjusted */ | |
1514 | if (tlp->threshold.deadline != TIMER_LONGTERM_NONE && | |
1515 | tlp->threshold.deadline != TIMER_LONGTERM_SCAN_AGAIN) { | |
1516 | tlp->threshold.deadline_set -= tlp->threshold.margin; | |
1517 | tlp->threshold.deadline_set -= tlp->threshold.latency; | |
1518 | } | |
1519 | ||
1520 | /* Throttle next scan time */ | |
1521 | uint64_t scan_clamp = mach_absolute_time() + tlp->scan_interval; | |
1522 | if (tlp->threshold.deadline_set < scan_clamp) { | |
1523 | tlp->threshold.deadline_set = scan_clamp; | |
1524 | } | |
1525 | ||
1526 | TIMER_KDEBUG_TRACE(KDEBUG_TRACE, | |
1527 | DECR_TIMER_UPDATE | DBG_FUNC_END, | |
1528 | VM_KERNEL_UNSLIDE_OR_PERM(&tlp->queue), | |
1529 | tlp->threshold.deadline, | |
1530 | tlp->threshold.scans, | |
1531 | tlp->queue.count, 0); | |
1532 | } | |
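/*
 * The wakeup-latency estimate maintained above is an exponential moving
 * average with a 1% weight per sample: avg' = (avg * 99 + sample) / 100.
 * For example, an average of 100us that observes a 300us sample moves to
 * (100 * 99 + 300) / 100 = 102us. That estimate, plus the fixed margin, is
 * what gets subtracted from the threshold deadline so the rescan fires
 * early enough to escalate timers before they are due. A standalone sketch
 * of the same update (illustrative only):
 */
#if 0
static uint64_t
ewma_latency_sketch(uint64_t avg, uint64_t sample, uint64_t interval)
{
	/* Outliers beyond the threshold interval are ignored, as above */
	if (sample < interval) {
		avg = (avg * 99 + sample) / 100;
	}
	return avg;
}
#endif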
1533 | ||
1534 | void | |
1535 | timer_longterm_update(timer_longterm_t *tlp) | |
1536 | { | |
1537 | spl_t s = splclock(); | |
1538 | ||
1539 | timer_queue_lock_spin(timer_longterm_queue); | |
1540 | ||
1541 | if (cpu_number() != master_cpu) { | |
1542 | panic("timer_longterm_update_master() on non-boot cpu"); | |
1543 | } | |
1544 | ||
1545 | timer_longterm_update_locked(tlp); | |
1546 | ||
1547 | if (tlp->threshold.deadline != TIMER_LONGTERM_NONE) { | |
1548 | timer_call_enter( | |
1549 | &tlp->threshold.timer, | |
1550 | tlp->threshold.deadline_set, | |
1551 | TIMER_CALL_LOCAL | TIMER_CALL_SYS_CRITICAL); | |
1552 | } | |
1553 | ||
1554 | timer_queue_unlock(timer_longterm_queue); | |
1555 | splx(s); | |
1556 | } | |
1557 | ||
1558 | void | |
1559 | timer_longterm_init(void) | |
1560 | { | |
1561 | uint32_t longterm; | |
1562 | timer_longterm_t *tlp = &timer_longterm; | |
1563 | ||
1564 | DBG("timer_longterm_init() tlp: %p, queue: %p\n", tlp, &tlp->queue); | |
1565 | ||
1566 | /* | |
1567 | * Set the longterm timer threshold. Defaults to TIMER_LONGTERM_THRESHOLD, | |
1568 | * or to TIMER_LONGTERM_NONE (disabled) in server performance mode; | |
1569 | * either can be overridden with the "longterm" boot-arg (milliseconds, 0 disables). | |
1570 | */ | |
1571 | tlp->threshold.interval = serverperfmode ? TIMER_LONGTERM_NONE | |
1572 | : TIMER_LONGTERM_THRESHOLD; | |
1573 | if (PE_parse_boot_argn("longterm", &longterm, sizeof(longterm))) { | |
1574 | tlp->threshold.interval = (longterm == 0) ? | |
1575 | TIMER_LONGTERM_NONE : | |
1576 | longterm * NSEC_PER_MSEC; | |
1577 | } | |
1578 | if (tlp->threshold.interval != TIMER_LONGTERM_NONE) { | |
1579 | printf("Longterm timer threshold: %llu ms\n", | |
1580 | tlp->threshold.interval / NSEC_PER_MSEC); | |
1581 | kprintf("Longterm timer threshold: %llu ms\n", | |
1582 | tlp->threshold.interval / NSEC_PER_MSEC); | |
1583 | nanoseconds_to_absolutetime(tlp->threshold.interval, | |
1584 | &tlp->threshold.interval); | |
1585 | tlp->threshold.margin = tlp->threshold.interval / 10; | |
1586 | tlp->threshold.latency_min = EndOfAllTime; | |
1587 | tlp->threshold.latency_max = 0; | |
1588 | } | |
1589 | ||
1590 | tlp->threshold.preempted = TIMER_LONGTERM_NONE; | |
1591 | tlp->threshold.deadline = TIMER_LONGTERM_NONE; | |
1592 | ||
1593 | lck_attr_setdefault(&timer_longterm_lck_attr); | |
1594 | lck_grp_attr_setdefault(&timer_longterm_lck_grp_attr); | |
1595 | lck_grp_init(&timer_longterm_lck_grp, | |
1596 | "timer_longterm", &timer_longterm_lck_grp_attr); | |
1597 | mpqueue_init(&tlp->queue, | |
1598 | &timer_longterm_lck_grp, &timer_longterm_lck_attr); | |
1599 | ||
1600 | timer_call_setup(&tlp->threshold.timer, | |
1601 | timer_longterm_callout, (timer_call_param_t) tlp); | |
1602 | ||
1603 | timer_longterm_queue = &tlp->queue; | |
1604 | } | |
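/*
 * Boot-arg examples (hypothetical values): "longterm=5000" sets a 5 s
 * threshold, so only timers more than roughly 5 s out are parked on the
 * longterm queue; "longterm=0" disables longterm parking entirely. A
 * minimal sketch of the same millisecond-to-abstime conversion performed
 * above:
 */
#if 0
static uint64_t
longterm_ms_to_abstime_sketch(uint32_t ms)
{
	uint64_t	abstime;

	nanoseconds_to_absolutetime((uint64_t)ms * NSEC_PER_MSEC, &abstime);
	return abstime;
}
#endif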
1605 | ||
1606 | enum { | |
1607 | THRESHOLD, QCOUNT, | |
1608 | ENQUEUES, DEQUEUES, ESCALATES, SCANS, PREEMPTS, | |
1609 | LATENCY, LATENCY_MIN, LATENCY_MAX, SCAN_LIMIT, SCAN_INTERVAL, PAUSES | |
1610 | }; | |
1611 | uint64_t | |
1612 | timer_sysctl_get(int oid) | |
1613 | { | |
1614 | timer_longterm_t *tlp = &timer_longterm; | |
1615 | ||
1616 | switch (oid) { | |
1617 | case THRESHOLD: | |
1618 | return (tlp->threshold.interval == TIMER_LONGTERM_NONE) ? | |
1619 | 0 : tlp->threshold.interval / NSEC_PER_MSEC; | |
1620 | case QCOUNT: | |
1621 | return tlp->queue.count; | |
1622 | case ENQUEUES: | |
1623 | return tlp->enqueues; | |
1624 | case DEQUEUES: | |
1625 | return tlp->dequeues; | |
1626 | case ESCALATES: | |
1627 | return tlp->escalates; | |
1628 | case SCANS: | |
1629 | return tlp->threshold.scans; | |
1630 | case PREEMPTS: | |
1631 | return tlp->threshold.preempts; | |
1632 | case LATENCY: | |
1633 | return tlp->threshold.latency; | |
1634 | case LATENCY_MIN: | |
1635 | return tlp->threshold.latency_min; | |
1636 | case LATENCY_MAX: | |
1637 | return tlp->threshold.latency_max; | |
1638 | case SCAN_LIMIT: | |
1639 | return tlp->scan_limit; | |
1640 | case SCAN_INTERVAL: | |
1641 | return tlp->scan_interval; | |
1642 | case PAUSES: | |
1643 | return tlp->scan_pauses; | |
1644 | default: | |
1645 | return 0; | |
1646 | } | |
1647 | } | |
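/*
 * A sketch of how the accessor above might be consumed by debug code; the
 * sysctl glue that actually exports these counters lives outside this
 * file, so the wrapper below is illustrative only:
 */
#if 0
static void
timer_longterm_dump_sketch(void)
{
	printf("longterm: threshold %llu ms, qlen %llu, scans %llu, "
	    "preempts %llu, pauses %llu\n",
	    timer_sysctl_get(THRESHOLD),
	    timer_sysctl_get(QCOUNT),
	    timer_sysctl_get(SCANS),
	    timer_sysctl_get(PREEMPTS),
	    timer_sysctl_get(PAUSES));
}
#endif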
1648 | ||
1649 | /* | |
1650 | * timer_master_scan() is the inverse of timer_longterm_scan() | |
1651 | * since it un-escalates timers to the longterm queue. | |
1652 | */ | |
1653 | static void | |
1654 | timer_master_scan(timer_longterm_t *tlp, | |
1655 | uint64_t now) | |
1656 | { | |
1657 | queue_entry_t qe; | |
1658 | timer_call_t call; | |
1659 | uint64_t threshold; | |
1660 | uint64_t deadline; | |
1661 | mpqueue_head_t *timer_master_queue; | |
1662 | ||
1663 | if (tlp->threshold.interval != TIMER_LONGTERM_NONE) { | |
1664 | threshold = now + tlp->threshold.interval; | |
1665 | } else { | |
1666 | threshold = TIMER_LONGTERM_NONE; | |
1667 | } | |
1668 | ||
1669 | timer_master_queue = timer_queue_cpu(master_cpu); | |
1670 | timer_queue_lock_spin(timer_master_queue); | |
1671 | ||
1672 | qe = queue_first(&timer_master_queue->head); | |
1673 | while (!queue_end(&timer_master_queue->head, qe)) { | |
1674 | call = TIMER_CALL(qe); | |
1675 | deadline = TCE(call)->deadline; | |
1676 | qe = queue_next(qe); | |
1677 | if ((call->flags & TIMER_CALL_LOCAL) != 0) { | |
1678 | continue; | |
1679 | } | |
1680 | if (!simple_lock_try(&call->lock, LCK_GRP_NULL)) { | |
1681 | /* case (2c) lock order inversion, dequeue only */ | |
1682 | timer_call_entry_dequeue_async(call); | |
1683 | continue; | |
1684 | } | |
1685 | if (deadline > threshold) { | |
1686 | /* move from master to longterm */ | |
1687 | timer_call_entry_dequeue(call); | |
1688 | timer_call_entry_enqueue_tail(call, timer_longterm_queue); | |
1689 | if (deadline < tlp->threshold.deadline) { | |
1690 | tlp->threshold.deadline = deadline; | |
1691 | tlp->threshold.call = call; | |
1692 | } | |
1693 | } | |
1694 | simple_unlock(&call->lock); | |
1695 | } | |
1696 | timer_queue_unlock(timer_master_queue); | |
1697 | } | |
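/*
 * Both scans use the same escalation predicate: a timer is "longterm" when
 * its hard deadline lies more than the threshold interval in the future.
 * A sketch of that predicate (TIMER_LONGTERM_NONE meaning "never park"):
 */
#if 0
static boolean_t
timer_is_longterm_sketch(uint64_t deadline, uint64_t now, uint64_t interval)
{
	if (interval == TIMER_LONGTERM_NONE) {
		return FALSE;
	}
	return (deadline > (now + interval)) ? TRUE : FALSE;
}
#endif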
1698 | ||
1699 | static void | |
1700 | timer_sysctl_set_threshold(uint64_t value) | |
1701 | { | |
1702 | timer_longterm_t *tlp = &timer_longterm; | |
1703 | spl_t s = splclock(); | |
1704 | boolean_t threshold_increase; | |
1705 | ||
1706 | timer_queue_lock_spin(timer_longterm_queue); | |
1707 | ||
1708 | timer_call_cancel(&tlp->threshold.timer); | |
1709 | ||
1710 | /* | |
1711 | * Set the new threshold and note whether it's increasing. | |
1712 | */ | |
1713 | if (value == 0) { | |
1714 | tlp->threshold.interval = TIMER_LONGTERM_NONE; | |
1715 | threshold_increase = TRUE; | |
1716 | timer_call_cancel(&tlp->threshold.timer); | |
1717 | } else { | |
1718 | uint64_t old_interval = tlp->threshold.interval; | |
1719 | tlp->threshold.interval = value * NSEC_PER_MSEC; | |
1720 | nanoseconds_to_absolutetime(tlp->threshold.interval, | |
1721 | &tlp->threshold.interval); | |
1722 | tlp->threshold.margin = tlp->threshold.interval / 10; | |
1723 | if (old_interval == TIMER_LONGTERM_NONE) { | |
1724 | threshold_increase = FALSE; | |
1725 | } else { | |
1726 | threshold_increase = (tlp->threshold.interval > old_interval); | |
1727 | } | |
1728 | } | |
1729 | ||
1730 | if (threshold_increase /* or removal */) { | |
1731 | /* Escalate timers from the longterm queue */ | |
1732 | timer_longterm_scan(tlp, mach_absolute_time()); | |
1733 | } else { /* decrease or addition */ | |
1734 | /* | |
1735 | * We scan the local/master queue for timers that are now longterm. | |
1736 | * To be strictly correct, we should scan all processor queues | |
1737 | * but timer migration results in most timers gravitating to the | |
1738 | * master processor in any case. | |
1739 | */ | |
1740 | timer_master_scan(tlp, mach_absolute_time()); | |
1741 | } | |
1742 | ||
1743 | /* Set new timer accordingly */ | |
1744 | tlp->threshold.deadline_set = tlp->threshold.deadline; | |
1745 | if (tlp->threshold.deadline != TIMER_LONGTERM_NONE) { | |
1746 | tlp->threshold.deadline_set -= tlp->threshold.margin; | |
1747 | tlp->threshold.deadline_set -= tlp->threshold.latency; | |
1748 | timer_call_enter( | |
1749 | &tlp->threshold.timer, | |
1750 | tlp->threshold.deadline_set, | |
1751 | TIMER_CALL_LOCAL | TIMER_CALL_SYS_CRITICAL); | |
1752 | } | |
1753 | ||
1754 | /* Reset stats */ | |
1755 | tlp->enqueues = 0; | |
1756 | tlp->dequeues = 0; | |
1757 | tlp->escalates = 0; | |
1758 | tlp->scan_pauses = 0; | |
1759 | tlp->threshold.scans = 0; | |
1760 | tlp->threshold.preempts = 0; | |
1761 | tlp->threshold.latency = 0; | |
1762 | tlp->threshold.latency_min = EndOfAllTime; | |
1763 | tlp->threshold.latency_max = 0; | |
1764 | ||
1765 | timer_queue_unlock(timer_longterm_queue); | |
1766 | splx(s); | |
1767 | } | |
1768 | ||
1769 | int | |
1770 | timer_sysctl_set(int oid, uint64_t value) | |
1771 | { | |
1772 | switch (oid) { | |
1773 | case THRESHOLD: | |
1774 | timer_call_cpu( | |
1775 | master_cpu, | |
1776 | (void (*)(void *))timer_sysctl_set_threshold, | |
1777 | (void *) value); | |
1778 | return KERN_SUCCESS; | |
1779 | case SCAN_LIMIT: | |
1780 | timer_longterm.scan_limit = value; | |
1781 | return KERN_SUCCESS; | |
1782 | case SCAN_INTERVAL: | |
1783 | timer_longterm.scan_interval = value; | |
1784 | return KERN_SUCCESS; | |
1785 | default: | |
1786 | return KERN_INVALID_ARGUMENT; | |
1787 | } | |
1788 | } | |
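/*
 * THRESHOLD changes are trampolined to the boot processor via
 * timer_call_cpu() because the longterm machinery runs only there, while
 * SCAN_LIMIT and SCAN_INTERVAL are simple stores. A hypothetical caller
 * moving the threshold to 2 s (THRESHOLD is in milliseconds; 0 disables):
 */
#if 0
static void
timer_longterm_tune_sketch(void)
{
	(void) timer_sysctl_set(THRESHOLD, 2000);
}
#endif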
1789 | ||
1790 | ||
1791 | /* Select timer coalescing window based on per-task quality-of-service hints */ | |
1792 | static boolean_t | |
1793 | tcoal_qos_adjust(thread_t t, int32_t *tshift, uint64_t *tmax_abstime, boolean_t *pratelimited) | |
1794 | { | |
1795 | uint32_t latency_qos; | |
1796 | boolean_t adjusted = FALSE; | |
1797 | task_t ctask = t->task; | |
1798 | ||
1799 | if (ctask) { | |
1800 | latency_qos = proc_get_effective_thread_policy(t, TASK_POLICY_LATENCY_QOS); | |
1801 | ||
1802 | assert(latency_qos <= NUM_LATENCY_QOS_TIERS); | |
1803 | ||
1804 | if (latency_qos) { | |
1805 | *tshift = tcoal_prio_params.latency_qos_scale[latency_qos - 1]; | |
1806 | *tmax_abstime = tcoal_prio_params.latency_qos_abstime_max[latency_qos - 1]; | |
1807 | *pratelimited = tcoal_prio_params.latency_tier_rate_limited[latency_qos - 1]; | |
1808 | adjusted = TRUE; | |
1809 | } | |
1810 | } | |
1811 | return adjusted; | |
1812 | } | |
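/*
 * tcoal_qos_adjust() simply indexes the per-tier tables in
 * tcoal_prio_params with (latency_qos - 1); a latency QoS of 0 leaves the
 * caller's defaults untouched. A usage sketch (illustrative only):
 */
#if 0
static void
qos_tier_example_sketch(thread_t t)
{
	int32_t		shift = 0;
	uint64_t	max_abstime = 0;
	boolean_t	ratelimited = FALSE;

	if (tcoal_qos_adjust(t, &shift, &max_abstime, &ratelimited)) {
		DBG("qos coalescing: shift %d max %llu ratelimited %d\n",
		    shift, max_abstime, ratelimited);
	}
}
#endif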
1813 | ||
1814 | ||
1815 | /* Adjust timer deadlines based on priority of the thread and the | |
1816 | * urgency value provided at timeout establishment. With this mechanism, | |
1817 | * timers are no longer necessarily sorted in order of soft deadline | |
1818 | * on a given timer queue, i.e. they may be differentially skewed. | |
1819 | * In the current scheme, this could lead to fewer pending timers | |
1820 | * processed than is technically possible when the HW deadline arrives. | |
1821 | */ | |
1822 | static void | |
1823 | timer_compute_leeway(thread_t cthread, int32_t urgency, int32_t *tshift, uint64_t *tmax_abstime, boolean_t *pratelimited) | |
1824 | { | |
1825 | int16_t tpri = cthread->sched_pri; | |
1826 | if ((urgency & TIMER_CALL_USER_MASK) != 0) { | |
1827 | if (tpri >= BASEPRI_RTQUEUES || | |
1828 | urgency == TIMER_CALL_USER_CRITICAL) { | |
1829 | *tshift = tcoal_prio_params.timer_coalesce_rt_shift; | |
1830 | *tmax_abstime = tcoal_prio_params.timer_coalesce_rt_abstime_max; | |
1831 | TCOAL_PRIO_STAT(rt_tcl); | |
1832 | } else if (proc_get_effective_thread_policy(cthread, TASK_POLICY_DARWIN_BG) || | |
1833 | (urgency == TIMER_CALL_USER_BACKGROUND)) { | |
1834 | /* Determine if timer should be subjected to a lower QoS */ | |
1835 | if (tcoal_qos_adjust(cthread, tshift, tmax_abstime, pratelimited)) { | |
1836 | if (*tmax_abstime > tcoal_prio_params.timer_coalesce_bg_abstime_max) { | |
1837 | return; | |
1838 | } else { | |
1839 | *pratelimited = FALSE; | |
1840 | } | |
1841 | } | |
1842 | *tshift = tcoal_prio_params.timer_coalesce_bg_shift; | |
1843 | *tmax_abstime = tcoal_prio_params.timer_coalesce_bg_abstime_max; | |
1844 | TCOAL_PRIO_STAT(bg_tcl); | |
1845 | } else if (tpri >= MINPRI_KERNEL) { | |
1846 | *tshift = tcoal_prio_params.timer_coalesce_kt_shift; | |
1847 | *tmax_abstime = tcoal_prio_params.timer_coalesce_kt_abstime_max; | |
1848 | TCOAL_PRIO_STAT(kt_tcl); | |
1849 | } else if (cthread->sched_mode == TH_MODE_FIXED) { | |
1850 | *tshift = tcoal_prio_params.timer_coalesce_fp_shift; | |
1851 | *tmax_abstime = tcoal_prio_params.timer_coalesce_fp_abstime_max; | |
1852 | TCOAL_PRIO_STAT(fp_tcl); | |
1853 | } else if (tcoal_qos_adjust(cthread, tshift, tmax_abstime, pratelimited)) { | |
1854 | TCOAL_PRIO_STAT(qos_tcl); | |
1855 | } else if (cthread->sched_mode == TH_MODE_TIMESHARE) { | |
1856 | *tshift = tcoal_prio_params.timer_coalesce_ts_shift; | |
1857 | *tmax_abstime = tcoal_prio_params.timer_coalesce_ts_abstime_max; | |
1858 | TCOAL_PRIO_STAT(ts_tcl); | |
1859 | } else { | |
1860 | TCOAL_PRIO_STAT(nc_tcl); | |
1861 | } | |
1862 | } else if (urgency == TIMER_CALL_SYS_BACKGROUND) { | |
1863 | *tshift = tcoal_prio_params.timer_coalesce_bg_shift; | |
1864 | *tmax_abstime = tcoal_prio_params.timer_coalesce_bg_abstime_max; | |
1865 | TCOAL_PRIO_STAT(bg_tcl); | |
1866 | } else { | |
1867 | *tshift = tcoal_prio_params.timer_coalesce_kt_shift; | |
1868 | *tmax_abstime = tcoal_prio_params.timer_coalesce_kt_abstime_max; | |
1869 | TCOAL_PRIO_STAT(kt_tcl); | |
1870 | } | |
1871 | } | |
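/*
 * Summary of the selection above (the shift/max values themselves come
 * from tcoal_prio_params):
 *
 *	user urgency, RT priority or USER_CRITICAL	-> rt shift/max
 *	user urgency, DARWIN_BG or USER_BACKGROUND	-> bg shift/max
 *	   (a latency QoS tier allowing more coalescing takes precedence)
 *	user urgency, kernel priority			-> kt shift/max
 *	user urgency, fixed-priority sched mode		-> fp shift/max
 *	user urgency, latency QoS tier present		-> per-tier shift/max
 *	user urgency, timeshare sched mode		-> ts shift/max
 *	TIMER_CALL_SYS_BACKGROUND			-> bg shift/max
 *	all other kernel urgencies			-> kt shift/max
 */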
1872 | ||
1873 | ||
1874 | int timer_user_idle_level; | |
1875 | ||
1876 | uint64_t | |
1877 | timer_call_slop(uint64_t deadline, uint64_t now, uint32_t flags, thread_t cthread, boolean_t *pratelimited) | |
1878 | { | |
1879 | int32_t tcs_shift = 0; | |
1880 | uint64_t tcs_max_abstime = 0; | |
1881 | uint64_t adjval; | |
1882 | uint32_t urgency = (flags & TIMER_CALL_URGENCY_MASK); | |
1883 | ||
1884 | if (mach_timer_coalescing_enabled && | |
1885 | (deadline > now) && (urgency != TIMER_CALL_SYS_CRITICAL)) { | |
1886 | timer_compute_leeway(cthread, urgency, &tcs_shift, &tcs_max_abstime, pratelimited); | |
1887 | ||
1888 | if (tcs_shift >= 0) { | |
1889 | adjval = MIN((deadline - now) >> tcs_shift, tcs_max_abstime); | |
1890 | } else { | |
1891 | adjval = MIN((deadline - now) << (-tcs_shift), tcs_max_abstime); | |
1892 | } | |
1893 | /* Apply adjustments derived from "user idle level" heuristic */ | |
1894 | adjval += (adjval * timer_user_idle_level) >> 7; | |
1895 | return adjval; | |
1896 | } else { | |
1897 | return 0; | |
1898 | } | |
1899 | } | |
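/*
 * Worked example with hypothetical numbers: for a deadline 8 ms out, a
 * coalescing shift of 3 and a 1 ms cap, the raw slop is
 * min(8 ms >> 3, 1 ms) = 1 ms; with timer_user_idle_level at 64 the idle
 * adjustment adds (1 ms * 64) >> 7 = 0.5 ms, for 1.5 ms of total leeway.
 * A caller-side sketch:
 */
#if 0
static uint64_t
slop_example_sketch(uint64_t deadline)
{
	boolean_t	ratelimited = FALSE;
	uint64_t	now = mach_absolute_time();

	return timer_call_slop(deadline, now, TIMER_CALL_USER_NORMAL,
	    current_thread(), &ratelimited);
}
#endif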
1900 | ||
1901 | int | |
1902 | timer_get_user_idle_level(void) | |
1903 | { | |
1904 | return timer_user_idle_level; | |
1905 | } | |
1906 | ||
1907 | kern_return_t | |
1908 | timer_set_user_idle_level(int ilevel) | |
1909 | { | |
1910 | boolean_t do_reeval = FALSE; | |
1911 | ||
1912 | if ((ilevel < 0) || (ilevel > 128)) { | |
1913 | return KERN_INVALID_ARGUMENT; | |
1914 | } | |
1915 | ||
1916 | if (ilevel < timer_user_idle_level) { | |
1917 | do_reeval = TRUE; | |
1918 | } | |
1919 | ||
1920 | timer_user_idle_level = ilevel; | |
1921 | ||
1922 | if (do_reeval) { | |
1923 | ml_timer_evaluate(); | |
1924 | } | |
1925 | ||
1926 | return KERN_SUCCESS; | |
1927 | } |
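/*
 * The idle level scales coalescing windows by up to +100% (level 128, via
 * the ">> 7" in timer_call_slop() above); lowering the level triggers
 * ml_timer_evaluate() so an already-programmed hardware deadline can be
 * pulled in. A hypothetical caller:
 */
#if 0
static void
idle_level_example_sketch(boolean_t display_on)
{
	/* e.g. allow wider timer slop while the display is off */
	(void) timer_set_user_idle_level(display_on ? 0 : 128);
}
#endif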