/*
 * Copyright (c) 2011-2018 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * Profile Every Thread (PET) provides a profile of all threads on the system
 * when a timer fires. PET supports the "record waiting threads" mode in
 * Instruments, and used to be called All Thread States (ATS). New tools should
 * adopt the lightweight PET mode, which provides the same information, but with
 * much less overhead.
 *
 * When traditional (non-lightweight) PET is active, a migrating timer call
 * causes the PET thread to wake up. The timer handler also issues a broadcast
 * IPI to the other CPUs, to provide a (somewhat) synchronized set of on-core
 * samples. This is provided for backwards-compatibility with clients that
 * expect on-core samples, when PET's timer was based off the on-core timers.
 * Because PET sampling can take on the order of milliseconds, the PET thread
 * will enter a new timer deadline after it finishes sampling. This perturbs the
 * timer cadence by the duration of PET sampling, but it leaves the system to
 * work on non-profiling tasks for the duration of the timer period.
 *
 * Lightweight PET samples the system less-intrusively than normal PET
 * mode. Instead of iterating tasks and threads on each sample, it increments
 * a global generation count, `kppet_gencount`, which is checked as threads are
 * context switched on-core. If the thread's local generation count is older
 * than the global generation, the thread samples itself.
 *
 *            |  |
 * thread A   +--+---------|
 *            |  |
 * thread B   |--+---------------|
 *            |  |
 * thread C   |  |          |-------------------------------------
 *            |  |          |
 * thread D   |  |          |     |-------------------------------
 *            |  |          |     |
 *            +--+----------+-----+--------------------------------> time
 *               |          |     |
 *               |          +-----+--- threads sampled when they come on-core in
 *               |                     kperf_pet_switch_context
 *               |
 *               +--- PET timer fire, sample on-core threads A and B,
 *                    increment kppet_gencount
 */
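
/*
 * To make the scheme above concrete, here is a minimal sketch (illustrative
 * only -- `my_thread`, `sample_self`, and `on_pet_timer_fire` are hypothetical
 * names, not kernel interfaces) of the generation-count handshake between the
 * PET timer and context-switching threads:
 *
 *	_Atomic uint32_t gencount;
 *
 *	void on_context_switch(struct my_thread *t) {
 *		uint32_t cur = atomic_load(&gencount);
 *		if (t->local_gen != cur) {
 *			sample_self(t);      // thread samples itself on-core
 *			t->local_gen = cur;  // catch up to the global generation
 *		}
 *	}
 *
 *	void on_pet_timer_fire(void) {
 *		atomic_fetch_add(&gencount, 1);  // invalidate every local_gen
 *	}
 */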

#include <mach/mach_types.h>
#include <sys/errno.h>

#include <kperf/kperf.h>
#include <kperf/buffer.h>
#include <kperf/sample.h>
#include <kperf/context.h>
#include <kperf/action.h>
#include <kperf/pet.h>
#include <kperf/kptimer.h>

#include <kern/task.h>
#include <kern/kalloc.h>
#if defined(__x86_64__)
#endif /* defined(__x86_64__) */

static LCK_MTX_DECLARE(kppet_mtx, &kperf_lck_grp);

static struct kppet {
	/*
	 * The action ID to sample.
	 */
	unsigned int g_actionid;

	/*
	 * The idle rate controls how many sampling periods to skip if a thread
	 * is idle.
	 */
	uint32_t g_idle_rate;
	bool g_setup;
	bool g_lightweight;
	struct kperf_sample *g_sample;

	thread_t g_sample_thread;

	/*
	 * Used by the PET thread to manage which threads and tasks to sample.
	 */
	thread_t *g_threads;
	unsigned int g_nthreads;
	size_t g_threads_size;

	task_t *g_tasks;
	unsigned int g_ntasks;
	size_t g_tasks_size;
} kppet = {
	.g_idle_rate = KPERF_PET_DEFAULT_IDLE_RATE,
};

bool kppet_lightweight_active = false;
_Atomic uint32_t kppet_gencount = 0;
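
/*
 * Worked example of the idle rate (the value 10 below is illustrative, not
 * the actual default): with g_idle_rate = 10, a thread that stays idle (never
 * marked dirty) between PET samples gets a full callstack in only one of
 * every ten samples; the other nine are emitted by kppet_sample_thread with
 * SAMPLE_FLAG_EMPTY_CALLSTACK.
 */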

static uint64_t kppet_sample_tasks(uint32_t idle_rate);
static void kppet_thread(void * param, wait_result_t wr);

static void
kppet_lock_assert_owned(void)
{
	lck_mtx_assert(&kppet_mtx, LCK_MTX_ASSERT_OWNED);
}

static void
kppet_lock(void)
{
	lck_mtx_lock(&kppet_mtx);
}

static void
kppet_unlock(void)
{
	lck_mtx_unlock(&kppet_mtx);
}

void
kppet_on_cpu(thread_t thread, thread_continue_t continuation,
    uintptr_t *starting_fp)
{
	assert(thread != NULL);
	assert(ml_get_interrupts_enabled() == FALSE);

	uint32_t actionid = kppet.g_actionid;
	if (actionid == 0) {
		return;
	}

	if (thread->kperf_pet_gen != atomic_load(&kppet_gencount)) {
		BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_START,
		    atomic_load_explicit(&kppet_gencount,
		    memory_order_relaxed), thread->kperf_pet_gen);

		task_t task = get_threadtask(thread);
		struct kperf_context ctx = {
			.cur_thread = thread,
			.cur_task = task,
			.cur_pid = task_pid(task),
			.starting_fp = starting_fp,
		};
		/*
		 * Use a per-CPU interrupt buffer, since this is only called
		 * while interrupts are disabled, from the scheduler.
		 */
		struct kperf_sample *sample = kperf_intr_sample_buffer();
		if (sample == NULL) {
			BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_END, 1);
			return;
		}

		unsigned int flags = SAMPLE_FLAG_NON_INTERRUPT | SAMPLE_FLAG_PEND_USER;
		if (continuation != NULL) {
			flags |= SAMPLE_FLAG_CONTINUATION;
		}
		kperf_sample(sample, &ctx, actionid, flags);

		BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_END);
	} else {
		BUF_VERB(PERF_PET_SAMPLE_THREAD,
		    os_atomic_load(&kppet_gencount, relaxed), thread->kperf_pet_gen);
	}
}

#pragma mark - state transitions

/*
 * Lazily initialize PET. The PET thread never exits once PET has been used.
 */
static void
kppet_setup(void)
{
	if (kppet.g_setup) {
		return;
	}

	kern_return_t kr = kernel_thread_start(kppet_thread, NULL,
	    &kppet.g_sample_thread);
	if (kr != KERN_SUCCESS) {
		panic("kperf: failed to create PET thread %d", kr);
	}

	thread_set_thread_name(kppet.g_sample_thread, "kperf-pet-sampling");
	kppet.g_setup = true;
}

void
kppet_config(unsigned int actionid)
{
	/*
	 * Resetting kperf shouldn't get the PET thread started.
	 */
	if (actionid == 0 && !kppet.g_setup) {
		return;
	}

	kppet_setup();

	kppet_lock();

	kppet.g_actionid = actionid;

	if (actionid > 0) {
		if (!kppet.g_sample) {
			kppet.g_sample = kalloc_tag(sizeof(*kppet.g_sample),
			    VM_KERN_MEMORY_DIAG);
		}
	} else {
		if (kppet.g_tasks) {
			assert(kppet.g_tasks_size != 0);
			kfree(kppet.g_tasks, kppet.g_tasks_size);
			kppet.g_tasks = NULL;
			kppet.g_tasks_size = 0;
			kppet.g_ntasks = 0;
		}
		if (kppet.g_threads) {
			assert(kppet.g_threads_size != 0);
			kfree(kppet.g_threads, kppet.g_threads_size);
			kppet.g_threads = NULL;
			kppet.g_threads_size = 0;
			kppet.g_nthreads = 0;
		}
		if (kppet.g_sample != NULL) {
			kfree(kppet.g_sample, sizeof(*kppet.g_sample));
			kppet.g_sample = NULL;
		}
	}

	kppet_unlock();
}

void
kppet_reset(void)
{
	kppet_config(0);
	kppet_set_idle_rate(KPERF_PET_DEFAULT_IDLE_RATE);
	kppet_set_lightweight_pet(0);
}

void
kppet_wake_thread(void)
{
	thread_wakeup(&kppet);
}

__attribute__((noreturn))
static void
kppet_thread(void * __unused param, wait_result_t __unused wr)
{
	kppet_lock();

	for (;;) {
		BUF_INFO(PERF_PET_IDLE);

		do {
			(void)lck_mtx_sleep(&kppet_mtx, LCK_SLEEP_DEFAULT, &kppet,
			    THREAD_UNINT);
		} while (kppet.g_actionid == 0);

		BUF_INFO(PERF_PET_RUN);

		uint64_t sampledur_abs = kppet_sample_tasks(kppet.g_idle_rate);

		kptimer_pet_enter(sampledur_abs);
	}
}

#pragma mark - sampling

static void
kppet_sample_thread(int pid, task_t task, thread_t thread, uint32_t idle_rate)
{
	kppet_lock_assert_owned();

	uint32_t sample_flags = SAMPLE_FLAG_IDLE_THREADS |
	    SAMPLE_FLAG_THREAD_ONLY;

	BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_START);

	struct kperf_context ctx = {
		.cur_thread = thread,
		.cur_task = task,
		.cur_pid = pid,
	};

	boolean_t thread_dirty = kperf_thread_get_dirty(thread);

	/*
	 * Clean a dirty thread, and skip the callstack sample if the thread
	 * was not dirty and has skipped fewer than `idle_rate` samples.
	 */
	if (thread_dirty) {
		kperf_thread_set_dirty(thread, FALSE);
	} else if ((thread->kperf_pet_cnt % idle_rate) != 0) {
		sample_flags |= SAMPLE_FLAG_EMPTY_CALLSTACK;
	}
	thread->kperf_pet_cnt++;

	kperf_sample(kppet.g_sample, &ctx, kppet.g_actionid, sample_flags);
	kperf_sample_user(&kppet.g_sample->usample, &ctx, kppet.g_actionid,
	    sample_flags);

	BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_END);
}

static kern_return_t
kppet_threads_prepare(task_t task)
{
	kppet_lock_assert_owned();

	vm_size_t threads_size_needed;

	for (;;) {
		task_lock(task);

		/*
		 * With the task locked, figure out if enough space has been allocated to
		 * contain all of the thread references.
		 */
		threads_size_needed = task->thread_count * sizeof(thread_t);
		if (threads_size_needed <= kppet.g_threads_size) {
			break;
		}

		/*
		 * Otherwise, allocate more and try again.
		 */
		task_unlock(task);

		if (kppet.g_threads_size != 0) {
			kfree(kppet.g_threads, kppet.g_threads_size);
		}

		assert(threads_size_needed > 0);
		kppet.g_threads_size = threads_size_needed;

		kppet.g_threads = kalloc_tag(kppet.g_threads_size, VM_KERN_MEMORY_DIAG);
		if (kppet.g_threads == NULL) {
			kppet.g_threads_size = 0;
			return KERN_RESOURCE_SHORTAGE;
		}
	}

	thread_t thread = THREAD_NULL;

	kppet.g_nthreads = 0;
	queue_iterate(&(task->threads), thread, thread_t, task_threads) {
		thread_reference_internal(thread);
		kppet.g_threads[kppet.g_nthreads++] = thread;
	}

	task_unlock(task);

	return (kppet.g_nthreads > 0) ? KERN_SUCCESS : KERN_FAILURE;
}

/*
 * Sample a `task`, using `idle_rate` to control whether idle threads need to be
 * sampled.
 *
 * The task must be referenced.
 */
static void
kppet_sample_task(task_t task, uint32_t idle_rate)
{
	kppet_lock_assert_owned();
	assert(task != kernel_task);
	if (task == kernel_task) {
		return;
	}

	BUF_VERB(PERF_PET_SAMPLE_TASK | DBG_FUNC_START);

	int pid = task_pid(task);
	if (kperf_action_has_task(kppet.g_actionid)) {
		struct kperf_context ctx = {
			.cur_task = task,
			.cur_pid = pid,
		};

		kperf_sample(kppet.g_sample, &ctx, kppet.g_actionid,
		    SAMPLE_FLAG_TASK_ONLY);
	}

	if (!kperf_action_has_thread(kppet.g_actionid)) {
		BUF_VERB(PERF_PET_SAMPLE_TASK | DBG_FUNC_END);
		return;
	}

	/*
	 * Suspend the task to see an atomic snapshot of all its threads. This
	 * is expensive and disruptive.
	 */
	kern_return_t kr = task_suspend_internal(task);
	if (kr != KERN_SUCCESS) {
		BUF_VERB(PERF_PET_SAMPLE_TASK | DBG_FUNC_END, 1);
		return;
	}

	kr = kppet_threads_prepare(task);
	if (kr != KERN_SUCCESS) {
		BUF_INFO(PERF_PET_ERROR, ERR_THREAD, kr);
		goto out;
	}

	for (unsigned int i = 0; i < kppet.g_nthreads; i++) {
		thread_t thread = kppet.g_threads[i];
		assert(thread != THREAD_NULL);

		kppet_sample_thread(pid, task, thread, idle_rate);

		thread_deallocate(kppet.g_threads[i]);
	}

out:
	task_resume_internal(task);

	BUF_VERB(PERF_PET_SAMPLE_TASK | DBG_FUNC_END, kppet.g_nthreads);
}

/*
 * Store and reference all tasks on the system, so they can be safely inspected
 * outside the `tasks_threads_lock`.
 */
static kern_return_t
kppet_tasks_prepare(void)
{
	kppet_lock_assert_owned();

	vm_size_t size_needed = 0;

	for (;;) {
		lck_mtx_lock(&tasks_threads_lock);

		/*
		 * With the lock held, break out of the lock/unlock loop if
		 * there's enough space to store all the tasks.
		 */
		size_needed = tasks_count * sizeof(task_t);
		if (size_needed <= kppet.g_tasks_size) {
			break;
		}

		/*
		 * Otherwise, allocate more memory outside of the lock.
		 */
		lck_mtx_unlock(&tasks_threads_lock);

		if (size_needed > kppet.g_tasks_size) {
			if (kppet.g_tasks_size != 0) {
				kfree(kppet.g_tasks, kppet.g_tasks_size);
			}

			assert(size_needed > 0);
			kppet.g_tasks_size = size_needed;

			kppet.g_tasks = kalloc_tag(kppet.g_tasks_size, VM_KERN_MEMORY_DIAG);
			if (!kppet.g_tasks) {
				kppet.g_tasks_size = 0;
				return KERN_RESOURCE_SHORTAGE;
			}
		}
	}

	task_t task = TASK_NULL;
	kppet.g_ntasks = 0;

	queue_iterate(&tasks, task, task_t, tasks) {
		bool eligible_task = task != kernel_task;
		if (eligible_task) {
			task_reference_internal(task);
			kppet.g_tasks[kppet.g_ntasks++] = task;
		}
	}

	lck_mtx_unlock(&tasks_threads_lock);

	return KERN_SUCCESS;
}

static uint64_t
kppet_sample_tasks(uint32_t idle_rate)
{
	kppet_lock_assert_owned();
	assert(kppet.g_actionid > 0);

	uint64_t start_abs = mach_absolute_time();

	BUF_INFO(PERF_PET_SAMPLE | DBG_FUNC_START);

	kern_return_t kr = kppet_tasks_prepare();
	if (kr != KERN_SUCCESS) {
		BUF_INFO(PERF_PET_ERROR, ERR_TASK, kr);
		BUF_INFO(PERF_PET_SAMPLE | DBG_FUNC_END);
		return mach_absolute_time() - start_abs;
	}

	for (unsigned int i = 0; i < kppet.g_ntasks; i++) {
		task_t task = kppet.g_tasks[i];
		assert(task != TASK_NULL);

		kppet_sample_task(task, idle_rate);

		task_deallocate(task);
		kppet.g_tasks[i] = TASK_NULL;
	}

	BUF_INFO(PERF_PET_SAMPLE | DBG_FUNC_END, kppet.g_ntasks);

	return mach_absolute_time() - start_abs;
}

#pragma mark - sysctl accessors

int
kppet_get_idle_rate(void)
{
	return kppet.g_idle_rate;
}

int
kppet_set_idle_rate(int new_idle_rate)
{
	kppet.g_idle_rate = new_idle_rate;
	return 0;
}

void
kppet_lightweight_active_update(void)
{
	kppet_lightweight_active = (kperf_is_sampling() && kppet.g_lightweight);
	kperf_on_cpu_update();
}

int
kppet_get_lightweight_pet(void)
{
	return kppet.g_lightweight;
}

int
kppet_set_lightweight_pet(int on)
{
	if (kperf_is_sampling()) {
		return EBUSY;
	}

	kppet.g_lightweight = (on == 1);
	kppet_lightweight_active_update();
	return 0;
}