/*
 * Copyright (c) 2011-2018 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * Profile Every Thread (PET) provides a profile of all threads on the system
 * when a timer fires. PET supports the "record waiting threads" mode in
 * Instruments, and used to be called All Thread States (ATS). New tools should
 * adopt the lightweight PET mode, which provides the same information, but with
 * much less overhead.
 *
 * When traditional (non-lightweight) PET is active, a migrating timer call
 * causes the PET thread to wake up. The timer handler also issues a broadcast
 * IPI to the other CPUs, to provide a (somewhat) synchronized set of on-core
 * samples. This is provided for backwards-compatibility with clients that
 * expect on-core samples, when PET's timer was based off the on-core timers.
 * Because PET sampling can take on the order of milliseconds, the PET thread
 * will enter a new timer deadline after it finishes sampling. This perturbs the
 * timer cadence by the duration of PET sampling, but it leaves the system to
 * work on non-profiling tasks for the duration of the timer period.
 *
 * Lightweight PET samples the system less-intrusively than normal PET
 * mode. Instead of iterating tasks and threads on each sample, it increments
 * a global generation count, `kppet_gencount`, which is checked as threads are
 * context switched on-core. If the thread's local generation count is older
 * than the global generation, the thread samples itself.
 *
 *            |  |
 * thread A   +--+---------|
 *            |  |
 * thread B   |--+---------------|
 *            |  |
 * thread C   |  |          |-------------------------------------
 *            |  |          |
 * thread D   |  |          |     |-------------------------------
 *            |  |          |     |
 *            +--+----------+-----+--------------------------------> time
 *               |          |     |
 *               |          +-----+--- threads sampled when they come on-core in
 *               |                     kperf_pet_switch_context
 *               |
 *               +--- PET timer fire, sample on-core threads A and B,
 *                    increment kppet_gencount
 */
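
/*
 * To make the scheme above concrete, here is a minimal sketch (illustrative
 * only -- `my_thread`, `sample_self`, and `on_pet_timer_fire` are hypothetical
 * names, not kernel interfaces) of the generation-count handshake between the
 * PET timer and context-switching threads:
 *
 *	_Atomic uint32_t gencount;
 *
 *	void on_context_switch(struct my_thread *t) {
 *		uint32_t cur = atomic_load(&gencount);
 *		if (t->local_gen != cur) {
 *			sample_self(t);      // thread samples itself on-core
 *			t->local_gen = cur;  // catch up to the global generation
 *		}
 *	}
 *
 *	void on_pet_timer_fire(void) {
 *		atomic_fetch_add(&gencount, 1);  // invalidate every local_gen
 *	}
 */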

#include <mach/mach_types.h>
#include <sys/errno.h>

#include <kperf/kperf.h>
#include <kperf/buffer.h>
#include <kperf/sample.h>
#include <kperf/context.h>
#include <kperf/action.h>
#include <kperf/pet.h>
#include <kperf/kptimer.h>

#include <kern/task.h>
#include <kern/kalloc.h>
#if defined(__x86_64__)
#endif /* defined(__x86_64__) */

static LCK_MTX_DECLARE(kppet_mtx, &kperf_lck_grp);

static struct kppet {
	/*
	 * The action ID to sample.
	 */
	unsigned int g_actionid;

	/*
	 * The idle rate controls how many sampling periods to skip if a thread
	 * is idle.
	 */
	uint32_t g_idle_rate;
	bool g_setup;
	bool g_lightweight;
	struct kperf_sample *g_sample;

	thread_t g_sample_thread;

	/*
	 * Used by the PET thread to manage which threads and tasks to sample.
	 */
	thread_t *g_threads;
	unsigned int g_nthreads;
	size_t g_threads_size;

	task_t *g_tasks;
	unsigned int g_ntasks;
	size_t g_tasks_size;
} kppet = {
	.g_idle_rate = KPERF_PET_DEFAULT_IDLE_RATE,
};

bool kppet_lightweight_active = false;
_Atomic uint32_t kppet_gencount = 0;
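
/*
 * Worked example of the idle rate (the value 10 below is illustrative, not
 * the actual default): with g_idle_rate = 10, a thread that stays idle (never
 * marked dirty) between PET samples gets a full callstack in only one of
 * every ten samples; the other nine are emitted by kppet_sample_thread with
 * SAMPLE_FLAG_EMPTY_CALLSTACK.
 */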

static uint64_t kppet_sample_tasks(uint32_t idle_rate);
static void kppet_thread(void * param, wait_result_t wr);

static void
kppet_lock_assert_owned(void)
{
	lck_mtx_assert(&kppet_mtx, LCK_MTX_ASSERT_OWNED);
}

static void
kppet_lock(void)
{
	lck_mtx_lock(&kppet_mtx);
}

static void
kppet_unlock(void)
{
	lck_mtx_unlock(&kppet_mtx);
}

void
kppet_on_cpu(thread_t thread, thread_continue_t continuation,
    uintptr_t *starting_fp)
{
	assert(thread != NULL);
	assert(ml_get_interrupts_enabled() == FALSE);

	uint32_t actionid = kppet.g_actionid;
	if (actionid == 0) {
		return;
	}

	if (thread->kperf_pet_gen != atomic_load(&kppet_gencount)) {
		BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_START,
		    atomic_load_explicit(&kppet_gencount,
		    memory_order_relaxed), thread->kperf_pet_gen);

		task_t task = get_threadtask(thread);
		struct kperf_context ctx = {
			.cur_thread = thread,
			.cur_task = task,
			.cur_pid = task_pid(task),
			.starting_fp = starting_fp,
		};
		/*
		 * Use a per-CPU interrupt buffer, since this is only called
		 * while interrupts are disabled, from the scheduler.
		 */
		struct kperf_sample *sample = kperf_intr_sample_buffer();
		if (sample == NULL) {
			BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_END, 1);
			return;
		}

		unsigned int flags = SAMPLE_FLAG_NON_INTERRUPT | SAMPLE_FLAG_PEND_USER;
		if (continuation != NULL) {
			flags |= SAMPLE_FLAG_CONTINUATION;
		}
		kperf_sample(sample, &ctx, actionid, flags);

		BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_END);
	} else {
		BUF_VERB(PERF_PET_SAMPLE_THREAD,
		    os_atomic_load(&kppet_gencount, relaxed), thread->kperf_pet_gen);
	}
}

#pragma mark - state transitions

/*
 * Lazily initialize PET. The PET thread never exits once PET has been used.
 */
static void
kppet_setup(void)
{
	if (kppet.g_setup) {
		return;
	}

	kern_return_t kr = kernel_thread_start(kppet_thread, NULL,
	    &kppet.g_sample_thread);
	if (kr != KERN_SUCCESS) {
		panic("kperf: failed to create PET thread %d", kr);
	}

	thread_set_thread_name(kppet.g_sample_thread, "kperf-pet-sampling");
	kppet.g_setup = true;
}

void
kppet_config(unsigned int actionid)
{
	/*
	 * Resetting kperf shouldn't get the PET thread started.
	 */
	if (actionid == 0 && !kppet.g_setup) {
		return;
	}

	kppet_setup();

	kppet_lock();

	kppet.g_actionid = actionid;

	if (actionid > 0) {
		if (!kppet.g_sample) {
			kppet.g_sample = kalloc_tag(sizeof(*kppet.g_sample),
			    VM_KERN_MEMORY_DIAG);
		}
	} else {
		if (kppet.g_tasks) {
			assert(kppet.g_tasks_size != 0);
			kfree(kppet.g_tasks, kppet.g_tasks_size);
			kppet.g_tasks = NULL;
			kppet.g_tasks_size = 0;
			kppet.g_ntasks = 0;
		}
		if (kppet.g_threads) {
			assert(kppet.g_threads_size != 0);
			kfree(kppet.g_threads, kppet.g_threads_size);
			kppet.g_threads = NULL;
			kppet.g_threads_size = 0;
			kppet.g_nthreads = 0;
		}
		if (kppet.g_sample != NULL) {
			kfree(kppet.g_sample, sizeof(*kppet.g_sample));
			kppet.g_sample = NULL;
		}
	}

	kppet_unlock();
}

void
kppet_reset(void)
{
	kppet_config(0);
	kppet_set_idle_rate(KPERF_PET_DEFAULT_IDLE_RATE);
	kppet_set_lightweight_pet(0);
}

void
kppet_wake_thread(void)
{
	thread_wakeup(&kppet);
}

__attribute__((noreturn))
static void
kppet_thread(void * __unused param, wait_result_t __unused wr)
{
	kppet_lock();

	for (;;) {
		BUF_INFO(PERF_PET_IDLE);

		do {
			(void)lck_mtx_sleep(&kppet_mtx, LCK_SLEEP_DEFAULT, &kppet,
			    THREAD_UNINT);
		} while (kppet.g_actionid == 0);

		BUF_INFO(PERF_PET_RUN);

		uint64_t sampledur_abs = kppet_sample_tasks(kppet.g_idle_rate);

		kptimer_pet_enter(sampledur_abs);
	}
}

#pragma mark - sampling

static void
kppet_sample_thread(int pid, task_t task, thread_t thread, uint32_t idle_rate)
{
	kppet_lock_assert_owned();

	uint32_t sample_flags = SAMPLE_FLAG_IDLE_THREADS |
	    SAMPLE_FLAG_THREAD_ONLY;

	BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_START);

	struct kperf_context ctx = {
		.cur_thread = thread,
		.cur_task = task,
		.cur_pid = pid,
	};

	boolean_t thread_dirty = kperf_thread_get_dirty(thread);

	/*
	 * Clean a dirty thread, and skip the callstack sample if the thread
	 * was not dirty and has skipped fewer than `idle_rate` samples.
	 */
	if (thread_dirty) {
		kperf_thread_set_dirty(thread, FALSE);
	} else if ((thread->kperf_pet_cnt % idle_rate) != 0) {
		sample_flags |= SAMPLE_FLAG_EMPTY_CALLSTACK;
	}
	thread->kperf_pet_cnt++;

	kperf_sample(kppet.g_sample, &ctx, kppet.g_actionid, sample_flags);
	kperf_sample_user(&kppet.g_sample->usample, &ctx, kppet.g_actionid,
	    sample_flags);

	BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_END);
}

static kern_return_t
kppet_threads_prepare(task_t task)
{
	kppet_lock_assert_owned();

	vm_size_t threads_size_needed;

	for (;;) {
		task_lock(task);

		/*
		 * With the task locked, figure out if enough space has been allocated to
		 * contain all of the thread references.
		 */
		threads_size_needed = task->thread_count * sizeof(thread_t);
		if (threads_size_needed <= kppet.g_threads_size) {
			break;
		}

		/*
		 * Otherwise, allocate more and try again.
		 */
		task_unlock(task);

		if (kppet.g_threads_size != 0) {
			kfree(kppet.g_threads, kppet.g_threads_size);
		}

		assert(threads_size_needed > 0);
		kppet.g_threads_size = threads_size_needed;

		kppet.g_threads = kalloc_tag(kppet.g_threads_size, VM_KERN_MEMORY_DIAG);
		if (kppet.g_threads == NULL) {
			kppet.g_threads_size = 0;
			return KERN_RESOURCE_SHORTAGE;
		}
	}

	thread_t thread = THREAD_NULL;

	kppet.g_nthreads = 0;
	queue_iterate(&(task->threads), thread, thread_t, task_threads) {
		thread_reference_internal(thread);
		kppet.g_threads[kppet.g_nthreads++] = thread;
	}

	task_unlock(task);

	return (kppet.g_nthreads > 0) ? KERN_SUCCESS : KERN_FAILURE;
}

/*
 * Sample a `task`, using `idle_rate` to control whether idle threads need to be
 * sampled.
 *
 * The task must be referenced.
 */
static void
kppet_sample_task(task_t task, uint32_t idle_rate)
{
	kppet_lock_assert_owned();
	assert(task != kernel_task);
	if (task == kernel_task) {
		return;
	}

	BUF_VERB(PERF_PET_SAMPLE_TASK | DBG_FUNC_START);

	int pid = task_pid(task);
	if (kperf_action_has_task(kppet.g_actionid)) {
		struct kperf_context ctx = {
			.cur_task = task,
			.cur_pid = pid,
		};

		kperf_sample(kppet.g_sample, &ctx, kppet.g_actionid,
		    SAMPLE_FLAG_TASK_ONLY);
	}

	if (!kperf_action_has_thread(kppet.g_actionid)) {
		BUF_VERB(PERF_PET_SAMPLE_TASK | DBG_FUNC_END);
		return;
	}

	/*
	 * Suspend the task to see an atomic snapshot of all its threads. This
	 * is expensive and disruptive.
	 */
	kern_return_t kr = task_suspend_internal(task);
	if (kr != KERN_SUCCESS) {
		BUF_VERB(PERF_PET_SAMPLE_TASK | DBG_FUNC_END, 1);
		return;
	}

	kr = kppet_threads_prepare(task);
	if (kr != KERN_SUCCESS) {
		BUF_INFO(PERF_PET_ERROR, ERR_THREAD, kr);
		goto out;
	}

	for (unsigned int i = 0; i < kppet.g_nthreads; i++) {
		thread_t thread = kppet.g_threads[i];
		assert(thread != THREAD_NULL);

		kppet_sample_thread(pid, task, thread, idle_rate);

		thread_deallocate(kppet.g_threads[i]);
	}

out:
	task_resume_internal(task);

	BUF_VERB(PERF_PET_SAMPLE_TASK | DBG_FUNC_END, kppet.g_nthreads);
}

/*
 * Store and reference all tasks on the system, so they can be safely inspected
 * outside the `tasks_threads_lock`.
 */
static kern_return_t
kppet_tasks_prepare(void)
{
	kppet_lock_assert_owned();

	vm_size_t size_needed = 0;

	for (;;) {
		lck_mtx_lock(&tasks_threads_lock);

		/*
		 * With the lock held, break out of the lock/unlock loop if
		 * there's enough space to store all the tasks.
		 */
		size_needed = tasks_count * sizeof(task_t);
		if (size_needed <= kppet.g_tasks_size) {
			break;
		}

		/*
		 * Otherwise, allocate more memory outside of the lock.
		 */
		lck_mtx_unlock(&tasks_threads_lock);

		if (size_needed > kppet.g_tasks_size) {
			if (kppet.g_tasks_size != 0) {
				kfree(kppet.g_tasks, kppet.g_tasks_size);
			}

			assert(size_needed > 0);
			kppet.g_tasks_size = size_needed;

			kppet.g_tasks = kalloc_tag(kppet.g_tasks_size, VM_KERN_MEMORY_DIAG);
			if (!kppet.g_tasks) {
				kppet.g_tasks_size = 0;
				return KERN_RESOURCE_SHORTAGE;
			}
		}
	}

	task_t task = TASK_NULL;
	kppet.g_ntasks = 0;

	queue_iterate(&tasks, task, task_t, tasks) {
		bool eligible_task = task != kernel_task;
		if (eligible_task) {
			task_reference_internal(task);
			kppet.g_tasks[kppet.g_ntasks++] = task;
		}
	}

	lck_mtx_unlock(&tasks_threads_lock);

	return KERN_SUCCESS;
}

static uint64_t
kppet_sample_tasks(uint32_t idle_rate)
{
	kppet_lock_assert_owned();
	assert(kppet.g_actionid > 0);

	uint64_t start_abs = mach_absolute_time();

	BUF_INFO(PERF_PET_SAMPLE | DBG_FUNC_START);

	kern_return_t kr = kppet_tasks_prepare();
	if (kr != KERN_SUCCESS) {
		BUF_INFO(PERF_PET_ERROR, ERR_TASK, kr);
		BUF_INFO(PERF_PET_SAMPLE | DBG_FUNC_END);
		return mach_absolute_time() - start_abs;
	}

	for (unsigned int i = 0; i < kppet.g_ntasks; i++) {
		task_t task = kppet.g_tasks[i];
		assert(task != TASK_NULL);

		kppet_sample_task(task, idle_rate);

		task_deallocate(task);
		kppet.g_tasks[i] = TASK_NULL;
	}

	BUF_INFO(PERF_PET_SAMPLE | DBG_FUNC_END, kppet.g_ntasks);

	return mach_absolute_time() - start_abs;
}

#pragma mark - sysctl accessors

int
kppet_get_idle_rate(void)
{
	return kppet.g_idle_rate;
}

int
kppet_set_idle_rate(int new_idle_rate)
{
	kppet.g_idle_rate = new_idle_rate;
	return 0;
}

void
kppet_lightweight_active_update(void)
{
	kppet_lightweight_active = (kperf_is_sampling() && kppet.g_lightweight);
	kperf_on_cpu_update();
}

int
kppet_get_lightweight_pet(void)
{
	return kppet.g_lightweight;
}

int
kppet_set_lightweight_pet(int on)
{
	if (kperf_is_sampling()) {
		return EBUSY;
	}

	kppet.g_lightweight = (on == 1);
	kppet_lightweight_active_update();
	return 0;
}