osfmk/kern/telemetry.c

   1 /*
   2  * Copyright (c) 2012-2013 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 #include <mach/host_priv.h>
  29 #include <mach/host_special_ports.h>
  30 #include <mach/mach_types.h>
  31 #include <mach/telemetry_notification_server.h>
  32
  33 #include <kern/assert.h>
  34 #include <kern/clock.h>
  35 #include <kern/debug.h>
  36 #include <kern/host.h>
  37 #include <kern/kalloc.h>
  38 #include <kern/kern_types.h>
  39 #include <kern/locks.h>
  40 #include <kern/misc_protos.h>
  41 #include <kern/sched.h>
  42 #include <kern/sched_prim.h>
  43 #include <kern/telemetry.h>
  44 #include <kern/timer_call.h>
  45
  46 #include <pexpert/pexpert.h>
  47
  48 #include <vm/vm_kern.h>
  49 #include <vm/vm_shared_region.h>
  50
  51 #include <kperf/kperf.h>
  52 #include <kperf/context.h>
  53 #include <kperf/callstack.h>
  54
  55 #include <sys/kdebug.h>
  56 #include <uuid/uuid.h>
  57 #include <kdp/kdp_dyld.h>
  58
  59 #define TELEMETRY_DEBUG 0
  60
  61 extern int      proc_pid(void *);
  62 extern char     *proc_name_address(void *p);
  63 extern uint64_t proc_uniqueid(void *p);
  64 extern uint64_t proc_was_throttled(void *p);
  65 extern uint64_t proc_did_throttle(void *p);
  66 extern uint64_t get_dispatchqueue_serialno_offset_from_proc(void *p);
  67 extern int      proc_selfpid(void);
  68
  69 struct micro_snapshot_buffer {
  70         vm_offset_t             buffer;
  71         uint32_t                size;
  72         uint32_t                current_position;
  73         uint32_t                end_point;
  74 };
  75
  76 void telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct micro_snapshot_buffer * current_buffer);
  77 int telemetry_buffer_gather(user_addr_t buffer, uint32_t *length, boolean_t mark, struct micro_snapshot_buffer * current_buffer);
  78
  79 #define TELEMETRY_DEFAULT_SAMPLE_RATE (1) /* 1 sample every 1 second */
  80 #define TELEMETRY_DEFAULT_WINDOW_BUFFER_SIZE (512*1024) /* Should hopefully provide 10 seconds worth of samples */
  81 #define TELEMETRY_DEFAULT_BUFFER_SIZE (16*1024)
  82 #define TELEMETRY_MAX_BUFFER_SIZE (64*1024)
  83
  84 #define TELEMETRY_DEFAULT_NOTIFY_LEEWAY (4*1024) // Userland gets 4k of leeway to collect data after notification
  85 #define TELEMETRY_MAX_UUID_COUNT (128) // Max of 128 non-shared-cache UUIDs to log for symbolication
  86
  87 uint32_t                        telemetry_sample_rate = 0;
  88 volatile boolean_t      telemetry_needs_record = FALSE;
  89 volatile boolean_t      telemetry_windowed_record = FALSE;
  90 volatile boolean_t      telemetry_needs_timer_arming_record = FALSE;
  91
  92 /*
  93  * Tells the scheduler that we want it to invoke
  94  * compute_telemetry_windowed(); it is still our responsibility
  95  * to ensure that we do not panic if someone disables the window
  96  * buffer immediately after the scheduler does so.
  97  */
  98 volatile boolean_t      telemetry_window_enabled = FALSE;
  99
 100 /*
 101  * If TRUE, record micro-stackshot samples for all tasks.
 102  * If FALSE, only sample tasks which are marked for telemetry.
 103  */
 104 boolean_t                       telemetry_sample_all_tasks = FALSE;
 105 uint32_t                        telemetry_active_tasks = 0; // Number of tasks opted into telemetry
 106
 107 uint32_t                        telemetry_timestamp = 0;
 108
 109 /*
 110  * We have two buffers.  The telemetry_buffer is responsible
 111  * for timer samples and interrupt samples that are driven by
 112  * compute_averages().  It will notify its client (if one
 113  * exists) when it has enough data to be worth flushing.
 114  *
 115  * The window_buffer contains only interrupt_samples that are
 116  * driven by the scheduler.  Its intent is to provide a
 117  * window of recent activity on the cpu(s).
 118  */
 119 struct micro_snapshot_buffer telemetry_buffer = {0, 0, 0, 0};
 120 struct micro_snapshot_buffer window_buffer = {0, 0, 0, 0};
 121
 122 int                                     telemetry_bytes_since_last_mark = -1; // How much data since buf was last marked?
 123 int                                     telemetry_buffer_notify_at = 0;
 124
 125 lck_grp_t               telemetry_lck_grp;
 126 lck_mtx_t               telemetry_mtx;
 127
 128 #define TELEMETRY_LOCK() do { lck_mtx_lock(&telemetry_mtx); } while(0)
 129 #define TELEMETRY_TRY_SPIN_LOCK() lck_mtx_try_lock_spin(&telemetry_mtx)
 130 #define TELEMETRY_UNLOCK() do { lck_mtx_unlock(&telemetry_mtx); } while(0)
 131
 132 void telemetry_init(void)
 133 {
 134         kern_return_t ret;
 135         uint32_t          telemetry_notification_leeway;
 136
 137         lck_grp_init(&telemetry_lck_grp, "telemetry group", LCK_GRP_ATTR_NULL);
 138         lck_mtx_init(&telemetry_mtx, &telemetry_lck_grp, LCK_ATTR_NULL);
 139
 140         if (!PE_parse_boot_argn("telemetry_buffer_size", &telemetry_buffer.size, sizeof(telemetry_buffer.size))) {
 141                 telemetry_buffer.size = TELEMETRY_DEFAULT_BUFFER_SIZE;
 142         }
 143
 144         if (telemetry_buffer.size > TELEMETRY_MAX_BUFFER_SIZE)
 145                 telemetry_buffer.size = TELEMETRY_MAX_BUFFER_SIZE;
 146
 147         ret = kmem_alloc(kernel_map, &telemetry_buffer.buffer, telemetry_buffer.size);
 148         if (ret != KERN_SUCCESS) {
 149                 kprintf("Telemetry: Allocation failed: %d\n", ret);
 150                 return;
 151         }
 152         bzero((void *) telemetry_buffer.buffer, telemetry_buffer.size);
 153
 154         if (!PE_parse_boot_argn("telemetry_notification_leeway", &telemetry_notification_leeway, sizeof(telemetry_notification_leeway))) {
 155                 /*
 156                  * By default, notify the user to collect the buffer when there is this much space left in the buffer.
 157                  */
 158                 telemetry_notification_leeway = TELEMETRY_DEFAULT_NOTIFY_LEEWAY;
 159         }
 160         if (telemetry_notification_leeway >= telemetry_buffer.size) {
 161                 printf("telemetry: nonsensical telemetry_notification_leeway boot-arg %d changed to %d\n",
 162                        telemetry_notification_leeway, TELEMETRY_DEFAULT_NOTIFY_LEEWAY);
 163                 telemetry_notification_leeway = TELEMETRY_DEFAULT_NOTIFY_LEEWAY;
 164         }
 165         telemetry_buffer_notify_at = telemetry_buffer.size - telemetry_notification_leeway;
 166
 167         if (!PE_parse_boot_argn("telemetry_sample_rate", &telemetry_sample_rate, sizeof(telemetry_sample_rate))) {
 168                 telemetry_sample_rate = TELEMETRY_DEFAULT_SAMPLE_RATE;
 169         }
 170
 171         /*
 172          * To enable telemetry for all tasks, include "telemetry_sample_all_tasks=1" in boot-args.
 173          */
 174         if (!PE_parse_boot_argn("telemetry_sample_all_tasks", &telemetry_sample_all_tasks, sizeof(telemetry_sample_all_tasks))) {
 175
 176                 telemetry_sample_all_tasks = TRUE;
 177
 178         }
 179
 180         kprintf("Telemetry: Sampling %stasks once per %u second%s\n",
 181                 (telemetry_sample_all_tasks) ? "all " : "",
 182                 telemetry_sample_rate, telemetry_sample_rate == 1 ? "" : "s");
 183 }
 184
 185 /*
 186  * Enable or disable global microstackshots (ie telemetry_sample_all_tasks).
 187  *
 188  * enable_disable == 1: turn it on
 189  * enable_disable == 0: turn it off
 190  */
 191 void
 192 telemetry_global_ctl(int enable_disable)
 193 {
 194         if (enable_disable == 1) {
 195                 telemetry_sample_all_tasks = TRUE;
 196         } else {
 197                 telemetry_sample_all_tasks = FALSE;
 198         }
 199 }
 200
 201 /*
 202  * Opt the given task into or out of the telemetry stream.
 203  *
 204  * Supported reasons (callers may use any or all of):
 205  *     TF_CPUMON_WARNING
 206  *     TF_WAKEMON_WARNING
 207  *
 208  * enable_disable == 1: turn it on
 209  * enable_disable == 0: turn it off
 210  */
 211 void
 212 telemetry_task_ctl(task_t task, uint32_t reasons, int enable_disable)
 213 {
 214         task_lock(task);
 215         telemetry_task_ctl_locked(task, reasons, enable_disable);
 216         task_unlock(task);
 217 }
 218
 219 void
 220 telemetry_task_ctl_locked(task_t task, uint32_t reasons, int enable_disable)
 221 {
 222         uint32_t origflags;
 223
 224         assert((reasons != 0) && ((reasons | TF_TELEMETRY) == TF_TELEMETRY));
 225
 226         task_lock_assert_owned(task);
 227
 228         origflags = task->t_flags;
 229
 230         if (enable_disable == 1) {
 231                 task->t_flags |= reasons;
 232                 if ((origflags & TF_TELEMETRY) == 0) {
 233                         OSIncrementAtomic(&telemetry_active_tasks);
 234 #if TELEMETRY_DEBUG
 235                         printf("%s: telemetry OFF -> ON (%d active)\n", proc_name_address(task->bsd_info), telemetry_active_tasks);
 236 #endif
 237                 }
 238         } else {
 239                 task->t_flags &= ~reasons;
 240                 if (((origflags & TF_TELEMETRY) != 0) && ((task->t_flags & TF_TELEMETRY) == 0)) {
 241                         /*
 242                          * If this task went from having at least one telemetry bit to having none,
 243                          * the net change was to disable telemetry for the task.
 244                          */
 245                         OSDecrementAtomic(&telemetry_active_tasks);
 246 #if TELEMETRY_DEBUG
 247                         printf("%s: telemetry ON -> OFF (%d active)\n", proc_name_address(task->bsd_info), telemetry_active_tasks);
 248 #endif
 249                 }
 250         }
 251 }
 252
 253 /*
 254  * Enable the window_buffer, and do any associated setup.
 255  */
 256 kern_return_t
 257 telemetry_enable_window(void)
 258 {
 259         kern_return_t ret = KERN_SUCCESS;
 260         vm_offset_t kern_buffer = 0;
 261         vm_size_t kern_buffer_size = TELEMETRY_DEFAULT_WINDOW_BUFFER_SIZE;
 262
 263         /*
 264          * We have no guarantee we won't allocate the buffer, take
 265          * the lock, and then discover someone beat us to the punch,
 266          * but we would prefer to avoid blocking while holding the
 267          * lock.
 268          */
 269         ret = kmem_alloc(kernel_map, &kern_buffer, kern_buffer_size);
 270
 271         TELEMETRY_LOCK();
 272
 273         if (!window_buffer.buffer) {
 274                 if (ret == KERN_SUCCESS) {
 275                         /* No existing buffer was found, so... */
 276                         window_buffer.end_point = 0;
 277                         window_buffer.current_position = 0;
 278
 279                         /* Hand off the buffer, and... */
 280                         window_buffer.size = (uint32_t) kern_buffer_size;
 281                         window_buffer.buffer = kern_buffer;
 282                         kern_buffer = 0;
 283                         kern_buffer_size = 0;
 284                         bzero((void *) window_buffer.buffer, window_buffer.size);
 285
 286                         /* Let the scheduler know it should drive windowed samples */
 287                         telemetry_window_enabled = TRUE;
 288                 }
 289         } else {
 290                 /* We already have a buffer, so we have "succeeded" */
 291                 ret = KERN_SUCCESS;
 292         }
 293
 294         TELEMETRY_UNLOCK();
 295
 296         if (kern_buffer)
 297                 kmem_free(kernel_map, kern_buffer, kern_buffer_size);
 298
 299         return ret;
 300 }
 301
 302 /*
 303  * Disable the window_buffer, and do any associated teardown.
 304  */
 305 void
 306 telemetry_disable_window(void)
 307 {
 308         vm_offset_t kern_buffer = 0;
 309         vm_size_t kern_buffer_size = 0;
 310
 311         TELEMETRY_LOCK();
 312
 313         if (window_buffer.buffer) {
 314                 /* We have a window buffer, so tear it down */
 315                 telemetry_window_enabled = FALSE;
 316                 kern_buffer = window_buffer.buffer;
 317                 kern_buffer_size = window_buffer.size;
 318                 window_buffer.buffer = 0;
 319                 window_buffer.size = 0;
 320                 window_buffer.current_position = 0;
 321                 window_buffer.end_point = 0;
 322         }
 323
 324         TELEMETRY_UNLOCK();
 325
 326         if (kern_buffer)
 327                 kmem_free(kernel_map, kern_buffer, kern_buffer_size);
 328 }
 329
 330 /*
 331  * Determine if the current thread is eligible for telemetry:
 332  *
 333  * telemetry_sample_all_tasks: All threads are eligible. This takes precedence.
 334  * telemetry_active_tasks: Count of tasks opted in.
 335  * task->t_flags & TF_TELEMETRY: This task is opted in.
 336  */
 337 static boolean_t
 338 telemetry_is_active(thread_t thread)
 339 {
 340         if (telemetry_sample_all_tasks == TRUE) {
 341                 return (TRUE);
 342         }
 343
 344         if ((telemetry_active_tasks > 0) && ((thread->task->t_flags & TF_TELEMETRY) != 0)) {
 345                 return (TRUE);
 346         }
 347
 348         return (FALSE);
 349 }
 350
 351 /*
 352  * Userland is arming a timer. If we are eligible for such a record,
 353  * sample now. No need to do this one at the AST because we're already at
 354  * a safe place in this system call.
 355  */
 356 int telemetry_timer_event(__unused uint64_t deadline, __unused uint64_t interval, __unused uint64_t leeway)
 357 {
 358         if (telemetry_needs_timer_arming_record == TRUE) {
 359                 telemetry_needs_timer_arming_record = FALSE;
 360                 telemetry_take_sample(current_thread(), kTimerArmingRecord | kUserMode, &telemetry_buffer);
 361         }
 362
 363         return (0);
 364 }
 365
 366 /*
 367  * Mark the current thread for an interrupt-based
 368  * telemetry record, to be sampled at the next AST boundary.
 369  */
 370 void telemetry_mark_curthread(boolean_t interrupted_userspace)
 371 {
 372         uint32_t ast_bits = 0;
 373         thread_t thread = current_thread();
 374
 375         /*
 376          * If telemetry isn't active for this thread, return and try
 377          * again next time.
 378          */
 379         if (telemetry_is_active(thread) == FALSE) {
 380                 return;
 381         }
 382
 383         ast_bits |= (interrupted_userspace ? AST_TELEMETRY_USER : AST_TELEMETRY_KERNEL);
 384
 385         if (telemetry_windowed_record) {
 386                 ast_bits |= AST_TELEMETRY_WINDOWED;
 387         }
 388
 389         telemetry_windowed_record = FALSE;
 390         telemetry_needs_record = FALSE;
 391         thread_ast_set(thread, ast_bits);
 392         ast_propagate(thread->ast);
 393 }
 394
 395 void compute_telemetry(void *arg __unused)
 396 {
 397         if (telemetry_sample_all_tasks || (telemetry_active_tasks > 0)) {
 398                 if ((++telemetry_timestamp) % telemetry_sample_rate == 0) {
 399                         telemetry_needs_record = TRUE;
 400                         telemetry_needs_timer_arming_record = TRUE;
 401                 }
 402         }
 403 }
 404
 405 void compute_telemetry_windowed(void)
 406 {
 407         if (telemetry_sample_all_tasks || (telemetry_active_tasks > 0)) {
 408                 /*
 409                  * Due to the relationship between the two fields here,
 410                  * a request for a windowed record will "squash" a
 411                  * request for a regular interrupt record.  We hedge
 412                  * against this by doing a quick check for an existing
 413                  * request.  compute_telemetry doesn't hedge because
 414                  * a regular request cannot squash a windowed request
 415                  * (due to the implementation).
 416                  *
 417                  * If we really want to do this properly, we could make
 418                  * telemetry_needs_record a bitfield, and process one
 419                  * request per telemetry_mark_curthread... but that
 420                  * would be more expensive (atomics).  This should be
 421                  * robust enough for now (although it biases in favor
 422                  * of the regular records).
 423                  */
 424                 if (!telemetry_needs_record) {
 425                         telemetry_needs_record = TRUE;
 426                         telemetry_windowed_record = TRUE;
 427                 }
 428         }
 429 }
 430
 431 /*
 432  * If userland has registered a port for telemetry notifications, send one now.
 433  */
 434 static void
 435 telemetry_notify_user(void)
 436 {
 437         mach_port_t user_port;
 438         uint32_t        flags = 0;
 439         int                     error;
 440
 441         error = host_get_telemetry_port(host_priv_self(), &user_port);
 442         if ((error != KERN_SUCCESS) || !IPC_PORT_VALID(user_port)) {
 443                 return;
 444         }
 445
 446         telemetry_notification(user_port, flags);
 447 }
 448
 449 void telemetry_ast(thread_t thread, boolean_t interrupted_userspace, boolean_t is_windowed)
 450 {
 451         uint8_t microsnapshot_flags = kInterruptRecord;
 452
 453         if (interrupted_userspace)
 454                 microsnapshot_flags |= kUserMode;
 455
 456         if (is_windowed) {
 457                 telemetry_take_sample(thread, microsnapshot_flags, &window_buffer);
 458         } else {
 459                 telemetry_take_sample(thread, microsnapshot_flags, &telemetry_buffer);
 460         }
 461 }
 462
 463 void telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct micro_snapshot_buffer * current_buffer)
 464 {
 465         task_t task;
 466         void *p;
 467         struct kperf_context ctx;
 468         struct callstack cs;
 469         uint32_t btcount, bti;
 470         struct micro_snapshot *msnap;
 471         struct task_snapshot *tsnap;
 472         struct thread_snapshot *thsnap;
 473         clock_sec_t secs;
 474         clock_usec_t usecs;
 475         vm_size_t framesize;
 476         uint32_t current_record_start;
 477         uint32_t tmp = 0;
 478         boolean_t notify = FALSE;
 479
 480         if (thread == THREAD_NULL)
 481                 return;
 482
 483         task = thread->task;
 484         if ((task == TASK_NULL) || (task == kernel_task))
 485                 return;
 486
 487         /*
 488          * To avoid overloading the system with telemetry requests, make
 489          * sure we don't add more requests while existing ones are
 490          * in-flight.  Attempt this by checking if we can grab the lock.
 491          *
 492          * This concerns me a little; this working as intended is
 493          * contingent on the workload being done in the context of the
 494          * telemetry lock being the expensive part of telemetry.  This
 495          * includes populating the buffer and the client gathering it,
 496          * but excludes the copyin overhead.
 497          */
 498         if (!TELEMETRY_TRY_SPIN_LOCK())
 499                 return;
 500
 501         TELEMETRY_UNLOCK();
 502
 503         /* telemetry_XXX accessed outside of lock for instrumentation only */
 504         /* TODO */
 505         KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_STACKSHOT, MICROSTACKSHOT_RECORD) | DBG_FUNC_START, microsnapshot_flags, telemetry_bytes_since_last_mark, 0, 0, (&telemetry_buffer != current_buffer));
 506
 507         p = get_bsdtask_info(task);
 508
 509         ctx.cur_thread = thread;
 510         ctx.cur_pid = proc_pid(p);
 511
 512         /*
 513          * Gather up the data we'll need for this sample. The sample is written into the kernel
 514          * buffer with the global telemetry lock held -- so we must do our (possibly faulting)
 515          * copies from userland here, before taking the lock.
 516          */
 517         kperf_ucallstack_sample(&cs, &ctx);
 518         if (!(cs.flags & CALLSTACK_VALID))
 519                 return;
 520
 521         /*
 522          * Find the actual [slid] address of the shared cache's UUID, and copy it in from userland.
 523          */
 524         int                                                     shared_cache_uuid_valid = 0;
 525         uint64_t                                        shared_cache_base_address;
 526         struct _dyld_cache_header       shared_cache_header;
 527         uint64_t                                        shared_cache_slide;
 528
 529         /*
 530          * Don't copy in the entire shared cache header; we only need the UUID. Calculate the
 531          * offset of that one field.
 532          */
 533         int sc_header_uuid_offset = (char *)&shared_cache_header.uuid - (char *)&shared_cache_header;
 534         vm_shared_region_t sr = vm_shared_region_get(task);
 535         if (sr != NULL) {
 536                 if ((vm_shared_region_start_address(sr, &shared_cache_base_address) == KERN_SUCCESS) &&
 537                         (copyin(shared_cache_base_address + sc_header_uuid_offset, (char *)&shared_cache_header.uuid,
 538                     sizeof (shared_cache_header.uuid)) == 0)) {
 539                         shared_cache_uuid_valid = 1;
 540                         shared_cache_slide = vm_shared_region_get_slide(sr);
 541                 }
 542                 // vm_shared_region_get() gave us a reference on the shared region.
 543                 vm_shared_region_deallocate(sr);
 544         }
 545
 546         /*
 547          * Retrieve the array of UUID's for binaries used by this task.
 548          * We reach down into DYLD's data structures to find the array.
 549          *
 550          * XXX - make this common with kdp?
 551          */
 552         uint32_t                        uuid_info_count = 0;
 553         mach_vm_address_t       uuid_info_addr = 0;
 554         if (task_has_64BitAddr(task)) {
 555                 struct user64_dyld_all_image_infos task_image_infos;
 556                 if (copyin(task->all_image_info_addr, (char *)&task_image_infos, sizeof(task_image_infos)) == 0) {
 557                         uuid_info_count = (uint32_t)task_image_infos.uuidArrayCount;
 558                         uuid_info_addr = task_image_infos.uuidArray;
 559                 }
 560         } else {
 561                 struct user32_dyld_all_image_infos task_image_infos;
 562                 if (copyin(task->all_image_info_addr, (char *)&task_image_infos, sizeof(task_image_infos)) == 0) {
 563                         uuid_info_count = task_image_infos.uuidArrayCount;
 564                         uuid_info_addr = task_image_infos.uuidArray;
 565                 }
 566         }
 567
 568         /*
 569          * If we get a NULL uuid_info_addr (which can happen when we catch dyld in the middle of updating
 570          * this data structure), we zero the uuid_info_count so that we won't even try to save load info
 571          * for this task.
 572          */
 573         if (!uuid_info_addr) {
 574                 uuid_info_count = 0;
 575         }
 576
 577         /*
 578          * Don't copy in an unbounded amount of memory. The main binary and interesting
 579          * non-shared-cache libraries should be in the first few images.
 580          */
 581         if (uuid_info_count > TELEMETRY_MAX_UUID_COUNT) {
 582                 uuid_info_count = TELEMETRY_MAX_UUID_COUNT;
 583         }
 584
 585         uint32_t uuid_info_size = (uint32_t)(task_has_64BitAddr(thread->task) ? sizeof(struct user64_dyld_uuid_info) : sizeof(struct user32_dyld_uuid_info));
 586         uint32_t uuid_info_array_size = uuid_info_count * uuid_info_size;
 587         char     *uuid_info_array = NULL;
 588
 589         if (uuid_info_count > 0) {
 590                 if ((uuid_info_array = (char *)kalloc(uuid_info_array_size)) == NULL) {
 591                         return;
 592                 }
 593
 594                 /*
 595                  * Copy in the UUID info array.
 596                  * It may be nonresident, in which case just fix up nloadinfos to 0 in the task snapshot.
 597                  */
 598                 if (copyin(uuid_info_addr, uuid_info_array, uuid_info_array_size) != 0) {
 599                         kfree(uuid_info_array, uuid_info_array_size);
 600                         uuid_info_array = NULL;
 601                         uuid_info_array_size = 0;
 602                 }
 603         }
 604
 605         /*
 606          * Look for a dispatch queue serial number, and copy it in from userland if present.
 607          */
 608         uint64_t dqserialnum = 0;
 609         int              dqserialnum_valid = 0;
 610
 611         uint64_t dqkeyaddr = thread_dispatchqaddr(thread);
 612         if (dqkeyaddr != 0) {
 613                 uint64_t dqaddr = 0;
 614                 uint64_t dq_serialno_offset = get_dispatchqueue_serialno_offset_from_proc(task->bsd_info);
 615                 if ((copyin(dqkeyaddr, (char *)&dqaddr, (task_has_64BitAddr(task) ? 8 : 4)) == 0) &&
 616                     (dqaddr != 0) && (dq_serialno_offset != 0)) {
 617                         uint64_t dqserialnumaddr = dqaddr + dq_serialno_offset;
 618                         if (copyin(dqserialnumaddr, (char *)&dqserialnum, (task_has_64BitAddr(task) ? 8 : 4)) == 0) {
 619                                 dqserialnum_valid = 1;
 620                         }
 621                 }
 622         }
 623
 624         clock_get_calendar_microtime(&secs, &usecs);
 625
 626         TELEMETRY_LOCK();
 627
 628         /*
 629          * For the benefit of the window buffer; if our buffer is not backed by anything,
 630          * then we cannot take the sample.  Meant to allow us to deallocate the window
 631          * buffer if it is disabled.
 632          */
 633         if (!current_buffer->buffer)
 634                 goto cancel_sample;
 635
 636         /*
 637          * We do the bulk of the operation under the telemetry lock, on assumption that
 638          * any page faults during execution will not cause another AST_TELEMETRY_ALL
 639          * to deadlock; they will just block until we finish. This makes it easier
 640          * to copy into the buffer directly. As soon as we unlock, userspace can copy
 641          * out of our buffer.
 642          */
 643
 644 copytobuffer:
 645
 646         current_record_start = current_buffer->current_position;
 647
 648         if ((current_buffer->size - current_buffer->current_position) < sizeof(struct micro_snapshot)) {
 649                 /*
 650                  * We can't fit a record in the space available, so wrap around to the beginning.
 651                  * Save the current position as the known end point of valid data.
 652                  */
 653                 current_buffer->end_point = current_record_start;
 654                 current_buffer->current_position = 0;
 655                 if (current_record_start == 0) {
 656                         /* This sample is too large to fit in the buffer even when we started at 0, so skip it */
 657                         goto cancel_sample;
 658                 }
 659                 goto copytobuffer;
 660         }
 661
 662         msnap = (struct micro_snapshot *)(uintptr_t)(current_buffer->buffer + current_buffer->current_position);
 663         msnap->snapshot_magic = STACKSHOT_MICRO_SNAPSHOT_MAGIC;
 664         msnap->ms_flags = microsnapshot_flags;
 665         msnap->ms_opaque_flags = 0; /* namespace managed by userspace */
 666         msnap->ms_cpu = 0; /* XXX - does this field make sense for a micro-stackshot? */
 667         msnap->ms_time = secs;
 668         msnap->ms_time_microsecs = usecs;
 669
 670         current_buffer->current_position += sizeof(struct micro_snapshot);
 671
 672         if ((current_buffer->size - current_buffer->current_position) < sizeof(struct task_snapshot)) {
 673                 current_buffer->end_point = current_record_start;
 674                 current_buffer->current_position = 0;
 675                 if (current_record_start == 0) {
 676                         /* This sample is too large to fit in the buffer even when we started at 0, so skip it */
 677                         goto cancel_sample;
 678                 }
 679                 goto copytobuffer;
 680         }
 681
 682         tsnap = (struct task_snapshot *)(uintptr_t)(current_buffer->buffer + current_buffer->current_position);
 683         bzero(tsnap, sizeof(*tsnap));
 684         tsnap->snapshot_magic = STACKSHOT_TASK_SNAPSHOT_MAGIC;
 685         tsnap->pid = proc_pid(p);
 686         tsnap->uniqueid = proc_uniqueid(p);
 687         tsnap->user_time_in_terminated_threads = task->total_user_time;
 688         tsnap->system_time_in_terminated_threads = task->total_system_time;
 689         tsnap->suspend_count = task->suspend_count;
 690         tsnap->task_size = pmap_resident_count(task->map->pmap);
 691         tsnap->faults = task->faults;
 692         tsnap->pageins = task->pageins;
 693         tsnap->cow_faults = task->cow_faults;
 694         /*
 695          * The throttling counters are maintained as 64-bit counters in the proc
 696          * structure. However, we reserve 32-bits (each) for them in the task_snapshot
 697          * struct to save space and since we do not expect them to overflow 32-bits. If we
 698          * find these values overflowing in the future, the fix would be to simply
 699          * upgrade these counters to 64-bit in the task_snapshot struct
 700          */
 701         tsnap->was_throttled = (uint32_t) proc_was_throttled(p);
 702         tsnap->did_throttle = (uint32_t) proc_did_throttle(p);
 703
 704         if (task->t_flags & TF_TELEMETRY) {
 705                 tsnap->ss_flags |= kTaskRsrcFlagged;
 706         }
 707
 708         if (task->effective_policy.darwinbg == 1) {
 709                 tsnap->ss_flags |= kTaskDarwinBG;
 710         }
 711
 712         proc_get_darwinbgstate(task, &tmp);
 713
 714         if (task->requested_policy.t_role == TASK_FOREGROUND_APPLICATION) {
 715                 tsnap->ss_flags |= kTaskIsForeground;
 716         }
 717
 718         if (tmp & PROC_FLAG_ADAPTIVE_IMPORTANT) {
 719                 tsnap->ss_flags |= kTaskIsBoosted;
 720         }
 721
 722         if (tmp & PROC_FLAG_SUPPRESSED) {
 723                 tsnap->ss_flags |= kTaskIsSuppressed;
 724         }
 725
 726         tsnap->latency_qos = task_grab_latency_qos(task);
 727
 728         strlcpy(tsnap->p_comm, proc_name_address(p), sizeof(tsnap->p_comm));
 729         if (task_has_64BitAddr(thread->task)) {
 730                 tsnap->ss_flags |= kUser64_p;
 731         }
 732
 733         if (shared_cache_uuid_valid) {
 734                 tsnap->shared_cache_slide = shared_cache_slide;
 735                 bcopy(shared_cache_header.uuid, tsnap->shared_cache_identifier, sizeof (shared_cache_header.uuid));
 736         }
 737
 738         current_buffer->current_position += sizeof(struct task_snapshot);
 739
 740         /*
 741          * Directly after the task snapshot, place the array of UUID's corresponding to the binaries
 742          * used by this task.
 743          */
 744         if ((current_buffer->size - current_buffer->current_position) < uuid_info_array_size) {
 745                 current_buffer->end_point = current_record_start;
 746                 current_buffer->current_position = 0;
 747                 if (current_record_start == 0) {
 748                         /* This sample is too large to fit in the buffer even when we started at 0, so skip it */
 749                         goto cancel_sample;
 750                 }
 751                 goto copytobuffer;
 752         }
 753
 754         /*
 755          * Copy the UUID info array into our sample.
 756          */
 757         if (uuid_info_array_size > 0) {
 758                 bcopy(uuid_info_array, (char *)(current_buffer->buffer + current_buffer->current_position), uuid_info_array_size);
 759                 tsnap->nloadinfos = uuid_info_count;
 760         }
 761
 762         current_buffer->current_position += uuid_info_array_size;
 763
 764         /*
 765          * After the task snapshot & list of binary UUIDs, we place a thread snapshot.
 766          */
 767
 768         if ((current_buffer->size - current_buffer->current_position) < sizeof(struct thread_snapshot)) {
 769                 /* wrap and overwrite */
 770                 current_buffer->end_point = current_record_start;
 771                 current_buffer->current_position = 0;
 772                 if (current_record_start == 0) {
 773                         /* This sample is too large to fit in the buffer even when we started at 0, so skip it */
 774                         goto cancel_sample;
 775                 }
 776                 goto copytobuffer;
 777         }
 778
 779         thsnap = (struct thread_snapshot *)(uintptr_t)(current_buffer->buffer + current_buffer->current_position);
 780         bzero(thsnap, sizeof(*thsnap));
 781
 782         thsnap->snapshot_magic = STACKSHOT_THREAD_SNAPSHOT_MAGIC;
 783         thsnap->thread_id = thread_tid(thread);
 784         thsnap->state = thread->state;
 785         thsnap->priority = thread->priority;
 786         thsnap->sched_pri = thread->sched_pri;
 787         thsnap->sched_flags = thread->sched_flags;
 788         thsnap->ss_flags |= kStacksPCOnly;
 789         thsnap->ts_qos = thread->effective_policy.thep_qos;
 790
 791         if (thread->effective_policy.darwinbg) {
 792                 thsnap->ss_flags |= kThreadDarwinBG;
 793         }
 794
 795         thsnap->user_time = timer_grab(&thread->user_timer);
 796
 797         uint64_t tval = timer_grab(&thread->system_timer);
 798
 799         if (thread->precise_user_kernel_time) {
 800                 thsnap->system_time = tval;
 801         } else {
 802                 thsnap->user_time += tval;
 803                 thsnap->system_time = 0;
 804         }
 805
 806         current_buffer->current_position += sizeof(struct thread_snapshot);
 807
 808         /*
 809          * If this thread has a dispatch queue serial number, include it here.
 810          */
 811         if (dqserialnum_valid) {
 812                 if ((current_buffer->size - current_buffer->current_position) < sizeof(dqserialnum)) {
 813                         /* wrap and overwrite */
 814                         current_buffer->end_point = current_record_start;
 815                         current_buffer->current_position = 0;
 816                         if (current_record_start == 0) {
 817                                 /* This sample is too large to fit in the buffer even when we started at 0, so skip it */
 818                                 goto cancel_sample;
 819                         }
 820                         goto copytobuffer;
 821                 }
 822
 823                 thsnap->ss_flags |= kHasDispatchSerial;
 824                 bcopy(&dqserialnum, (char *)current_buffer->buffer + current_buffer->current_position, sizeof (dqserialnum));
 825                 current_buffer->current_position += sizeof (dqserialnum);
 826         }
 827
 828         if (task_has_64BitAddr(task)) {
 829                 framesize = 8;
 830                 thsnap->ss_flags |= kUser64_p;
 831         } else {
 832                 framesize = 4;
 833         }
 834
 835         btcount = cs.nframes;
 836
 837         /*
 838          * If we can't fit this entire stacktrace then cancel this record, wrap to the beginning,
 839          * and start again there so that we always store a full record.
 840          */
 841         if ((current_buffer->size - current_buffer->current_position)/framesize < btcount) {
 842                 current_buffer->end_point = current_record_start;
 843                 current_buffer->current_position = 0;
 844                 if (current_record_start == 0) {
 845                         /* This sample is too large to fit in the buffer even when we started at 0, so skip it */
 846                         goto cancel_sample;
 847                 }
 848                 goto copytobuffer;
 849         }
 850
 851         for (bti=0; bti < btcount; bti++, current_buffer->current_position += framesize) {
 852                 if (framesize == 8) {
 853                         *(uint64_t *)(uintptr_t)(current_buffer->buffer + current_buffer->current_position) = cs.frames[bti];
 854                 } else {
 855                         *(uint32_t *)(uintptr_t)(current_buffer->buffer + current_buffer->current_position) = (uint32_t)cs.frames[bti];
 856                 }
 857         }
 858
 859         if (current_buffer->end_point < current_buffer->current_position) {
 860                 /*
 861                  * Each time the cursor wraps around to the beginning, we leave a
 862                  * differing amount of unused space at the end of the buffer. Make
 863                  * sure the cursor pushes the end point in case we're making use of
 864                  * more of the buffer than we did the last time we wrapped.
 865                  */
 866                 current_buffer->end_point = current_buffer->current_position;
 867         }
 868
 869         thsnap->nuser_frames = btcount;
 870
 871         /*
 872          * Now THIS is a hack.
 873          */
 874         if (current_buffer == &telemetry_buffer) {
 875                 telemetry_bytes_since_last_mark += (current_buffer->current_position - current_record_start);
 876                 if (telemetry_bytes_since_last_mark > telemetry_buffer_notify_at) {
 877                         notify = TRUE;
 878                 }
 879         }
 880
 881 cancel_sample:
 882
 883         TELEMETRY_UNLOCK();
 884
 885         /* TODO */
 886         KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_STACKSHOT, MICROSTACKSHOT_RECORD) | DBG_FUNC_END, notify, telemetry_bytes_since_last_mark, current_buffer->current_position, current_buffer->end_point, (&telemetry_buffer != current_buffer));
 887
 888         if (notify) {
 889                 telemetry_notify_user();
 890         }
 891
 892         if (uuid_info_array != NULL) {
 893                 kfree(uuid_info_array, uuid_info_array_size);
 894         }
 895 }
 896
 897 #if TELEMETRY_DEBUG
 898 static void
 899 log_telemetry_output(vm_offset_t buf, uint32_t pos, uint32_t sz)
 900 {
 901         struct micro_snapshot *p;
 902         uint32_t offset;
 903
 904         printf("Copying out %d bytes of telemetry at offset %d\n", sz, pos);
 905
 906         buf += pos;
 907
 908         /*
 909          * Find and log each timestamp in this chunk of buffer.
 910          */
 911         for (offset = 0; offset < sz; offset++) {
 912                 p = (struct micro_snapshot *)(buf + offset);
 913                 if (p->snapshot_magic == STACKSHOT_MICRO_SNAPSHOT_MAGIC) {
 914                         printf("telemetry timestamp: %lld\n", p->ms_time);
 915                 }
 916         }
 917 }
 918 #endif
 919
 920 int telemetry_gather(user_addr_t buffer, uint32_t *length, boolean_t mark)
 921 {
 922         return telemetry_buffer_gather(buffer, length, mark, &telemetry_buffer);
 923 }
 924
 925 int telemetry_gather_windowed(user_addr_t buffer, uint32_t *length)
 926 {
 927         return telemetry_buffer_gather(buffer, length, 0, &window_buffer);
 928 }
 929
 930 int telemetry_buffer_gather(user_addr_t buffer, uint32_t *length, boolean_t mark, struct micro_snapshot_buffer * current_buffer)
 931 {
 932         int result = 0;
 933         uint32_t oldest_record_offset;
 934
 935         /* TODO */
 936         KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_STACKSHOT, MICROSTACKSHOT_GATHER) | DBG_FUNC_START, mark, telemetry_bytes_since_last_mark, 0, 0, (&telemetry_buffer != current_buffer));
 937
 938         TELEMETRY_LOCK();
 939
 940         if (current_buffer->buffer == 0) {
 941                 *length = 0;
 942                 goto out;
 943         }
 944
 945         if (*length < current_buffer->size) {
 946                 result = KERN_NO_SPACE;
 947                 goto out;
 948         }
 949
 950         /*
 951          * Copy the ring buffer out to userland in order sorted by time: least recent to most recent.
 952          * First, we need to search forward from the cursor to find the oldest record in our buffer.
 953          */
 954         oldest_record_offset = current_buffer->current_position;
 955         do {
 956                 if (((oldest_record_offset + sizeof(uint32_t)) > current_buffer->size) ||
 957                     ((oldest_record_offset + sizeof(uint32_t)) > current_buffer->end_point)) {
 958
 959                         if (*(uint32_t *)(uintptr_t)(current_buffer->buffer) == 0) {
 960                                 /*
 961                                  * There is no magic number at the start of the buffer, which means
 962                                  * it's empty; nothing to see here yet.
 963                                  */
 964                                 *length = 0;
 965                                 goto out;
 966                         }
 967                         /*
 968                          * We've looked through the end of the active buffer without finding a valid
 969                          * record; that means all valid records are in a single chunk, beginning at
 970                          * the very start of the buffer.
 971                          */
 972
 973                         oldest_record_offset = 0;
 974                         assert(*(uint32_t *)(uintptr_t)(current_buffer->buffer) == STACKSHOT_MICRO_SNAPSHOT_MAGIC);
 975                         break;
 976                 }
 977
 978                 if (*(uint32_t *)(uintptr_t)(current_buffer->buffer + oldest_record_offset) == STACKSHOT_MICRO_SNAPSHOT_MAGIC)
 979                         break;
 980
 981                 /*
 982                  * There are no alignment guarantees for micro-stackshot records, so we must search at each
 983                  * byte offset.
 984                  */
 985                 oldest_record_offset++;
 986         } while (oldest_record_offset != current_buffer->current_position);
 987
 988         /*
 989          * If needed, copyout in two chunks: from the oldest record to the end of the buffer, and then
 990          * from the beginning of the buffer up to the current position.
 991          */
 992         if (oldest_record_offset != 0) {
 993 #if TELEMETRY_DEBUG
 994                 log_telemetry_output(current_buffer->buffer, oldest_record_offset,
 995                                      current_buffer->end_point - oldest_record_offset);
 996 #endif
 997                 if ((result = copyout((void *)(current_buffer->buffer + oldest_record_offset), buffer,
 998                     current_buffer->end_point - oldest_record_offset)) != 0) {
 999                         *length = 0;
1000                         goto out;
1001                 }
1002                 *length = current_buffer->end_point - oldest_record_offset;
1003         } else {
1004                 *length = 0;
1005         }
1006
1007 #if TELEMETRY_DEBUG
1008         log_telemetry_output(current_buffer->buffer, 0, current_buffer->current_position);
1009 #endif
1010         if ((result = copyout((void *)current_buffer->buffer, buffer + *length,
1011             current_buffer->current_position)) != 0) {
1012                 *length = 0;
1013                 goto out;
1014         }
1015         *length += (uint32_t)current_buffer->current_position;
1016
1017 out:
1018
1019         if (mark && (*length > 0)) {
1020                 telemetry_bytes_since_last_mark = 0;
1021         }
1022
1023         TELEMETRY_UNLOCK();
1024
1025         KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_STACKSHOT, MICROSTACKSHOT_GATHER) | DBG_FUNC_END, current_buffer->current_position, *length, current_buffer->end_point, 0, (&telemetry_buffer != current_buffer));
1026
1027         return (result);
1028 }
1029
1030 /************************/
1031 /* BOOT PROFILE SUPPORT */
1032 /************************/
1033 /*
1034  * Boot Profiling
1035  *
1036  * The boot-profiling support is a mechanism to sample activity happening on the
1037  * system during boot. This mechanism sets up a periodic timer and on every timer fire,
1038  * captures a full backtrace into the boot profiling buffer. This buffer can be pulled
1039  * out and analyzed from user-space. It is turned on using the following boot-args:
1040  * "bootprofile_buffer_size" specifies the size of the boot profile buffer
1041  * "bootprofile_interval_ms" specifies the interval for the profiling timer
1042  *
1043  * Process Specific Boot Profiling
1044  *
1045  * The boot-arg "bootprofile_proc_name" can be used to specify a certain
 * process that needs to be profiled during boot. Setting this boot-arg changes
1047  * the way stackshots are captured. At every timer fire, the code looks at the
1048  * currently running process and takes a stackshot only if the requested process
1049  * is on-core (which makes it unsuitable for MP systems).
1050  *
1051  * Trigger Events
1052  *
1053  * The boot-arg "bootprofile_type=boot" starts the timer during early boot. Using
1054  * "wake" starts the timer at AP wake from suspend-to-RAM.
1055  */
1056
/* Upper bound that bootprofile_init() applies to the "bootprofile_buffer_size" boot-arg. */
#define BOOTPROFILE_MAX_BUFFER_SIZE (64*1024*1024) /* see also COPYSIZELIMIT_PANIC */

/* Address of the boot profile buffer; stays 0 until bootprofile_init() allocates it. */
vm_offset_t                     bootprofile_buffer = 0;
/* Size of the buffer in bytes, taken from the "bootprofile_buffer_size" boot-arg. */
uint32_t                        bootprofile_buffer_size = 0;
/* Offset at which the next stackshot is written; advanced by the timer callback. */
uint32_t                        bootprofile_buffer_current_position = 0;
/* Sampling interval in milliseconds, taken from the "bootprofile_interval_ms" boot-arg. */
uint32_t                        bootprofile_interval_ms = 0;
/* Sampling interval converted to absolute time; bootprofile_gather() zeroes it to stop re-arming. */
uint64_t                        bootprofile_interval_abs = 0;
/* Deadline most recently programmed into the profiling timer. */
uint64_t                        bootprofile_next_deadline = 0;
/* Set to 1 by bootprofile_init() when no "bootprofile_proc_name" boot-arg was supplied. */
uint32_t                        bootprofile_all_procs = 0;
/* Name of the single process to profile, from the "bootprofile_proc_name" boot-arg. */
char                            bootprofile_proc_name[17];

/* Lock group and mutex used by the BOOTPROFILE_* locking macros below. */
lck_grp_t               bootprofile_lck_grp;
lck_mtx_t               bootprofile_mtx;

/* Trigger selected by the "bootprofile_type" boot-arg ("boot" or "wake"). */
enum {
        kBootProfileDisabled = 0,
        kBootProfileStartTimerAtBoot,
        kBootProfileStartTimerAtWake
} bootprofile_type = kBootProfileDisabled;


/* Timer call entry that drives the periodic sampling. */
static timer_call_data_t        bootprofile_timer_call_entry;

/* Locking helpers; the timer callback uses the non-blocking try variant. */
#define BOOTPROFILE_LOCK() do { lck_mtx_lock(&bootprofile_mtx); } while(0)
#define BOOTPROFILE_TRY_SPIN_LOCK() lck_mtx_try_lock_spin(&bootprofile_mtx)
#define BOOTPROFILE_UNLOCK() do { lck_mtx_unlock(&bootprofile_mtx); } while(0)

/* Timer callback, defined further down in this file. */
static void bootprofile_timer_call(
        timer_call_param_t      param0,
        timer_call_param_t      param1);

/* Defined outside this file; the timer callback uses it to write a stackshot into the buffer. */
extern int
stack_snapshot_from_kernel(int pid, void *buf, uint32_t size, uint32_t flags, unsigned *retbytes);
1090
1091 void bootprofile_init(void)
1092 {
1093         kern_return_t ret;
1094         char type[32];
1095
1096         lck_grp_init(&bootprofile_lck_grp, "bootprofile group", LCK_GRP_ATTR_NULL);
1097         lck_mtx_init(&bootprofile_mtx, &bootprofile_lck_grp, LCK_ATTR_NULL);
1098
1099         if (!PE_parse_boot_argn("bootprofile_buffer_size", &bootprofile_buffer_size, sizeof(bootprofile_buffer_size))) {
1100                 bootprofile_buffer_size = 0;
1101         }
1102
1103         if (bootprofile_buffer_size > BOOTPROFILE_MAX_BUFFER_SIZE)
1104                 bootprofile_buffer_size = BOOTPROFILE_MAX_BUFFER_SIZE;
1105
1106         if (!PE_parse_boot_argn("bootprofile_interval_ms", &bootprofile_interval_ms, sizeof(bootprofile_interval_ms))) {
1107                 bootprofile_interval_ms = 0;
1108         }
1109
1110         if (!PE_parse_boot_argn("bootprofile_proc_name", &bootprofile_proc_name, sizeof(bootprofile_proc_name))) {
1111                 bootprofile_all_procs = 1;
1112                 bootprofile_proc_name[0] = '\0';
1113         }
1114
1115         if (PE_parse_boot_argn("bootprofile_type", type, sizeof(type))) {
1116                 if (0 == strcmp(type, "boot")) {
1117                         bootprofile_type = kBootProfileStartTimerAtBoot;
1118                 } else if (0 == strcmp(type, "wake")) {
1119                         bootprofile_type = kBootProfileStartTimerAtWake;
1120                 } else {
1121                         bootprofile_type = kBootProfileDisabled;
1122                 }
1123         } else {
1124                 bootprofile_type = kBootProfileDisabled;
1125         }
1126
1127         clock_interval_to_absolutetime_interval(bootprofile_interval_ms, NSEC_PER_MSEC, &bootprofile_interval_abs);
1128
1129         /* Both boot args must be set to enable */
1130         if ((bootprofile_type == kBootProfileDisabled) || (bootprofile_buffer_size == 0) || (bootprofile_interval_abs == 0)) {
1131                 return;
1132         }
1133
1134         ret = kmem_alloc(kernel_map, &bootprofile_buffer, bootprofile_buffer_size);
1135         if (ret != KERN_SUCCESS) {
1136                 kprintf("Boot profile: Allocation failed: %d\n", ret);
1137                 return;
1138         }
1139         bzero((void *) bootprofile_buffer, bootprofile_buffer_size);
1140
1141         kprintf("Boot profile: Sampling %s once per %u ms at %s\n", bootprofile_all_procs ? "all procs" : bootprofile_proc_name,  bootprofile_interval_ms,
1142                         bootprofile_type == kBootProfileStartTimerAtBoot ? "boot" : (bootprofile_type == kBootProfileStartTimerAtWake ? "wake" : "unknown"));
1143
1144         timer_call_setup(&bootprofile_timer_call_entry,
1145                                          bootprofile_timer_call,
1146                                          NULL);
1147
1148         if (bootprofile_type == kBootProfileStartTimerAtBoot) {
1149                 bootprofile_next_deadline = mach_absolute_time() + bootprofile_interval_abs;
1150                 timer_call_enter_with_leeway(&bootprofile_timer_call_entry,
1151                                                                          NULL,
1152                                                                          bootprofile_next_deadline,
1153                                                                          0,
1154                                                                          TIMER_CALL_SYS_NORMAL,
1155                                                                          FALSE);
1156         }
1157 }
1158
1159 void
1160 bootprofile_wake_from_sleep(void)
1161 {
1162         if (bootprofile_type == kBootProfileStartTimerAtWake) {
1163                 bootprofile_next_deadline = mach_absolute_time() + bootprofile_interval_abs;
1164                 timer_call_enter_with_leeway(&bootprofile_timer_call_entry,
1165                                                                          NULL,
1166                                                                          bootprofile_next_deadline,
1167                                                                          0,
1168                                                                          TIMER_CALL_SYS_NORMAL,
1169                                                                          FALSE);
1170         }
1171 }
1172
1173
1174 static void bootprofile_timer_call(
1175         timer_call_param_t      param0 __unused,
1176         timer_call_param_t      param1 __unused)
1177 {
1178         unsigned retbytes = 0;
1179         int pid_to_profile = -1;
1180
1181         if (!BOOTPROFILE_TRY_SPIN_LOCK()) {
1182                 goto reprogram;
1183         }
1184
1185         /* Check if process-specific boot profiling is turned on */
1186         if (!bootprofile_all_procs) {
1187                 /*
1188                  * Since boot profiling initializes really early in boot, it is
1189                  * possible that at this point, the task/proc is not initialized.
1190                  * Nothing to do in that case.
1191                  */
1192
1193                 if ((current_task() != NULL) && (current_task()->bsd_info != NULL) &&
1194                     (0 == strncmp(bootprofile_proc_name, proc_name_address(current_task()->bsd_info), 17))) {
1195                         pid_to_profile = proc_selfpid();
1196                 }
1197                 else {
1198                         /*
1199                          * Process-specific boot profiling requested but the on-core process is
1200                          * something else. Nothing to do here.
1201                          */
1202                         BOOTPROFILE_UNLOCK();
1203                         goto reprogram;
1204                 }
1205         }
1206
1207         /* initiate a stackshot with whatever portion of the buffer is left */
1208         if (bootprofile_buffer_current_position < bootprofile_buffer_size) {
1209                 stack_snapshot_from_kernel(
1210                         pid_to_profile,
1211                         (void *)(bootprofile_buffer + bootprofile_buffer_current_position),
1212                         bootprofile_buffer_size - bootprofile_buffer_current_position,
1213                         STACKSHOT_SAVE_LOADINFO | STACKSHOT_SAVE_KEXT_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS,
1214             &retbytes
1215                         );
1216
1217                 bootprofile_buffer_current_position += retbytes;
1218         }
1219
1220         BOOTPROFILE_UNLOCK();
1221
1222         /* If we didn't get any data or have run out of buffer space, stop profiling */
1223         if ((retbytes == 0) || (bootprofile_buffer_current_position == bootprofile_buffer_size)) {
1224                 return;
1225         }
1226
1227
1228 reprogram:
1229         /* If the user gathered the buffer, no need to keep profiling */
1230         if (bootprofile_interval_abs == 0) {
1231                 return;
1232         }
1233
1234         clock_deadline_for_periodic_event(bootprofile_interval_abs,
1235                                                                           mach_absolute_time(),
1236                                                                           &bootprofile_next_deadline);
1237         timer_call_enter_with_leeway(&bootprofile_timer_call_entry,
1238                                                                  NULL,
1239                                                                  bootprofile_next_deadline,
1240                                                                  0,
1241                                                                  TIMER_CALL_SYS_NORMAL,
1242                                                                  FALSE);
1243 }
1244
1245 int bootprofile_gather(user_addr_t buffer, uint32_t *length)
1246 {
1247         int result = 0;
1248
1249         BOOTPROFILE_LOCK();
1250
1251         if (bootprofile_buffer == 0) {
1252                 *length = 0;
1253                 goto out;
1254         }
1255
1256         if (*length < bootprofile_buffer_current_position) {
1257                 result = KERN_NO_SPACE;
1258                 goto out;
1259         }
1260
1261         if ((result = copyout((void *)bootprofile_buffer, buffer,
1262             bootprofile_buffer_current_position)) != 0) {
1263                 *length = 0;
1264                 goto out;
1265         }
1266         *length = bootprofile_buffer_current_position;
1267
1268         /* cancel future timers */
1269         bootprofile_interval_abs = 0;
1270
1271 out:
1272
1273         BOOTPROFILE_UNLOCK();
1274
1275         return (result);
1276 }