diff --git a/KDBG/Machine.mutable-impl.hpp b/KDBG/Machine.mutable-impl.hpp
new file mode 100644
index 0000000..314ecd0
--- /dev/null
+++ b/KDBG/Machine.mutable-impl.hpp
@@ -0,0 +1,1128 @@
//
// Machine.mutable-impl.hpp
// KDBG
//
// Created by James McIlree on 10/30/12.
// Copyright (c) 2014 Apple. All rights reserved.
//

#include "KDebug.h"

template <typename SIZE>
MachineProcess<SIZE>* Machine<SIZE>::create_process(pid_t pid, const char* name, AbsTime create_timestamp, kMachineProcessFlag flags) {
    ASSERT(name, "Sanity");

    // Validate that we are not creating a process that already exists
    DEBUG_ONLY({
        ASSERT(_processes_by_pid.size() == _processes_by_name.size(), "Sanity");
        ASSERT(_processes_by_pid.size() == _processes_by_time.size(), "Sanity");

        auto by_pid_range = _processes_by_pid.equal_range(pid);
        for (auto it = by_pid_range.first; it != by_pid_range.second; ++it) {
            MachineProcess<SIZE>& process = it->second;
            ASSERT(!process.timespan().contains(create_timestamp), "Creating a process that overlaps an existing process");
        }

        auto by_name_range = _processes_by_name.equal_range(name);
        for (auto it = by_name_range.first; it != by_name_range.second; ++it) {
            MachineProcess<SIZE>& process = *it->second;
            ASSERT((process.pid() != pid) || (!process.timespan().contains(create_timestamp)), "Creating a process that overlaps an existing process");
        }

        // The "by time" vector is unsorted during construction, we have to look at everything.
        for (MachineProcess<SIZE>* process : _processes_by_time) {
            if (process->pid() == pid) {
                ASSERT(!process->timespan().contains(create_timestamp), "Creating a process that overlaps an existing process");
            }
        }
    })

    if (MachineProcess<SIZE>* about_to_be_reused_process = youngest_mutable_process(pid)) {
        // If this process is still alive, we're going to be replacing it.
        // The only legal way of doing that is an exec. Validate.
        if (!about_to_be_reused_process->is_trace_terminated()) {
            DEBUG_ONLY({
                ASSERT((uint32_t)flags & ((uint32_t)kMachineProcessFlag::CreatedByForkExecEvent | (uint32_t)kMachineProcessFlag::CreatedByExecEvent),
                       "Replacing existing process without exec or fork-exec");
            })
            //
            // Exit by exec is unique in that we will have two processes/threads
            // back to back in the timeline. We do not want them to overlap, and
            // the new process timeline is half open, and will have this time as
            // its creation. Pass a time 1 mabs older to exit to prevent overlap.
            //
            about_to_be_reused_process->set_exit_by_exec(create_timestamp - AbsTime(1));
        }
        ASSERT(about_to_be_reused_process->is_trace_terminated(), "Sanity");
    }

    MachineProcess<SIZE>* process = &_processes_by_pid.emplace(pid, MachineProcess<SIZE>(pid, name, create_timestamp, flags))->second;
    _processes_by_name.emplace(process->name(), process);
    _processes_by_time.push_back(process);

    return process;
}
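//
// Why create_timestamp - AbsTime(1)? Timespans here behave as half-open
// intervals [location, location + length), so an exiting process must end
// strictly before the time at which its replacement is created. A minimal
// sketch of that containment rule (illustrative only; AbsInterval's real
// implementation lives elsewhere in KDBG):
//
//     struct Interval {
//         uint64_t location, length;
//         bool contains(uint64_t t) const { return t >= location && t < location + length; }
//     };
//
//     Interval old_proc{100, 50};          // [100, 150)
//     Interval new_proc{150, 50};          // [150, 200)
//     // assert(!old_proc.contains(150));  // back to back, no overlap
//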
template <typename SIZE>
MachineThread<SIZE>* Machine<SIZE>::create_thread(MachineProcess<SIZE>* process, typename SIZE::ptr_t tid, MachineVoucher<SIZE>* voucher, AbsTime create_timestamp, kMachineThreadFlag flags) {
    ASSERT(process, "Sanity");

    // Validate that we are not creating a thread that already exists
    DEBUG_ONLY({
        ASSERT(_threads_by_tid.size() == _threads_by_time.size(), "Sanity");

        auto by_tid_range = _threads_by_tid.equal_range(tid);
        for (auto it = by_tid_range.first; it != by_tid_range.second; ++it) {
            MachineThread<SIZE>& thread = it->second;
            ASSERT(!thread.timespan().contains(create_timestamp), "Creating a thread that overlaps an existing thread");
        }

        // The "by time" vector is unsorted during construction, we have to look at everything.
        for (MachineThread<SIZE>* thread : _threads_by_time) {
            if (thread->tid() == tid) {
                ASSERT(!thread->timespan().contains(create_timestamp), "Creating a thread that overlaps an existing thread");
            }
        }
    })

    // Currently the only way we intentionally re-use live threads is via exec/fork-exec.
    // The exec/fork-exec code calls create_process first, which should mark all existing
    // threads as trace-terminated. So we should NEVER see a live thread at this point.
    // Validate.
    DEBUG_ONLY({
        if (MachineThread<SIZE>* about_to_be_reused_thread = youngest_mutable_thread(tid)) {
            ASSERT(about_to_be_reused_thread->is_trace_terminated(), "Expected this thread to be terminated");
        }
    });

    MachineThread<SIZE>* thread = &_threads_by_tid.emplace(tid, MachineThread<SIZE>(process, tid, voucher, create_timestamp, flags))->second;
    _threads_by_time.push_back(thread);

    return thread;
}
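//
// Note that create_process/create_thread hand out pointers to values stored
// in _processes_by_pid/_threads_by_tid, then cache those pointers in the
// "by name"/"by time" containers. That is only safe because node-based maps
// (std::multimap / std::unordered_multimap, whose emplace returns a plain
// iterator as used above) keep references to mapped values stable across
// later insertions. A minimal sketch of the invariant being relied on (the
// concrete container choice here is an assumption, not KDBG's declaration):
//
//     #include <map>
//     #include <string>
//     std::multimap<int, std::string> m;
//     std::string* p = &m.emplace(1, "one")->second;
//     m.emplace(2, "two");       // later insertions do not invalidate p
//     // *p is still "one"
//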
template <typename SIZE>
MachineVoucher<SIZE>* Machine<SIZE>::create_voucher(typename SIZE::ptr_t address, AbsTime create_timestamp, kMachineVoucherFlag flags, uint32_t content_bytes_capacity) {
    ASSERT(address, "Should not be NULL");
    ASSERT(content_bytes_capacity < 4096, "Probably an error"); // This is a guesstimate, may need re-evaluation

    MachineVoucher<SIZE>* voucher;

    ASSERT(_voucher_nursery.find(address) == _voucher_nursery.end(), "Attempt to create an already live voucher (<rdar://problem/16898190>)");
    //
    // There is no real workaround for this. Other tracepoints will use the
    // address, bad things happen. You can't fix ordering bugs with cleverness
    // outside the lock :-).
    //
    // <rdar://problem/16898190> voucher create / destroy tracepoints are outside the hashtable lock
    //
    auto workaround_it = _voucher_nursery.find(address);
    if (workaround_it != _voucher_nursery.end()) {
        // We've hit a race condition; this voucher was used before the create event was posted.
        // We want to update the content_bytes_capacity, but not the create_timestamp.
        voucher = workaround_it->second.get();
        voucher->workaround_16898190(flags, content_bytes_capacity);
    } else {
        auto it = _voucher_nursery.emplace(address, std::make_unique<MachineVoucher<SIZE>>(address, AbsInterval(create_timestamp, AbsTime(0)), flags, content_bytes_capacity));
        ASSERT(it.second, "Voucher emplace in nursery failed");
        voucher = it.first->second.get();
    }

    ASSERT(voucher->is_live(), "Sanity");
    ASSERT(!voucher->is_null(), "Sanity");
    ASSERT(!voucher->is_unset(), "Sanity");

    return voucher;
}

template <typename SIZE>
void Machine<SIZE>::destroy_voucher(typename SIZE::ptr_t address, AbsTime timestamp) {
    ASSERT(address, "Should not be NULL");

    auto nursery_it = _voucher_nursery.find(address);

    // We need a voucher for every reference, so we are in the odd position
    // of creating a voucher so we can destroy it.
    if (nursery_it == _voucher_nursery.end()) {
        create_voucher(address, AbsTime(0), kMachineVoucherFlag::CreatedByFirstUse, 0);
        nursery_it = _voucher_nursery.find(address);
    }

    MachineVoucher<SIZE>* voucher = nursery_it->second.get();

    voucher->set_destroyed(timestamp);

    // First find the "row" for this address.
    auto by_addr_it = _vouchers_by_addr.find(address);
    if (by_addr_it == _vouchers_by_addr.end()) {
        // No address entry case
        //std::vector<std::unique_ptr<MachineVoucher<SIZE>>> row(std::move(nursery_it->second));
        std::vector<std::unique_ptr<MachineVoucher<SIZE>>> row;
        row.emplace_back(std::move(nursery_it->second));
        _vouchers_by_addr.emplace(address, std::move(row));
    } else {
        auto& row = by_addr_it->second;

        // Make sure these are sorted and non-overlapping
        ASSERT(row.back()->timespan() < voucher->timespan(), "Sanity");
        ASSERT(!row.back()->timespan().intersects(voucher->timespan()), "Sanity");

        row.emplace_back(std::move(nursery_it->second));
    }

    _voucher_nursery.erase(nursery_it);
}

//
// This function handles looking up a voucher by address. If necessary, it will create a new voucher.
// NOTE! Does not update voucher timestamps, that is only done at voucher destroy.
//
template <typename SIZE>
MachineVoucher<SIZE>* Machine<SIZE>::process_event_voucher_lookup(typename SIZE::ptr_t address, uint32_t msgh_bits) {
    // NOTE! There is a subtle race here, we *MUST* test msgh_bits before
    // checking for a 0 address. An unset voucher may have address 0...
    if (MACH_MSGH_BITS_VOUCHER(msgh_bits) == MACH_MSGH_BITS_ZERO)
        return &UnsetVoucher;

    if (address == 0)
        return &NullVoucher;

    auto nursery_it = _voucher_nursery.find(address);
    if (nursery_it == _voucher_nursery.end()) {
        // If no voucher exists, create a default (no-contents!) voucher.
        return create_voucher(address, AbsTime(0), kMachineVoucherFlag::CreatedByFirstUse, 0);
    }

    return nursery_it->second.get();
}
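//
// The nursery pattern above tolerates out-of-order trace events: whichever
// event references a voucher address first materializes the entry, and the
// real create/destroy events later fill in what they know. A minimal sketch
// of the same lookup-or-create idiom (hypothetical types, for illustration
// only):
//
//     #include <cstdint>
//     #include <memory>
//     #include <unordered_map>
//
//     struct Voucher { uint64_t addr; bool saw_create = false; };
//     std::unordered_map<uint64_t, std::unique_ptr<Voucher>> nursery;
//
//     Voucher* lookup_or_create(uint64_t addr) {
//         auto it = nursery.find(addr);
//         if (it == nursery.end())
//             it = nursery.emplace(addr, std::make_unique<Voucher>(Voucher{addr})).first;
//         return it->second.get();
//     }
//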
template <typename SIZE>
MachineVoucher<SIZE>* Machine<SIZE>::thread_forwarding_voucher_lookup(const MachineVoucher<SIZE>* previous_machine_state_voucher) {
    ASSERT(previous_machine_state_voucher, "Sanity");

    if (previous_machine_state_voucher == &UnsetVoucher)
        return &UnsetVoucher;

    if (previous_machine_state_voucher == &NullVoucher)
        return &NullVoucher;

    auto nursery_it = _voucher_nursery.find(previous_machine_state_voucher->address());
    if (nursery_it == _voucher_nursery.end()) {
        ASSERT(false, "Should not ever have a thread forwarding a voucher not in the nursery");
        return &UnsetVoucher;
    }

    return nursery_it->second.get();
}

//
// This is used by processes that are being fork-exec'd / exec'd. They must be
// created with some name, but it isn't their final name. For now, we are
// heavily ASSERTING state to only allow processes which are fork-exec'd /
// exec'd to set their name.
//
template <typename SIZE>
void Machine<SIZE>::set_process_name(MachineProcess<SIZE>* process, const char* name) {
    ASSERT(process, "Sanity");
    ASSERT(process->is_created_by_fork_exec() || process->is_created_by_exec(), "Sanity");
    ASSERT(process->threads().size() == 1, "Sanity");
    ASSERT(process->is_fork_exec_in_progress() || process->is_exec_in_progress(), "Sanity");
    ASSERT(name, "Sanity");

    auto by_name_range = _processes_by_name.equal_range(process->name());
    for (auto it = by_name_range.first; it != by_name_range.second; ++it) {
        if (process == it->second) {
            _processes_by_name.erase(it);
            process->set_name(name);
            _processes_by_name.emplace(process->name(), process);
            return;
        }
    }

    ASSERT(false, "Attempt to rename did not find a matching process");
}
//
// The "youngest" process/thread lookups are used during event processing,
// where we often must look up a process/thread that hasn't been updated
// to reflect current timespans. A time based lookup would fail.
//
template <typename SIZE>
MachineProcess<SIZE>* Machine<SIZE>::youngest_mutable_process(pid_t pid) {
    MachineProcess<SIZE>* youngest_process = nullptr;
    auto by_pid_range = _processes_by_pid.equal_range(pid);
    for (auto it = by_pid_range.first; it != by_pid_range.second; ++it) {
        MachineProcess<SIZE>& process = it->second;
        // Larger times are newer (younger)
        if (!youngest_process || process.timespan().location() > youngest_process->timespan().location()) {
            youngest_process = &process;
        }
    }

    return youngest_process;
}

template <typename SIZE>
MachineThread<SIZE>* Machine<SIZE>::youngest_mutable_thread(typename SIZE::ptr_t tid) {
    MachineThread<SIZE>* youngest_thread = nullptr;
    auto by_tid_range = _threads_by_tid.equal_range(tid);
    for (auto it = by_tid_range.first; it != by_tid_range.second; ++it) {
        MachineThread<SIZE>& thread = it->second;
        // Larger times are newer (younger)
        if (!youngest_thread || thread.timespan().location() > youngest_thread->timespan().location()) {
            youngest_thread = &thread;
        }
    }

    return youngest_thread;
}
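//
// The scans above are equivalent to taking std::max_element over the
// equal_range, keyed by timespan start. A minimal sketch of that alternative
// (illustrative only; KDBG keeps the explicit loop, and the mapped type here
// is simplified to a bare timestamp):
//
//     #include <algorithm>
//     #include <cstdint>
//     #include <map>
//
//     std::multimap<int, uint64_t> by_pid;   // pid -> timespan location
//
//     const uint64_t* youngest(int pid) {
//         auto range = by_pid.equal_range(pid);
//         if (range.first == range.second) return nullptr;
//         auto it = std::max_element(range.first, range.second,
//                                    [](const auto& a, const auto& b) { return a.second < b.second; });
//         return &it->second;
//     }
//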
//
// This function handles looking up a thread by tid. If necessary, it will create a new thread
// and process. Any thread / process that is looked up or created will have its timestamps updated.
//
template <typename SIZE>
MachineThread<SIZE>* Machine<SIZE>::process_event_tid_lookup(typename SIZE::ptr_t tid, AbsTime now) {
    MachineThread<SIZE>* thread = youngest_mutable_thread(tid);

    if (!thread) {
        // This is an "unknown" thread. We have no information about its name or parent process.
        char unknown_process_name[20];
        snprintf(unknown_process_name, sizeof(unknown_process_name), "unknown-%llX", (uint64_t)tid);

        //
        // Strongly considering just requiring this to be valid, and never allowing an unknown thread.
        //
        printf("UNKNOWN TID FAIL! unknown tid %llx\n", (uint64_t)tid);
        ASSERT(false, "unknown TID fail");

        MachineProcess<SIZE>* unknown_process = create_process(next_unknown_pid(), unknown_process_name, now, kMachineProcessFlag::IsUnknownProcess);
        thread = create_thread(unknown_process, tid, &UnsetVoucher, now, kMachineThreadFlag::CreatedByUnknownTidInTrace);
        unknown_process->add_thread(thread);
    }

    ASSERT(thread, "Sanity");
    ASSERT(!thread->is_trace_terminated(), "Event tid seen after trace termination");
    ASSERT(!thread->process().is_trace_terminated(), "Event pid seen after trace termination");

    return thread;
}

//
// See comments in task_policy.c for full explanation of trequested_0 & trequested_1.
//
// process_trequested_task means that the tracepoint either had a NULL thread, or specified that the tracepoint was targeted at task level.
// This only matters in 32 bit traces, where it takes both trequested_0 and trequested_1 to carry task or thread requested data.
//
// For now, there is nothing we want to see in thread_requested data.
//
template <typename SIZE>
void Machine<SIZE>::process_trequested_task(pid_t pid, typename SIZE::ptr_t trequested_0, typename SIZE::ptr_t trequested_1) {
    TaskRequestedPolicy task_requested = (SIZE::is_64_bit) ? TaskRequestedPolicy(trequested_0) : TaskRequestedPolicy((Kernel32::ptr_t)trequested_0, (Kernel32::ptr_t)trequested_1);

    if (uint32_t apptype = (uint32_t)task_requested.as_struct().t_apptype) {
        if (pid) {
            if (MachineProcess<SIZE>* target = youngest_mutable_process(pid)) {
                target->set_apptype_from_trequested(apptype);
            }
        }
    }
}

template <typename SIZE>
void Machine<SIZE>::process_trequested_thread(typename SIZE::ptr_t tid, typename SIZE::ptr_t trequested_0, typename SIZE::ptr_t trequested_1) {
    TaskRequestedPolicy task_requested = (SIZE::is_64_bit) ? TaskRequestedPolicy(trequested_0) : TaskRequestedPolicy((Kernel32::ptr_t)trequested_0, (Kernel32::ptr_t)trequested_1);

    if (uint32_t apptype = (uint32_t)task_requested.as_struct().t_apptype) {
        if (MachineThread<SIZE>* target_thread = youngest_mutable_thread(tid)) {
            target_thread->mutable_process().set_apptype_from_trequested(apptype);
        }
    }
}

#define AST_PREEMPT  0x01
#define AST_QUANTUM  0x02
#define AST_URGENT   0x04
#define AST_HANDOFF  0x08
#define AST_YIELD    0x10

#define TRACE_DATA_NEWTHREAD    0x07000004
#define TRACE_STRING_NEWTHREAD  0x07010004
#define TRACE_DATA_EXEC         0x07000008
#define TRACE_STRING_EXEC       0x07010008
#define TRACE_LOST_EVENTS       0x07020008

// From ./osfmk/i386/mp.c
#define TRACE_MP_CPU_DEACTIVATE MACHDBG_CODE(DBG_MACH_MP, 7)

// From osfmk/kern/task.h
#define TASK_POLICY_TASK        0x4
#define TASK_POLICY_THREAD      0x8
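//
// On 64 bit traces a single trace argument can carry the whole requested
// policy word; on 32 bit traces it is split across trequested_0/trequested_1
// and must be reassembled. A minimal sketch of that reassembly (assuming the
// low word rides in trequested_0, which is an assumption here; the actual
// layout is defined alongside task_policy.c and TaskRequestedPolicy):
//
//     #include <cstdint>
//     uint64_t requested_from_32bit_args(uint32_t trequested_0, uint32_t trequested_1) {
//         return ((uint64_t)trequested_1 << 32) | trequested_0;
//     }
//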
template <typename SIZE>
bool Machine<SIZE>::process_event(const KDEvent<SIZE>& event)
{
    ASSERT(!lost_events(), "Should not be processing events after TRACE_LOST_EVENTS");

    AbsTime now = event.timestamp();
    ASSERT(event.cpu() > -1 && event.cpu() < _cpus.size(), "cpu_id out of range");
    MachineCPU<SIZE>& cpu = _cpus[event.cpu()];

    if (!cpu.is_iop()) {
        //
        // If we have lost events, immediately bail.
        //
        // Pre-process events known to have bogus TID's:
        //
        // DBG_TRACE_INFO events may not have a valid TID.
        // MACH_IPC_VOUCHER_CREATE_ATTR_DATA events do not have a valid TID.
        //
        switch (event.dbg_cooked()) {
            case TRACEDBG_CODE(DBG_TRACE_INFO, 1): // kernel_debug_early_end()
            case TRACEDBG_CODE(DBG_TRACE_INFO, 4): // kernel_debug_string()
                return true;

            case TRACEDBG_CODE(DBG_TRACE_INFO, 2): // TRACE_LOST_EVENTS
                set_flags(kMachineFlag::LostEvents);
                return false;

            case MACHDBG_CODE(DBG_MACH_IPC, MACH_IPC_VOUCHER_CREATE_ATTR_DATA):
                // trace event data is
                //   data, data, data, data
                //
                // event tid is voucher address!
                if (auto voucher = process_event_voucher_lookup(event.tid(), UINT32_MAX)) {
                    voucher->add_content_bytes(event.arg1_as_pointer());
                }
                return true;

            default:
                break;
        }

        MachineThread<SIZE>* event_thread = process_event_tid_lookup(event.tid(), now);

        switch (event.dbg_cooked()) {
            case BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXIT): // the exit syscall never returns, use this instead
                // arg1 == pid exiting
                // arg2 == exit code
                if (event.is_func_end()) {
                    // Can BSD_PROC_EXIT be called from another context?
                    ASSERT((pid_t)event.arg1() == event_thread->process().pid(), "proc_exit pid does not match context pid");
                    event_thread->mutable_process().set_exit_by_syscall(now, (int32_t)event.arg2());
                }
                break;

            case TRACE_DATA_NEWTHREAD: {
                ASSERT(event.is_func_none(), "TRACE_DATA_NEWTHREAD event has start/end bit set. Should be func_none.");
                //
                // This is called by the thread creating the new thread.
                //
                // The thread id of the new thread is in arg1.
                // The pid of the process creating the new thread is in arg2.
                //
                // NOTE! This event carries enough information to create a thread, which we do.
                // However, the immediately following TRACE_STRING_NEWTHREAD does not have the
                // newly created thread's tid. We cannot assume that we will always be able to
                // read these events as a pair, they may be split by a particularly unlucky
                // buffer snapshot.
                //
                // We have a "thread nursery" which we use to associate the tid of the new
                // thread with the creating thread.
                //
                // (During fork operations, the "parent" may appear different than the child,
                // this is why we cannot reuse the parent's name and ignore the STRING event.)
                //
                auto new_thread_id = (typename SIZE::ptr_t)event.arg1();
                auto new_thread_pid = (int32_t)event.arg2();

                MachineProcess<SIZE>* new_process = youngest_mutable_process(new_thread_pid);
                kMachineThreadFlag new_thread_flags;

                //
                // Okay, it looks like we cannot pay much attention to the source of thread
                // creates, the system will create a thread for anyone at any time, in any
                // place. The new model is to look up the pid of the new thread, and if it
                // exists and is live, use that. Otherwise, fork-exec a new process.
                //
                if (new_process) {
                    new_thread_flags = kMachineThreadFlag::CreatedByTraceDataNewThread;
                } else {
                    new_thread_flags = (kMachineThreadFlag)((uint32_t)kMachineThreadFlag::CreatedByForkExecEvent |
                                                            (uint32_t)kMachineThreadFlag::IsMain);

                    auto new_process_flags = (kMachineProcessFlag)((uint32_t)kMachineProcessFlag::CreatedByForkExecEvent |
                                                                   (uint32_t)kMachineProcessFlag::IsForkExecInProgress);

                    // Create the new process
                    new_process = create_process(new_thread_pid, "###Fork#Exec###", now, new_process_flags);
                }
                ASSERT(new_process, "Sanity");
                ASSERT(!new_process->is_trace_terminated(), "Sanity");
                ASSERT(new_thread_pid != 0 || new_process == _kernel_task, "Sanity");
                new_process->add_thread(create_thread(new_process, new_thread_id, &UnsetVoucher, now, new_thread_flags));
                break;
            }

            case TRACEDBG_CODE(DBG_TRACE_DATA, TRACE_DATA_THREAD_TERMINATE): {
                ASSERT(event.is_func_none(), "Sanity");
                typename SIZE::ptr_t terminated_tid = event.arg1();
                // If tid == terminated_tid, we need to handle the lookup below differently
                ASSERT(event.tid() != terminated_tid, "Should not be possible");
                MachineThread<SIZE>* terminated_thread = process_event_tid_lookup(terminated_tid, now);
                terminated_thread->set_trace_terminated(now);

                // Was this the last thread for a given process?
                bool all_threads_trace_terminated = true;
                MachineProcess<SIZE>& process = terminated_thread->mutable_process();
                for (auto thread : process.threads()) {
                    if (!thread->is_trace_terminated()) {
                        all_threads_trace_terminated = false;
                        break;
                    }
                }

                if (all_threads_trace_terminated) {
                    process.set_trace_terminated(now);
                }
                break;
            }

            case TRACE_DATA_EXEC: {
                ASSERT(event.is_func_none(), "TRACE_DATA_EXEC event has start/end bit set. Should be func_none.");

                ASSERT(!event_thread->is_trace_terminated(), "Thread that is trace terminated is exec'ing");
                ASSERT(!event_thread->process().is_kernel(), "Kernel process is exec'ing");
                ASSERT(!event_thread->is_idle_thread(), "IDLE thread is exec'ing");

                // arg1 == pid
                int32_t exec_pid = (int32_t)event.arg1();
                ASSERT(exec_pid != -1, "Kernel thread is exec'ing");
                ASSERT(exec_pid == event_thread->process().pid() || event_thread->process().is_unknown(), "Pids should match. If not, maybe vfork?");

                if (event_thread->process().is_fork_exec_in_progress()) {
                    ASSERT(event_thread->process().threads().size() == 1, "Fork invariant violated");
                    // event_thread->mutable_process().clear_fork_exec_in_progress();

                    // Hmmm.. Do we need to propagate an apptype here?
                } else {
                    //
                    // Creating a new process will automagically clean up the
                    // existing one, setting the last known timestamp, and "PidReused"
                    //
                    auto exec_thread_flags = (kMachineThreadFlag)((uint32_t)kMachineThreadFlag::CreatedByExecEvent |
                                                                  (uint32_t)kMachineThreadFlag::IsMain);

                    auto exec_process_flags = (kMachineProcessFlag)((uint32_t)kMachineProcessFlag::CreatedByExecEvent |
                                                                    (uint32_t)kMachineProcessFlag::IsExecInProgress);

                    auto exec_process = create_process(exec_pid, "###Exec###", now, exec_process_flags);
                    MachineThread<SIZE>* exec_thread = create_thread(exec_process, event_thread->tid(), &UnsetVoucher, now, exec_thread_flags);
                    exec_process->add_thread(exec_thread);

                    int32_t apptype = event_thread->process().apptype();
                    if (apptype != -1) {
                        exec_process->set_apptype(apptype);
                    }
                }
                break;
            }

            case TRACE_STRING_EXEC: {
                ASSERT(event.is_func_none(), "TRACE_STRING_EXEC event has start/end bit set. Should be func_none.");
                ASSERT(event_thread->mutable_process().is_exec_in_progress() ||
                       event_thread->mutable_process().is_fork_exec_in_progress(), "Must be exec or fork-exec in progress to be here");

                set_process_name(&event_thread->mutable_process(), event.all_args_as_string().c_str());

                if (event_thread->process().is_exec_in_progress())
                    event_thread->mutable_process().clear_exec_in_progress();
                else
                    event_thread->mutable_process().clear_fork_exec_in_progress();
                break;
            }
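            //
            // Worked example of the exec path above (times are illustrative):
            //
            //   t=1000  TRACE_DATA_EXEC, pid 500      -> the old pid 500 process is
            //                                            exited-by-exec at t=999; a new
            //                                            "###Exec###" process begins at t=1000
            //   t=1001  TRACE_STRING_EXEC "newname"   -> "###Exec###" is renamed and the
            //                                            exec-in-progress flag is cleared
            //
            // The create_timestamp - AbsTime(1) in create_process is what keeps the
            // two half-open timelines from overlapping at t=1000.
            //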
            case MACHDBG_CODE(DBG_MACH_EXCP_INTR, 0):
                if (event.is_func_start()) {
                    cpu.set_intr(now);
                } else {
                    cpu.clear_intr(now);
                }
                break;

            case MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED):
            case MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_HANDOFF): {
                // The deactivate-switch-to-idle-thread should have happened before we see an actual
                // context switch for this cpu.
                ASSERT(!cpu.is_deactivate_switch_to_idle_thread(), "State machine fail");

                typename SIZE::ptr_t handoff_tid = event.arg2();
                // If the handoff tid and the event tid are the same, the lookup will fail an assert due to timestamps going backwards.
                MachineThread<SIZE>* handoff_thread = (handoff_tid == event.tid()) ? event_thread : process_event_tid_lookup(handoff_tid, now);
                ASSERT(handoff_thread, "Sanity");

                // If we marked a given thread as unrunnable in idle, or the MKRUNNABLE wasn't emitted, make sure we
                // mark the thread as runnable now.
                handoff_thread->make_runnable(now);
                cpu.context_switch(handoff_thread, event_thread, now);

                if (!event_thread->is_idle_thread()) {
                    if (event_thread->tid() != event.arg2()) {
                        if ((event.arg1() & (AST_PREEMPT | AST_QUANTUM | AST_URGENT | AST_HANDOFF | AST_YIELD)) == 0) {
                            event_thread->make_unrunnable(now);
                        }
                    }
                }
                break;
            }
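            //
            // Worked example of the AST check above: MACH_SCHED's arg1 appears to
            // carry the AST reason bits for the switch, so:
            //
            //   arg1 = 0x05 = AST_PREEMPT | AST_URGENT -> the outgoing thread was
            //                 preempted and is still runnable; leave it alone.
            //   arg1 = 0x00                            -> the outgoing thread blocked
            //                 voluntarily; mark it unrunnable at `now`.
            //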
            //
            // There is a rare case of:
            //
            //   event[795176] { timestamp=4b8074fa6bb5, arg1=0, arg2=0, arg3=0, arg4=0, tid=8ab77, end MP_CPU_DEACTIVATE, cpu=1 }
            //   event[795177] { timestamp=4b8074fa70bd, arg1=8ab77, arg2=ffffffffffffffff, arg3=0, arg4=4, tid=2d, --- MACH_SCHED_CHOOSE_PROCESSOR, cpu=1 }
            //
            // When a cpu shuts down via MP_CPU_DEACTIVATE, on reactivation the cpu does a forced switch to its idle thread,
            // without dropping a MACH_SCHED or MACH_STACK_HANDOFF. We want to catch this and update the cpu correctly, as
            // well as marking the idle thread.
            //
            // This is a desktop only codepath; TRACE_MP_CPU_DEACTIVATE is defined in ./osfmk/i386/mp.c
            //
            case TRACE_MP_CPU_DEACTIVATE:
                ASSERT(event_thread == cpu.thread() || !cpu.is_thread_state_initialized(), "Sanity");
                if (event.is_func_end()) {
                    cpu.set_deactivate_switch_to_idle_thread();
                }
                break;

            case MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR):
                //
                // I have seen a sequence of events like this, where it appears that multiple threads get re-dispatched:
                //
                //   event[254871] { timestamp=332dd22319b, arg1=0, arg2=0, arg3=0, arg4=0, tid=1b8ab, end MP_CPU_DEACTIVATE, cpu=7 }
                //   event[254876] { timestamp=332dd22387a, arg1=1b7d9, arg2=ffffffffffffffff, arg3=e, arg4=4, tid=1b8ab, --- MACH_SCHED_CHOOSE_PROCESSOR, cpu=7 }
                //   event[254877] { timestamp=332dd223c44, arg1=e, arg2=0, arg3=0, arg4=0, tid=1b8ab, --- MACH_SCHED_REMOTE_AST, cpu=7 }
                //   event[254887] { timestamp=332dd22441c, arg1=1b8ab, arg2=ffffffffffffffff, arg3=4, arg4=4, tid=53, --- MACH_SCHED_CHOOSE_PROCESSOR, cpu=7 }
                //
                // We will wait until we see a tid mismatch before clearing the deactivate_switch state.
                //
                if (cpu.is_deactivate_switch_to_idle_thread()) {
                    if (cpu.thread() == NULL || event_thread->tid() != cpu.thread()->tid()) {
                        // The choose tracepoint has the tid of the thread on cpu when it deactivated.
                        ASSERT(cpu.thread() == NULL || cpu.thread()->tid() == event.arg1(), "Sanity");

                        cpu.clear_deactivate_switch_to_idle_thread();
                        event_thread->set_is_idle_thread();
                        event_thread->make_runnable(now);
                        cpu.context_switch(event_thread, cpu.thread(), now);
                    }
                }
                break;

            case MACHDBG_CODE(DBG_MACH_SCHED, MACH_MAKE_RUNNABLE):
                event_thread->make_runnable(now);
                break;

            case MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE):
                if (event.is_func_start()) {
                    cpu.set_idle(now);
                } else {
                    cpu.clear_idle(now);
                }
                break;

            case MACHDBG_CODE(DBG_MACH_VM, 2 /* MACH_vmfault is hardcoded as 2 */):
                if (event.is_func_start())
                    event_thread->begin_vm_fault(now);
                else
                    event_thread->end_vm_fault(now);
                break;

            case BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM):
            case BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM_HIWAT):
                if (event.is_func_end()) {
                    if (pid_t pid = (pid_t)event.arg2()) {
                        //
                        // The time for this kill is already covered by the MEMSTAT_SCAN.
                        // We still want to mark the victim process as jetsam killed, though.
                        // We need to look up the victim, which is the pid in arg2.
                        //
                        if (MachineProcess<SIZE>* victim = youngest_mutable_process(pid)) {
                            ASSERT(!victim->is_exiting(), "Jetsam killing already dead process");
                            // This isn't technically impossible, but as a practical matter it is more likely
                            // signalling a bug than that we were able to wrap the pid counter and reuse this pid.
                            ASSERT(!victim->is_kernel(), "Cannot jetsam kernel");
                            victim->set_exit_by_jetsam(now);
                        } else {
                            ASSERT(false, "Unable to find jetsam victim pid");
                        }
                    }
                }
                break;

            case BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_SCAN):
            case BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_UPDATE):
            case BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE):
                if (event.is_func_start())
                    event_thread->begin_jetsam_activity(event.dbg_cooked(), now);
                else
                    event_thread->end_jetsam_activity(event.dbg_cooked(), now);
                break;
            //
            // IMP_TASK_APPTYPE trace args are:
            //
            // start:
            //   target_pid, trequested_0, trequested_1, apptype
            // end:
            //   target_pid, trequested_0, trequested_1, is_importance_receiver
            //
            case IMPORTANCE_CODE(IMP_TASK_APPTYPE, TASK_APPTYPE_NONE):
            case IMPORTANCE_CODE(IMP_TASK_APPTYPE, TASK_APPTYPE_DAEMON_INTERACTIVE):
            case IMPORTANCE_CODE(IMP_TASK_APPTYPE, TASK_APPTYPE_DAEMON_STANDARD):
            case IMPORTANCE_CODE(IMP_TASK_APPTYPE, TASK_APPTYPE_DAEMON_ADAPTIVE):
            case IMPORTANCE_CODE(IMP_TASK_APPTYPE, TASK_APPTYPE_DAEMON_BACKGROUND):
            case IMPORTANCE_CODE(IMP_TASK_APPTYPE, TASK_APPTYPE_APP_DEFAULT):
            case IMPORTANCE_CODE(IMP_TASK_APPTYPE, TASK_APPTYPE_APP_TAL):
                //
                // We want to set the explicit apptype now; trequested will not have the
                // apptype data until the end event.
                //
                if (event.is_func_start()) {
                    if (pid_t pid = (pid_t)event.arg1()) {
                        if (MachineProcess<SIZE>* target = youngest_mutable_process(pid)) {
                            target->set_apptype((uint32_t)event.arg4());
                        }
                    }
                }
                process_trequested_task((pid_t)event.arg1(), event.arg2(), event.arg3());
                break;

            // Trace data is
            //   self_pid, audit_token_pid_from_task(task), trequested_0(task, NULL), trequested_1(task, NULL)
            case IMPORTANCE_CODE(IMP_TASK_SUPPRESSION, 0):
            case IMPORTANCE_CODE(IMP_TASK_SUPPRESSION, 1):
            case IMPORTANCE_CODE(IMP_BOOST, IMP_BOOSTED):
            case IMPORTANCE_CODE(IMP_BOOST, IMP_UNBOOSTED):
                process_trequested_task((pid_t)event.arg2(), event.arg3(), event.arg4());
                break;

            case MACHDBG_CODE(DBG_MACH_IPC, MACH_THREAD_SET_VOUCHER): {
                //
                // This can be invoked against another thread; you must use arg1 as the tid.
                //
                // thread-tid, name, voucher, callsite-id
                //
                auto set_thread_tid = event.arg1();
                MachineThread<SIZE>* set_thread = (set_thread_tid == event.tid()) ? event_thread : process_event_tid_lookup(set_thread_tid, now);
                set_thread->set_voucher(process_event_voucher_lookup(event.arg3(), UINT32_MAX), now);
                break;
            }

            case MACHDBG_CODE(DBG_MACH_IPC, MACH_IPC_VOUCHER_CREATE):
                // trace event data is
                //   voucher address, voucher table size, system voucher count, voucher content bytes
                create_voucher(event.arg1(), now, kMachineVoucherFlag::CreatedByVoucherCreate, (uint32_t)event.arg4());
                break;

            case MACHDBG_CODE(DBG_MACH_IPC, MACH_IPC_VOUCHER_DESTROY):
                destroy_voucher(event.arg1(), now);
                break;

            // The MachMsg state chart...
            //
            // The "key" to the mach msg is the kmsg_addr.
            //
            // We can encounter a given kmsg_addr in any of
            // four possible states:
            //
            //   UNINITIALIZED
            //   SEND
            //   RECV
            //   FREE
            //
            // These are the legal state transitions:
            // (transition to UNINITIALIZED is not possible)
            //
            // UNIN -> SEND ; Accept as FREE -> SEND
            // UNIN -> RECV ; Accept as SEND -> RECV
            // UNIN -> FREE ; Accept as FREE -> FREE
            //
            // SEND -> SEND ; ERROR!
            // SEND -> RECV ; User to User IPC, send message to machine
            // SEND -> FREE ; User to Kernel IPC, recycle.
            //
            // RECV -> SEND ; ERROR!
            // RECV -> RECV ; ERROR!
            // RECV -> FREE ; End User IPC
            //
            // FREE -> SEND ; Begin User IPC
            // FREE -> RECV ; Kernel to User IPC
            // FREE -> FREE ; Kernel to Kernel IPC
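            //
            // A minimal sketch of the same legality rules as a predicate (illustrative
            // only; the switches below encode them inline):
            //
            //     enum class St { Unin, Send, Recv, Free };
            //     bool legal_transition(St from, St to) {
            //         if (to == St::Free) return true;             // all transitions to FREE are legal
            //         if (to == St::Send) return from == St::Unin || from == St::Free;
            //         if (to == St::Recv) return from == St::Unin || from == St::Send || from == St::Free;
            //         return false;                                // nothing transitions to UNINITIALIZED
            //     }
            //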
            case MACHDBG_CODE(DBG_MACH_IPC, MACH_IPC_MSG_SEND): {
                // trace event data is:
                //   kmsg_addr, msgh_bits, msgh_id, voucher_addr
                auto kmsg_addr = event.arg1();
                auto msgh_bits = (uint32_t)event.arg2();
                auto msgh_id = (uint32_t)event.arg3();
                auto voucher_addr = event.arg4();

                auto nursery_it = _mach_msg_nursery.find(kmsg_addr);
                if (nursery_it == _mach_msg_nursery.end()) {
                    nursery_it = _mach_msg_nursery.emplace(kmsg_addr, kmsg_addr).first;
                }

                auto& nursery_msg = nursery_it->second;

                switch (nursery_msg.state()) {
                    // SEND -> SEND ; ERROR!
                    // RECV -> SEND ; ERROR!
                    case kNurseryMachMsgState::Send:
                        ASSERT(false, "illegal state transition (SEND -> SEND) in nursery mach msg");
                    case kNurseryMachMsgState::Recv:
                        ASSERT(false, "illegal state transition (RECV -> SEND) in nursery mach msg");
                        break;

                    // UNIN -> SEND ; Accept as FREE -> SEND
                    // FREE -> SEND ; Begin User IPC
                    case kNurseryMachMsgState::Uninitialized:
                    case kNurseryMachMsgState::Free: {
                        uintptr_t event_index = &event - _events;
                        nursery_msg.send(event_index, event.timestamp(), event.tid(), kmsg_addr, msgh_bits, msgh_id, process_event_voucher_lookup(voucher_addr, msgh_bits));
                        break;
                    }
                }
                // We do the state set here so that release builds
                // sync to current state when errors are encountered
                nursery_msg.set_state(kNurseryMachMsgState::Send);
                break;
            }
            case MACHDBG_CODE(DBG_MACH_IPC, MACH_IPC_MSG_RECV):
            case MACHDBG_CODE(DBG_MACH_IPC, MACH_IPC_MSG_RECV_VOUCHER_REFUSED): {
                // trace event data is
                //   kmsg_addr, msgh_bits, msgh_id, voucher_addr
                auto kmsg_addr = event.arg1();
                auto msgh_bits = (uint32_t)event.arg2();
                auto voucher_addr = event.arg4();

                auto nursery_it = _mach_msg_nursery.find(kmsg_addr);
                if (nursery_it == _mach_msg_nursery.end()) {
                    nursery_it = _mach_msg_nursery.emplace(kmsg_addr, kmsg_addr).first;
                }

                auto& nursery_msg = nursery_it->second;

                uint32_t flags = (event.dbg_code() == MACH_IPC_MSG_RECV_VOUCHER_REFUSED) ? kMachineMachMsgFlag::IsVoucherRefused : 0;
                uintptr_t event_index = &event - _events;

                switch (nursery_msg.state()) {
                    // UNIN -> RECV ; Accept as SEND -> RECV
                    case kNurseryMachMsgState::Uninitialized: {
                        flags |= kMachineMachMsgFlag::HasReceiver;

                        auto mach_msg_it = _mach_msgs.emplace(_mach_msgs.end(),
                                                              NurseryMachMsg<SIZE>::message_id(),
                                                              kmsg_addr,
                                                              flags,
                                                              AbsTime(0),
                                                              0,
                                                              0,
                                                              &Machine<SIZE>::UnsetVoucher,
                                                              now,
                                                              event.tid(),
                                                              msgh_bits,
                                                              process_event_voucher_lookup(voucher_addr, msgh_bits));

                        ASSERT(_mach_msgs_by_event_index.find(event_index) == _mach_msgs_by_event_index.end(), "Stomping mach msg");
                        _mach_msgs_by_event_index[event_index] = std::distance(_mach_msgs.begin(), mach_msg_it);
                        break;
                    }

                    // SEND -> RECV ; User to User IPC, send message to machine
                    case kNurseryMachMsgState::Send: {
                        ASSERT(kmsg_addr == nursery_msg.kmsg_addr(), "Sanity");
                        ASSERT((uint32_t)event.arg3() == nursery_msg.send_msgh_id(), "Sanity");

                        flags |= (kMachineMachMsgFlag::HasSender | kMachineMachMsgFlag::HasReceiver);

                        auto mach_msg_it = _mach_msgs.emplace(_mach_msgs.end(),
                                                              nursery_msg.id(),
                                                              kmsg_addr,
                                                              flags,
                                                              nursery_msg.send_time(),
                                                              nursery_msg.send_tid(),
                                                              nursery_msg.send_msgh_bits(),
                                                              nursery_msg.send_voucher(),
                                                              now,
                                                              event.tid(),
                                                              msgh_bits,
                                                              process_event_voucher_lookup(voucher_addr, msgh_bits));

                        intptr_t send_event_index = nursery_msg.send_event_index();
                        if (send_event_index != -1) {
                            ASSERT(send_event_index < _event_count, "Sanity");
                            ASSERT(_mach_msgs_by_event_index.find(send_event_index) == _mach_msgs_by_event_index.end(), "Stomping mach msg");
                            _mach_msgs_by_event_index[send_event_index] = std::distance(_mach_msgs.begin(), mach_msg_it);
                        }
                        ASSERT(_mach_msgs_by_event_index.find(event_index) == _mach_msgs_by_event_index.end(), "Stomping mach msg");
                        _mach_msgs_by_event_index[event_index] = std::distance(_mach_msgs.begin(), mach_msg_it);
                        break;
                    }

                    // RECV -> RECV ; ERROR!
                    case kNurseryMachMsgState::Recv:
                        ASSERT(false, "illegal state transition (RECV -> RECV) in nursery mach msg");
                        break;

                    // FREE -> RECV ; Kernel to User IPC
                    case kNurseryMachMsgState::Free:
                        break;
                }

                // We do the state set here so that release builds
                // sync to current state when errors are encountered
                nursery_msg.set_state(kNurseryMachMsgState::Recv);
                break;
            }

            case MACHDBG_CODE(DBG_MACH_IPC, MACH_IPC_KMSG_FREE): {
                // trace event data is:
                //   kmsg_addr
                auto kmsg_addr = event.arg1();

                auto nursery_it = _mach_msg_nursery.find(kmsg_addr);
                if (nursery_it == _mach_msg_nursery.end()) {
                    nursery_it = _mach_msg_nursery.emplace(kmsg_addr, kmsg_addr).first;
                }

                auto& nursery_msg = nursery_it->second;

                // All transitions to FREE are legal.
                nursery_msg.set_state(kNurseryMachMsgState::Free);
                break;
            }

            default:
                // IO Activity
                //
                // There isn't an easy way to handle these inside the switch; the
                // code is used as a bitfield.
                //
                // Okay, temp note on how to approach this.
                //
                // Even a single thread may have overlapping IO activity.
                // None of the current schemes handle overlapped activity well.
                //
                // We'd like to be able to show, for any given interval, "X pages IO outstanding, Y pages completed, Z ms waiting".
                //
                // To do that, we've got to be able to intersect an arbitrary interval with a pile of overlapping intervals.
                //
                // The approach is to accumulate the IO activity into a single vector.
                // Sort by interval.location().
                // Now flatten this interval vector (union flatten).
                // This will produce a second vector of non-overlapping intervals.
                // When we want to intersect the arbitrary interval, we do the standard search on the non-overlapping interval vector.
                // This will give us a starting and ending location that are guaranteed to cover every IO that might intersect.
                //
                // The assumption is that while IO's overlap, they don't stay active forever. Sooner or later there will be a break.
                //
                // The arch-nemesis of this scheme is the light overlap, like so:
                //
                //   XXXXX XXXXXXXXXXX XXXXXXXXXXXXXXXXXXXX
                //     XXXXXXX XXXXXXXXXXXXXXXX XXXXXXXXXXXXXX
                //
                // It turns out that IO can overlap inside a single thread, for example:
                //
                //   437719 C73AD5945 --- P_RdDataAsync     209b9f07 1000002  6b647 5000 2A72 1 gamed       293
                //   437724 C73AD5DCA --- P_RdDataAsync     209b7e37 1000002  6b64c 6000 2A72 1 gamed       293
                //   437822 C73AD98B0 --- P_RdDataAsyncDone 209b7e37 4dfe3eef 0     0    191  1 kernel_task 0
                //   437829 C73AD9E55 --- P_RdDataAsyncDone 209b9f07 4dfe3eef 0     0    191  1 kernel_task 0
                //
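                //
                // A minimal sketch of the union-flatten described above (illustrative
                // only; uses plain uint64_t endpoints rather than AbsInterval):
                //
                //     #include <algorithm>
                //     #include <cstdint>
                //     #include <vector>
                //
                //     struct Ival { uint64_t begin, end; };   // half open [begin, end)
                //
                //     std::vector<Ival> flatten(std::vector<Ival> v) {
                //         std::sort(v.begin(), v.end(), [](const Ival& a, const Ival& b) { return a.begin < b.begin; });
                //         std::vector<Ival> out;
                //         for (const Ival& i : v) {
                //             if (!out.empty() && i.begin <= out.back().end)
                //                 out.back().end = std::max(out.back().end, i.end);  // merge overlap
                //             else
                //                 out.push_back(i);                                 // gap: start a new run
                //         }
                //         return out;
                //     }
                //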
                if (event.dbg_class() == DBG_FSYSTEM && event.dbg_subclass() == DBG_DKRW) {
                    uint32_t code = event.dbg_code();
                    //
                    // Disk IO doesn't use func_start/func_end
                    //
                    // arg1 == uid
                    // arg4 == size
                    //
                    if (code & DKIO_DONE) {
                        this->end_io(now, event.arg1());
                    } else {
                        // IO is initiated by a given process/thread, but it always finishes on a kernel_thread.
                        // We need to stash enough data to credit the correct thread when the completion event arrives.
                        begin_io(event_thread, now, event.arg1(), event.arg4());
                    }
                } else if (event.dbg_class() == DBG_IMPORTANCE) {
                    //
                    // Every task policy set trace code carries "trequested" data, we would like to grab them all.
                    //
                    // This subclass spans the range of 0x20 through 0x3F
                    //
                    uint32_t subclass = event.dbg_subclass();
                    if (subclass >= 0x20 && subclass <= 0x3F) {
                        // Trace event data is
                        //   targetid(task, thread), trequested_0(task, thread), trequested_1(task, thread), value
                        bool is_task_event = (event.dbg_code() & TASK_POLICY_TASK) > 0;

                        // Should not be both a task and thread event.
                        ASSERT(is_task_event != ((event.dbg_code() & TASK_POLICY_THREAD) > 0), "BEWM!");

                        if (is_task_event) {
                            process_trequested_task((pid_t)event.arg1(), event.arg2(), event.arg3());
                        } else {
                            process_trequested_thread(event.arg1(), event.arg2(), event.arg3());
                        }
                    }
                }
                break;
        }
    }

    return true;
}

template <typename SIZE>
void Machine<SIZE>::initialize_cpu_idle_intr_states() {
    ASSERT(_event_count, "Sanity");
    ASSERT(_events, "Sanity");
    ASSERT(!_cpus.empty(), "Sanity");

    // How much work do we need to do?
    uint32_t inits_needed = 0;
    uint32_t inits_done = 0;
    for (auto& cpu : _cpus) {
        if (!cpu.is_iop()) {
            inits_needed += 3;

            if (cpu.is_intr_state_initialized()) {
                inits_done++;
            }
            if (cpu.is_idle_state_initialized()) {
                inits_done++;
            }
            if (cpu.is_thread_state_initialized()) {
                inits_done++;
            }
        }
    }

    uintptr_t index;
    for (index = 0; index < _event_count; ++index) {
        const KDEvent<SIZE>& event = _events[index];
        ASSERT(event.cpu() > -1 && event.cpu() < _cpus.size(), "cpu_id out of range");
        MachineCPU<SIZE>& cpu = _cpus[event.cpu()];

        if (!cpu.is_iop()) {
            switch (event.dbg_cooked()) {
                case TRACE_LOST_EVENTS:
                    // We're done, give up.
                    return;

                case MACHDBG_CODE(DBG_MACH_EXCP_INTR, 0):
                    if (!cpu.is_intr_state_initialized()) {
                        inits_done++;

                        if (event.is_func_start()) {
                            // If we are starting an INTR now, the cpu was not in INTR prior to now.
                            cpu.initialize_intr_state(false, _events[0].timestamp());
                        } else {
                            // If we are ending an INTR now, the cpu was in INTR prior to now.
                            cpu.initialize_intr_state(true, _events[0].timestamp());
                        }
                    }
                    break;

                case MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE):
                    if (!cpu.is_idle_state_initialized()) {
                        inits_done++;

                        if (event.is_func_start()) {
                            // If we are starting Idle now, the cpu was not Idle prior to now.
                            cpu.initialize_idle_state(false, _events[0].timestamp());
                        } else {
                            // If we are ending Idle now, the cpu was Idle prior to now.
                            cpu.initialize_idle_state(true, _events[0].timestamp());
                        }
                    }
                    break;

                // I spent a day tracking this down...
                //
                // When you are actively sampling (say, every 100ms) on a machine
                // that is mostly idle, there will be long periods of VERY idle
                // cpus. So you might get a sample with no begin/end idle at all,
                // but the cpu is actually idle the entire time. Now suppose in
                // the next sample, you get a simple idle timeout in the middle,
                // and immediately go back to idle. If we treat any TID found on
                // cpu as "running", we blow up because this segment appears to
                // have the idle thread "running".
                //
                // So, to do a proper thread init, we require actual scheduler
                // activity to tell us who the thread was.
                case MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED):
                case MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_HANDOFF):
                    if (!cpu.is_thread_state_initialized()) {
                        inits_done++;

                        // We want to use the thread that *was* on cpu, not the thread being
                        // handed off to.
                        MachineThread<SIZE>* init_thread = youngest_mutable_thread(event.tid());
                        // Legal for this to be NULL!
                        cpu.initialize_thread_state(init_thread, _events[0].timestamp());
                    }
                    break;
            }
        }

        if (inits_done == inits_needed) {
            break;
        }
    }
}
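//
// The inference above reads forward until the first INTR/Idle edge per cpu:
// a "start" edge implies the cpu was NOT in that state from trace begin until
// now; an "end" edge implies it WAS. A worked example (times illustrative):
//
//   _events[0].timestamp = 100
//   cpu 2's first INTR event, at t=150, is func_end
//     -> cpu 2 is initialized as "in interrupt" over [100, 150)
//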
template <typename SIZE>
void Machine<SIZE>::begin_io(MachineThread<SIZE>* thread, AbsTime begin_time, typename SIZE::ptr_t uid, typename SIZE::ptr_t size) {
    auto it = _io_by_uid.find(uid);
    if (it == _io_by_uid.end()) {
        _io_by_uid.emplace(uid, IOActivity<SIZE>(begin_time, AbsTime(0), thread, size));
    } else {
        // We shouldn't find a valid IO entry at the uid we're installing.
        ASSERT(it->second.thread() == NULL, "Overwriting existing io entry");
        ASSERT(it->second.location() == 0, "Overwriting existing io entry");

        it->second = IOActivity<SIZE>(begin_time, AbsTime(0), thread, size);
    }
}

template <typename SIZE>
void Machine<SIZE>::end_io(AbsTime end_time, typename SIZE::ptr_t uid) {
    auto it = _io_by_uid.find(uid);

    // It's okay to not find a match, if a trace begins with a Done event, for example.
    if (it != _io_by_uid.end()) {
        MachineThread<SIZE>* io_thread = it->second.thread();
        AbsTime begin_time = it->second.location();
        ASSERT(end_time > it->second.location(), "Sanity");

        _all_io.emplace_back(begin_time, end_time - begin_time, io_thread, it->second.size());

        DEBUG_ONLY({
            it->second.set_thread(NULL);
            it->second.set_location(AbsTime(0));
            it->second.set_length(AbsTime(0));
        })
    }
}