xnu-3789.1.32.tar.gz

[apple/xnu.git] / osfmk / kern / thread.c
diff --git a/osfmk/kern/thread.c b/osfmk/kern/thread.c

index cc8e391b180d9bdde6bf6212e46d16863f7fa65a..5a703f62ffe173058be50cae4c6cef98d9772be2 100644 (file)
--- a/osfmk/kern/thread.c
+++ b/osfmk/kern/thread.c
@@ -117,6 +117,8 @@
  #include <kern/assert.h>
  #include <kern/exc_resource.h>
  #include <kern/telemetry.h>
+#include <kern/policy_internal.h>
+
  #include <corpses/task_corpse.h>
  #if KPC
  #include <kern/kpc.h>
@@ -141,6 +143,7 @@
  #include <mach/mach_host_server.h>
  #include <mach/host_priv_server.h>
  #include <mach/mach_voucher_server.h>
+#include <kern/policy_internal.h>
  
  static struct zone                     *thread_zone;
  static lck_grp_attr_t          thread_lck_grp_attr;
@@ -157,6 +160,15 @@ static queue_head_t                thread_terminate_queue;
  
  static queue_head_t            crashed_threads_queue;
  
+decl_simple_lock_data(static,thread_exception_lock)
+static queue_head_t            thread_exception_queue;
+
+struct thread_exception_elt {
+       queue_chain_t   elt;                    /* linkage on thread_exception_queue */
+       task_t          exception_task;         /* task passed to thread_exception_enqueue() */
+       thread_t        exception_thread;       /* thread passed to thread_exception_enqueue() */
+};
+
  static struct thread   thread_template, init_thread;
  
  static void            sched_call_null(
@@ -165,6 +177,7 @@ static void         sched_call_null(
  
  #ifdef MACH_BSD
  extern void proc_exit(void *);
+extern mach_exception_data_type_t proc_encode_exit_exception_code(void *);
  extern uint64_t get_dispatchqueue_offset_from_proc(void *);
  extern int      proc_selfpid(void);
  extern char *   proc_name_address(void *p);
@@ -180,8 +193,7 @@ static uint64_t             thread_unique_id = 100;
  
  struct _thread_ledger_indices thread_ledgers = { -1 };
  static ledger_template_t thread_ledger_template = NULL;
-void init_thread_ledgers(void);
-int task_disable_cpumon(task_t task);
+static void init_thread_ledgers(void);
  
  #if CONFIG_JETSAM
  void jetsam_on_ledger_cpulimit_exceeded(void);
@@ -196,7 +208,7 @@ void jetsam_on_ledger_cpulimit_exceeded(void);
  #define        CPUMON_USTACKSHOTS_TRIGGER_DEFAULT_PCT 70
  
  int cpumon_ustackshots_trigger_pct; /* Percentage. Level at which we start gathering telemetry. */
-void __attribute__((noinline)) THIS_THREAD_IS_CONSUMING_TOO_MUCH_CPU__SENDING_EXC_RESOURCE(void);
+void __attribute__((noinline)) SENDING_NOTIFICATION__THIS_THREAD_IS_CONSUMING_TOO_MUCH_CPU(void);
  
  /*
   * The smallest interval over which we support limiting CPU consumption is 1ms
@@ -210,6 +222,10 @@ thread_bootstrap(void)
          *      Fill in a template thread for fast initialization.
          */
  
+#if MACH_ASSERT
+       thread_template.thread_magic = THREAD_MAGIC;
+#endif /* MACH_ASSERT */
+
         thread_template.runq = PROCESSOR_NULL;
  
         thread_template.ref_count = 2;
@@ -230,6 +246,7 @@ thread_bootstrap(void)
         thread_template.sched_flags = 0;
         thread_template.saved_mode = TH_MODE_NONE;
         thread_template.safe_release = 0;
+       thread_template.th_sched_bucket = TH_BUCKET_RUN;
  
         thread_template.sfi_class = SFI_CLASS_UNSPECIFIED;
         thread_template.sfi_wait_class = SFI_CLASS_UNSPECIFIED;
@@ -249,10 +266,6 @@ thread_bootstrap(void)
         thread_template.pending_promoter[1] = NULL;
         thread_template.rwlock_count = 0;
  
-#if MACH_ASSERT
-       thread_template.SHARE_COUNT = 0;
-       thread_template.BG_COUNT = 0;
-#endif /* MACH_ASSERT */       
  
         thread_template.realtime.deadline = UINT64_MAX;
  
@@ -283,6 +296,7 @@ thread_bootstrap(void)
         thread_template.vtimer_user_save = 0;
         thread_template.vtimer_prof_save = 0;
         thread_template.vtimer_rlim_save = 0;
+       thread_template.vtimer_qos_save  = 0;
  
  #if CONFIG_SCHED_SFI
         thread_template.wait_sfi_begin_time = 0;
@@ -303,6 +317,13 @@ thread_bootstrap(void)
         thread_template.t_dtrace_tracing = 0;
  #endif /* CONFIG_DTRACE */
  
+#if KPERF
+       thread_template.kperf_flags = 0;
+       thread_template.kperf_pet_gen = 0;
+       thread_template.kperf_c_switch = 0;
+       thread_template.kperf_pet_cnt = 0;
+#endif
+
  #if KPC
         thread_template.kpc_buf = NULL;
  #endif
@@ -311,8 +332,6 @@ thread_bootstrap(void)
         thread_template.hv_thread_target = NULL;
  #endif /* HYPERVISOR */
  
-       thread_template.t_chud = 0;
-
  #if (DEVELOPMENT || DEBUG)
         thread_template.t_page_creation_throttled_hard = 0;
         thread_template.t_page_creation_throttled_soft = 0;
@@ -333,9 +352,8 @@ thread_bootstrap(void)
         thread_template.t_deduct_bank_ledger_time = 0;
  #endif
  
-       thread_template.requested_policy = default_task_requested_policy;
-       thread_template.effective_policy = default_task_effective_policy;
-       thread_template.pended_policy    = default_task_pended_policy;
+       thread_template.requested_policy = (struct thread_requested_policy) {};
+       thread_template.effective_policy = (struct thread_effective_policy) {};
  
         bzero(&thread_template.overrides, sizeof(thread_template.overrides));
  
@@ -381,9 +399,11 @@ thread_init(void)
         lck_grp_attr_setdefault(&thread_lck_grp_attr);
         lck_grp_init(&thread_lck_grp, "thread", &thread_lck_grp_attr);
         lck_attr_setdefault(&thread_lck_attr);
-       
+
         stack_init();
  
+       thread_policy_init();
+
         /*
          *      Initialize any machine-dependent
          *      per-thread structures necessary.
@@ -400,6 +420,19 @@ thread_init(void)
         init_thread_ledgers();
  }
  
+/*
+ *     thread_corpse_continue:
+ *
+ *     Terminates the calling thread: calls thread_terminate_internal()
+ *     on current_thread(), disables interrupts, and then invokes
+ *     ast_taken(AST_APC, TRUE).  Control is not expected to come back
+ *     from ast_taken(); if it does, the function panics.
+ *
+ *     NOTE(review): the name suggests this is the continuation used for
+ *     threads of a corpse task -- confirm against the callers.
+ */
+void
+thread_corpse_continue(void)
+{
+       thread_t thread = current_thread();
+
+       thread_terminate_internal(thread);
+       ml_set_interrupts_enabled(FALSE);
+       ast_taken(AST_APC, TRUE);
+
+       panic("thread_corpse_continue");
+       /*NOTREACHED*/
+}
+
  static void
  thread_terminate_continue(void)
  {
@@ -425,14 +458,12 @@ thread_terminate_self(void)
         thread_mtx_lock(thread);
  
         ipc_thread_disable(thread);
-       
+
         thread_mtx_unlock(thread);
  
         s = splsched();
         thread_lock(thread);
  
-       assert_thread_sched_count(thread);
-
         /*
          *      Cancel priority depression, wait for concurrent expirations
          *      on other processors.
@@ -470,14 +501,37 @@ thread_terminate_self(void)
         thread_mtx_unlock(thread);
  
         task = thread->task;
-       uthread_cleanup(task, thread->uthread, task->bsd_info, thread->inspection == 1 ? TRUE : FALSE);
+       uthread_cleanup(task, thread->uthread, task->bsd_info);
         threadcnt = hw_atomic_sub(&task->active_thread_count, 1);
  
+       if (task->bsd_info) {
+               /* trace out pid before we sign off */
+               long    dbg_arg1 = 0;
+
+               kdbg_trace_data(thread->task->bsd_info, &dbg_arg1);
+
+               KERNEL_DEBUG_CONSTANT(TRACE_DATA_THREAD_TERMINATE_PID | DBG_FUNC_NONE,
+                       dbg_arg1, 0, 0, 0, 0);
+       }
+
         /*
          * If we are the last thread to terminate and the task is
          * associated with a BSD process, perform BSD process exit.
          */
         if (threadcnt == 0 && task->bsd_info != NULL) {
+               mach_exception_data_type_t subcode = 0;
+               {
+                       /* since we're the last thread in this process, trace out the command name too */
+                       long    dbg_arg1 = 0, dbg_arg2 = 0, dbg_arg3 = 0, dbg_arg4 = 0;
+
+                       kdbg_trace_string(thread->task->bsd_info, &dbg_arg1, &dbg_arg2, &dbg_arg3, &dbg_arg4);
+
+                       KERNEL_DEBUG_CONSTANT(TRACE_STRING_PROC_EXIT | DBG_FUNC_NONE,
+                               dbg_arg1, dbg_arg2, dbg_arg3, dbg_arg4, 0);
+               }
+
+               /* Get the exit reason before proc_exit */
+               subcode = proc_encode_exit_exception_code(task->bsd_info);
                 proc_exit(task->bsd_info);
                 /*
                  * if there is crash info in task
@@ -485,9 +539,18 @@ thread_terminate_self(void)
                  * last thread for this task.
                  */
                 if (task->corpse_info) {
-                       task_deliver_crash_notification(task);
+                       task_deliver_crash_notification(task, current_thread(), subcode);
+               }
+       }
+
+       if (threadcnt == 0) {
+               task_lock(task);
+               if (task_is_a_corpse_fork(task)) {
+                       thread_wakeup((event_t)&task->active_thread_count);
                 }
+               task_unlock(task);
         }
+
         uthread_cred_free(thread->uthread);
  
         s = splsched();
@@ -542,8 +605,12 @@ thread_terminate_self(void)
  void
  thread_deallocate_safe(thread_t thread)
  {
-       if (__improbable(hw_atomic_sub(&(thread)->ref_count, 1) == 0))
-               panic("bad thread refcount!");
+       assert_thread_magic(thread);
+
+       uint32_t old_refcount = hw_atomic_sub(&(thread)->ref_count, 1) + 1;
+
+       if (__improbable(old_refcount <= 1))
+               panic("bad thread refcount: %d", old_refcount);
  }
  
  void
@@ -555,6 +622,9 @@ thread_deallocate(
         if (thread == THREAD_NULL)
                 return;
  
+       assert_thread_magic(thread);
+       assert(thread->ref_count > 0);
+
         if (__probable(hw_atomic_sub(&(thread)->ref_count, 1) > 0))
                 return;
  
@@ -563,6 +633,8 @@ thread_deallocate(
  
         assert(thread->runq == PROCESSOR_NULL);
  
+       assert(thread->user_promotions == 0);
+
  #if KPC
         kpc_thread_destroy(thread);
  #endif
@@ -601,9 +673,107 @@ thread_deallocate(
  
         task_deallocate(task);
  
+#if MACH_ASSERT
+       assert_thread_magic(thread);
+       thread->thread_magic = 0;
+#endif /* MACH_ASSERT */
+
         zfree(thread_zone, thread);
  }
  
+/*
+ *     thread_exception_daemon:
+ *
+ *     Deliver EXC_RESOURCE exception
+ *
+ *     Drains thread_exception_queue.  For each queued element it frees
+ *     the element, waits for the threads of the element's task to
+ *     terminate, drops one task reference and one thread reference, and
+ *     calls task_deliver_crash_notification() with a subcode of 0.  Once
+ *     the queue is empty it waits on the queue's address and blocks with
+ *     itself as the continuation, so each wakeup restarts the function
+ *     from the top.
+ *
+ *     thread_exception_lock is held only around queue manipulation and
+ *     the assert_wait; it is dropped while an element is processed.
+ */
+static void
+thread_exception_daemon(void)
+{
+       struct thread_exception_elt *elt;
+       task_t task;
+       thread_t thread;
+
+       simple_lock(&thread_exception_lock);
+       while ((elt = (struct thread_exception_elt *)dequeue_head(&thread_exception_queue)) != NULL) {
+               simple_unlock(&thread_exception_lock);
+
+               /* Copy the payload out so the element can be freed right away. */
+               task = elt->exception_task;
+               thread = elt->exception_thread;
+               assert_thread_magic(thread);
+
+               kfree(elt, sizeof(struct thread_exception_elt));
+
+               /* wait for all the threads in the task to terminate */
+               task_lock(task);
+               task_wait_till_threads_terminate_locked(task);
+               task_unlock(task);
+
+               /*
+                * NOTE(review): both references are dropped here, yet task and
+                * thread are still passed to task_deliver_crash_notification()
+                * below.  Presumably other references keep them alive until
+                * the notification has been delivered -- confirm.
+                */
+               /* Consumes the task ref returned by task_generate_corpse_internal */
+               task_deallocate(task);
+               /* Consumes the thread ref returned by task_generate_corpse_internal */
+               thread_deallocate(thread);
+
+               /* Deliver the EXC_RESOURCE notification, also clears the corpse. */
+               task_deliver_crash_notification(task, thread, 0);
+
+               simple_lock(&thread_exception_lock);
+       }
+
+       /*
+        * Queue is empty.  Register the wait while still holding the lock
+        * so the check above and the wait registration are not separated
+        * by a window in which the queue lock is free.
+        */
+       assert_wait((event_t)&thread_exception_queue, THREAD_UNINT);
+       simple_unlock(&thread_exception_lock);
+
+       thread_block((thread_continue_t)thread_exception_daemon);
+}
+
+/*
+ *     thread_exception_enqueue:
+ *
+ *     Queue a task/thread pair for thread_exception_daemon, which
+ *     delivers the EXC_RESOURCE notification for it.
+ *
+ *     Allocates a queue element recording `task` and `thread`, appends
+ *     it to thread_exception_queue under thread_exception_lock, and
+ *     wakes any waiter on the queue's address.
+ *
+ *     thread_exception_daemon deallocates both the task and the thread
+ *     after dequeuing, so the caller presumably hands over one
+ *     reference to each -- confirm against the callers.
+ *
+ *     NOTE(review): the kalloc() result is used without a NULL check;
+ *     confirm that this allocation cannot fail in this context.
+ */
+void
+thread_exception_enqueue(
+       task_t          task,
+       thread_t        thread)
+{
+       struct thread_exception_elt *elt = (struct thread_exception_elt*) kalloc(
+                                               sizeof(struct thread_exception_elt));
+
+       elt->exception_task = task;
+       elt->exception_thread = thread;
+
+       simple_lock(&thread_exception_lock);
+       /* The cast relies on the queue_chain_t being the first member of the element. */
+       enqueue_tail(&thread_exception_queue, (queue_entry_t)elt);
+       simple_unlock(&thread_exception_lock);
+
+       thread_wakeup((event_t)&thread_exception_queue);
+}
+
+/*
+ *     thread_copy_resource_info
+ *
+ *     Copy the resource info counters from source
+ *     thread to destination thread.
+ *
+ *     Copies the thread tag, the context-switch counters, the user and
+ *     system timers together with their saved values, and the syscall
+ *     counters; rolls src_thread's thread ledger up into dst_thread's
+ *     via ledger_rollup(); and copies the I/O statistics structure by
+ *     value.
+ *
+ *     Both threads' thread_io_stats pointers are dereferenced
+ *     unconditionally, so both must be valid when this is called.
+ */
+void
+thread_copy_resource_info(
+       thread_t dst_thread,
+       thread_t src_thread)
+{
+       dst_thread->thread_tag = src_thread->thread_tag;
+       dst_thread->c_switch = src_thread->c_switch;
+       dst_thread->p_switch = src_thread->p_switch;
+       dst_thread->ps_switch = src_thread->ps_switch;
+       dst_thread->precise_user_kernel_time = src_thread->precise_user_kernel_time;
+       dst_thread->user_timer = src_thread->user_timer;
+       dst_thread->user_timer_save = src_thread->user_timer_save;
+       /*
+        * Copy the live system timer alongside its saved value, mirroring
+        * the user_timer/user_timer_save pair above; otherwise the
+        * destination reports no accumulated system time.
+        */
+       dst_thread->system_timer = src_thread->system_timer;
+       dst_thread->system_timer_save = src_thread->system_timer_save;
+       dst_thread->syscalls_unix = src_thread->syscalls_unix;
+       dst_thread->syscalls_mach = src_thread->syscalls_mach;
+       ledger_rollup(dst_thread->t_threadledger, src_thread->t_threadledger);
+       *dst_thread->thread_io_stats = *src_thread->thread_io_stats;
+}
+
  /*
   *     thread_terminate_daemon:
   *
@@ -621,7 +791,8 @@ thread_terminate_daemon(void)
         (void)splsched();
         simple_lock(&thread_terminate_lock);
  
-       while ((thread = (thread_t)dequeue_head(&thread_terminate_queue)) != THREAD_NULL) {
+       while ((thread = qe_dequeue_head(&thread_terminate_queue, struct thread, runq_links)) != THREAD_NULL) {
+               assert_thread_magic(thread);
  
                 /* 
                  * if marked for crash reporting, skip reaping. 
@@ -629,16 +800,13 @@ thread_terminate_daemon(void)
                  * for reaping when done
                  */
                 if (thread->inspection){
-                       enqueue_tail(&crashed_threads_queue, (queue_entry_t)thread);
+                       enqueue_tail(&crashed_threads_queue, &thread->runq_links);
                         continue;
                 }
  
                 simple_unlock(&thread_terminate_lock);
                 (void)spllo();
  
-               assert(thread->SHARE_COUNT == 0);
-               assert(thread->BG_COUNT == 0);
-
                 task = thread->task;
  
                 task_lock(task);
@@ -659,8 +827,10 @@ thread_terminate_daemon(void)
                 task->task_timer_wakeups_bin_1 += thread->thread_timer_wakeups_bin_1;
                 task->task_timer_wakeups_bin_2 += thread->thread_timer_wakeups_bin_2;
                 task->task_gpu_ns += ml_gpu_stat(thread);
-               
-               thread_update_qos_cpu_time(thread, FALSE);
+               task->task_energy += ml_energy_stat(thread);
+
+               thread_update_qos_cpu_time(thread);
+
                 queue_remove(&task->threads, thread, thread_t, task_threads);
                 task->thread_count--;
  
@@ -707,7 +877,7 @@ thread_terminate_enqueue(
         KERNEL_DEBUG_CONSTANT(TRACE_DATA_THREAD_TERMINATE | DBG_FUNC_NONE, thread->thread_id, 0, 0, 0, 0);
  
         simple_lock(&thread_terminate_lock);
-       enqueue_tail(&thread_terminate_queue, (queue_entry_t)thread);
+       enqueue_tail(&thread_terminate_queue, &thread->runq_links);
         simple_unlock(&thread_terminate_lock);
  
         thread_wakeup((event_t)&thread_terminate_queue);
@@ -715,13 +885,13 @@ thread_terminate_enqueue(
  
  /*
   * thread_terminate_crashed_threads:
- * walk the list of crashed therds and put back set of threads
+ * walk the list of crashed threads and put back set of threads
   * who are no longer being inspected.
   */
  void
  thread_terminate_crashed_threads()
  {
-       thread_t th_iter, th_remove;
+       thread_t th_remove;
         boolean_t should_wake_terminate_queue = FALSE;
  
         simple_lock(&thread_terminate_lock);
@@ -729,16 +899,13 @@ thread_terminate_crashed_threads()
          * loop through the crashed threads queue
          * to put any threads that are not being inspected anymore
          */
-       th_iter = (thread_t)queue_first(&crashed_threads_queue);
-       while (!queue_end(&crashed_threads_queue, (queue_entry_t)th_iter)) {
-               th_remove = th_iter;
-               th_iter = (thread_t)queue_next(&th_iter->links);
  
+       qe_foreach_element_safe(th_remove, &crashed_threads_queue, runq_links) {
                 /* make sure current_thread is never in crashed queue */
                 assert(th_remove != current_thread());
-               if (th_remove->inspection != TRUE){
-                       remque((queue_entry_t)th_remove);
-                       enqueue_tail(&thread_terminate_queue, (queue_entry_t)th_remove);
+
+               if (th_remove->inspection == FALSE) {
+                       re_queue_tail(&thread_terminate_queue, &th_remove->runq_links);
                         should_wake_terminate_queue = TRUE;
                 }
         }
@@ -764,7 +931,9 @@ thread_stack_daemon(void)
         s = splsched();
         simple_lock(&thread_stack_lock);
  
-       while ((thread = (thread_t)dequeue_head(&thread_stack_queue)) != THREAD_NULL) {
+       while ((thread = qe_dequeue_head(&thread_stack_queue, struct thread, runq_links)) != THREAD_NULL) {
+               assert_thread_magic(thread);
+
                 simple_unlock(&thread_stack_lock);
                 splx(s);
  
@@ -801,9 +970,10 @@ thread_stack_enqueue(
         thread_t                thread)
  {
         KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED,MACH_STACK_WAIT) | DBG_FUNC_START, thread_tid(thread), 0, 0, 0, 0);
+       assert_thread_magic(thread);
  
         simple_lock(&thread_stack_lock);
-       enqueue_tail(&thread_stack_queue, (queue_entry_t)thread);
+       enqueue_tail(&thread_stack_queue, &thread->runq_links);
         simple_unlock(&thread_stack_lock);
  
         thread_wakeup((event_t)&thread_stack_queue);
@@ -833,11 +1003,21 @@ thread_daemon_init(void)
                 panic("thread_daemon_init: thread_stack_daemon");
  
         thread_deallocate(thread);
+
+       simple_lock_init(&thread_exception_lock, 0);
+       queue_init(&thread_exception_queue);
+
+       result = kernel_thread_start_priority((thread_continue_t)thread_exception_daemon, NULL, MINPRI_KERNEL, &thread);
+       if (result != KERN_SUCCESS)
+               panic("thread_daemon_init: thread_exception_daemon");
+
+       thread_deallocate(thread);
  }
  
  #define TH_OPTION_NONE         0x00
  #define TH_OPTION_NOCRED       0x01
  #define TH_OPTION_NOSUSP       0x02
+
  /*
   * Create a new thread.
   * Doesn't start the thread running.
@@ -871,6 +1051,10 @@ thread_create_internal(
  #ifdef MACH_BSD
         new_thread->uthread = uthread_alloc(parent_task, new_thread, (options & TH_OPTION_NOCRED) != 0);
         if (new_thread->uthread == NULL) {
+#if MACH_ASSERT
+               new_thread->thread_magic = 0;
+#endif /* MACH_ASSERT */
+
                 zfree(thread_zone, new_thread);
                 return (KERN_RESOURCE_SHORTAGE);
         }
@@ -882,11 +1066,15 @@ thread_create_internal(
  
                 new_thread->uthread = NULL;
                 /* cred free may not be necessary */
-               uthread_cleanup(parent_task, ut, parent_task->bsd_info, FALSE);
+               uthread_cleanup(parent_task, ut, parent_task->bsd_info);
                 uthread_cred_free(ut);
                 uthread_zone_free(ut);
  #endif  /* MACH_BSD */
  
+#if MACH_ASSERT
+               new_thread->thread_magic = 0;
+#endif /* MACH_ASSERT */
+
                 zfree(thread_zone, new_thread);
                 return (KERN_FAILURE);
         }
@@ -915,11 +1103,13 @@ thread_create_internal(
         lck_mtx_lock(&tasks_threads_lock);
         task_lock(parent_task);
  
-       if (    !parent_task->active || parent_task->halting ||
-                       ((options & TH_OPTION_NOSUSP) != 0 &&
-                               parent_task->suspend_count > 0) ||
-                       (parent_task->thread_count >= task_threadmax &&
-                               parent_task != kernel_task)             ) {
+       /*
+        * Fail thread creation if parent task is being torn down or has too many threads
+        * If the caller asked for TH_OPTION_NOSUSP, also fail if the parent task is suspended
+        */
+       if (parent_task->active == 0 || parent_task->halting ||
+           (parent_task->suspend_count > 0 && (options & TH_OPTION_NOSUSP) != 0) ||
+           (parent_task->thread_count >= task_threadmax && parent_task != kernel_task)) {
                 task_unlock(parent_task);
                 lck_mtx_unlock(&tasks_threads_lock);
  
@@ -928,7 +1118,7 @@ thread_create_internal(
                         void *ut = new_thread->uthread;
  
                         new_thread->uthread = NULL;
-                       uthread_cleanup(parent_task, ut, parent_task->bsd_info, FALSE);
+                       uthread_cleanup(parent_task, ut, parent_task->bsd_info);
                         /* cred free may not be necessary */
                         uthread_cred_free(ut);
                         uthread_zone_free(ut);
@@ -963,7 +1153,6 @@ thread_create_internal(
                 ledger_entry_setactive(new_thread->t_threadledger, thread_ledgers.cpu_time);
         }
  
-       new_thread->cpu_time_last_qos = 0;
  #ifdef CONFIG_BANK
         new_thread->t_bankledger = LEDGER_NULL;
         new_thread->t_deduct_bank_ledger_time = 0;
@@ -987,19 +1176,9 @@ thread_create_internal(
  #if KPC
         kpc_thread_create(new_thread);
  #endif
-       
-       /* Only need to update policies pushed from task to thread */
-       new_thread->requested_policy.bg_iotier  = parent_task->effective_policy.bg_iotier;
-       new_thread->requested_policy.terminated = parent_task->effective_policy.terminated;
  
         /* Set the thread's scheduling parameters */
-#if defined(CONFIG_SCHED_TIMESHARE_CORE)
-       new_thread->sched_stamp = sched_tick;
-       new_thread->pri_shift = sched_pri_shift;
-#endif /* defined(CONFIG_SCHED_TIMESHARE_CORE) */
-
         new_thread->sched_mode = SCHED(initial_thread_sched_mode)(parent_task);
-       new_thread->sched_flags = 0;
         new_thread->max_priority = parent_task->max_priority;
         new_thread->task_priority = parent_task->priority;
  
@@ -1009,10 +1188,14 @@ thread_create_internal(
                 new_priority = new_thread->max_priority;
  
         new_thread->importance = new_priority - new_thread->task_priority;
-       new_thread->saved_importance = new_thread->importance;
  
         sched_set_thread_base_priority(new_thread, new_priority);
  
+#if defined(CONFIG_SCHED_TIMESHARE_CORE)
+       new_thread->sched_stamp = sched_tick;
+       new_thread->pri_shift = sched_pri_shifts[new_thread->th_sched_bucket];
+#endif /* defined(CONFIG_SCHED_TIMESHARE_CORE) */
+
  
         thread_policy_create(new_thread);
  
@@ -1030,7 +1213,13 @@ thread_create_internal(
         threads_count++;
  
         new_thread->active = TRUE;
-       new_thread->inspection = FALSE;
+       if (task_is_a_corpse_fork(parent_task)) {
+               /* Set the inspection bit if the task is a corpse fork */
+               new_thread->inspection = TRUE;
+       } else {
+               new_thread->inspection = FALSE;
+       }
+       new_thread->corpse_dup = FALSE;
         *out_thread = new_thread;
  
         {
@@ -1039,14 +1228,14 @@ thread_create_internal(
                 kdbg_trace_data(parent_task->bsd_info, &dbg_arg2);
  
                 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 
-                       TRACEDBG_CODE(DBG_TRACE_DATA, 1) | DBG_FUNC_NONE,
+                       TRACE_DATA_NEWTHREAD | DBG_FUNC_NONE,
                         (vm_address_t)(uintptr_t)thread_tid(new_thread), dbg_arg2, 0, 0, 0);
  
                 kdbg_trace_string(parent_task->bsd_info,
                                                         &dbg_arg1, &dbg_arg2, &dbg_arg3, &dbg_arg4);
  
                 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 
-                       TRACEDBG_CODE(DBG_TRACE_STRING, 1) | DBG_FUNC_NONE,
+                       TRACE_STRING_NEWTHREAD | DBG_FUNC_NONE,
                         dbg_arg1, dbg_arg2, dbg_arg3, dbg_arg4, 0);
         }
  
@@ -1121,14 +1310,14 @@ thread_create_with_continuation(
  
  static kern_return_t
  thread_create_running_internal2(
-       register task_t         task,
+       task_t         task,
         int                     flavor,
         thread_state_t          new_state,
         mach_msg_type_number_t  new_state_count,
         thread_t                                *new_thread,
         boolean_t                               from_user)
  {
-       register kern_return_t  result;
+       kern_return_t  result;
         thread_t                                thread;
  
         if (task == TASK_NULL || task == kernel_task)
@@ -1138,6 +1327,9 @@ thread_create_running_internal2(
         if (result != KERN_SUCCESS)
                 return (result);
  
+       if (task->suspend_count > 0)
+               thread_hold(thread);
+
         result = machine_thread_set_state(thread, flavor, new_state, new_state_count);
         if (result != KERN_SUCCESS) {
                 task_unlock(task);
@@ -1149,7 +1341,7 @@ thread_create_running_internal2(
         }
  
         thread_mtx_lock(thread);
-       thread_start_internal(thread);
+       thread_start(thread);
         thread_mtx_unlock(thread);
  
         if (from_user)
@@ -1166,7 +1358,7 @@ thread_create_running_internal2(
  /* Prototype, see justification above */
  kern_return_t
  thread_create_running(
-       register task_t         task,
+       task_t         task,
         int                     flavor,
         thread_state_t          new_state,
         mach_msg_type_number_t  new_state_count,
@@ -1174,7 +1366,7 @@ thread_create_running(
  
  kern_return_t
  thread_create_running(
-       register task_t         task,
+       task_t         task,
         int                     flavor,
         thread_state_t          new_state,
         mach_msg_type_number_t  new_state_count,
@@ -1187,7 +1379,7 @@ thread_create_running(
  
  kern_return_t
  thread_create_running_from_user(
-       register task_t         task,
+       task_t         task,
         int                     flavor,
         thread_state_t          new_state,
         mach_msg_type_number_t  new_state_count,
@@ -1227,6 +1419,39 @@ thread_create_workq(
         return (KERN_SUCCESS);
  }
  
+/*
+ *     thread_create_workq_waiting:
+ *
+ *     Create a thread in `task` and start it waiting on `event`, using
+ *     thread_start_in_assert_wait() with THREAD_INTERRUPTIBLE.
+ *
+ *     Returns KERN_INVALID_ARGUMENT for TASK_NULL or kernel_task.  Any
+ *     failure from thread_create_internal() is returned unchanged and
+ *     *new_thread is left untouched.  On success the new thread is
+ *     stored in *new_thread; if the task's suspend_count is non-zero
+ *     the thread is placed on hold before it is started.
+ */
+kern_return_t
+thread_create_workq_waiting(
+       task_t              task,
+       thread_continue_t   thread_return,
+       event_t             event,
+       thread_t            *new_thread)
+{
+       thread_t            thread;
+       kern_return_t       result;
+
+       if (task == TASK_NULL || task == kernel_task)
+               return KERN_INVALID_ARGUMENT;
+
+       result = thread_create_internal(task, -1, thread_return, TH_OPTION_NOCRED | TH_OPTION_NOSUSP, &thread);
+
+       if (result != KERN_SUCCESS)
+               return result;
+
+       if (task->suspend_count > 0)
+               thread_hold(thread);
+
+       thread_mtx_lock(thread);
+       thread_start_in_assert_wait(thread, event, THREAD_INTERRUPTIBLE);
+       thread_mtx_unlock(thread);
+
+       /*
+        * NOTE(review): neither lock is acquired in this function;
+        * presumably thread_create_internal() returns with the task lock
+        * and tasks_threads_lock held on success -- confirm.
+        */
+       task_unlock(task);
+       lck_mtx_unlock(&tasks_threads_lock);
+
+       *new_thread = thread;
+
+       return result;
+}
+
  /*
   *     kernel_thread_create:
   *
@@ -1281,7 +1506,7 @@ kernel_thread_start_priority(
         *new_thread = thread;   
  
         thread_mtx_lock(thread);
-       thread_start_internal(thread);
+       thread_start(thread);
         thread_mtx_unlock(thread);
  
         return (result);
@@ -1373,7 +1598,7 @@ retrieve_thread_basic_info(thread_t thread, thread_basic_info_t basic_info)
  
  kern_return_t
  thread_info_internal(
-       register thread_t               thread,
+       thread_t                thread,
         thread_flavor_t                 flavor,
         thread_info_t                   thread_info_out,        /* ptr to OUT array */
         mach_msg_type_number_t  *thread_info_count)     /*IN/OUT*/
@@ -1402,7 +1627,7 @@ thread_info_internal(
         }
         else
         if (flavor == THREAD_IDENTIFIER_INFO) {
-               register thread_identifier_info_t       identifier_info;
+               thread_identifier_info_t        identifier_info;
  
                 if (*thread_info_count < THREAD_IDENTIFIER_INFO_COUNT)
                         return (KERN_INVALID_ARGUMENT);
@@ -1710,6 +1935,12 @@ thread_wire(
  }
  
  
+/*
+ *     is_vm_privileged:
+ *
+ *     Returns TRUE if the current thread has TH_OPT_VMPRIV set in its
+ *     options, FALSE otherwise.
+ */
+boolean_t
+is_vm_privileged(void)
+{
+       return current_thread()->options & TH_OPT_VMPRIV ? TRUE : FALSE;
+}
+
  boolean_t
  set_vm_privilege(boolean_t privileged)
  {
@@ -1810,12 +2041,12 @@ thread_cputime_callback(int warning, __unused const void *arg0, __unused const v
  #endif
  
         if (warning == 0) {
-               THIS_THREAD_IS_CONSUMING_TOO_MUCH_CPU__SENDING_EXC_RESOURCE();
+               SENDING_NOTIFICATION__THIS_THREAD_IS_CONSUMING_TOO_MUCH_CPU();
         }
  }
  
  void __attribute__((noinline))
-THIS_THREAD_IS_CONSUMING_TOO_MUCH_CPU__SENDING_EXC_RESOURCE(void)
+SENDING_NOTIFICATION__THIS_THREAD_IS_CONSUMING_TOO_MUCH_CPU(void)
  {
         int          pid                = 0;
         task_t           task                           = current_task();
@@ -1827,41 +2058,45 @@ THIS_THREAD_IS_CONSUMING_TOO_MUCH_CPU__SENDING_EXC_RESOURCE(void)
         time_value_t thread_user_time;
         int          action;
         uint8_t      percentage;
-       uint32_t     limit_percent;
-       uint32_t     usage_percent;
+       uint32_t     usage_percent = 0;
         uint32_t     interval_sec;
         uint64_t     interval_ns;
         uint64_t     balance_ns;
         boolean_t        fatal = FALSE;
+       boolean_t        send_exc_resource = TRUE; /* in addition to RESOURCE_NOTIFY */
+       kern_return_t   kr;
  
+#ifdef EXC_RESOURCE_MONITORS
         mach_exception_data_type_t      code[EXCEPTION_CODE_MAX];
+#endif /* EXC_RESOURCE_MONITORS */
         struct ledger_entry_info        lei;
  
         assert(thread->t_threadledger != LEDGER_NULL);
  
         /*
-        * Now that a thread has tripped the monitor, disable it for the entire task.
+        * Extract the fatal bit and suspend the monitor (which clears the bit).
          */
         task_lock(task);
-       if ((task->rusage_cpu_flags & TASK_RUSECPU_FLAGS_PERTHR_LIMIT) == 0) {
-               /*
-                * The CPU usage monitor has been disabled on our task, so some other
-                * thread must have gotten here first. We only send one exception per
-                * task lifetime, so there's nothing left for us to do here.
-                */
-               task_unlock(task);
-               return;
-       }
         if (task->rusage_cpu_flags & TASK_RUSECPU_FLAGS_FATAL_CPUMON) {
                 fatal = TRUE;
+               send_exc_resource = TRUE;
         }
-       task_disable_cpumon(task);
+       /* Only one thread can be here at a time.  Whichever makes it through
+          first will successfully suspend the monitor and proceed to send the
+          notification.  Other threads will get an error trying to suspend the
+          monitor and give up on sending the notification.  In the first release,
+          the monitor won't be resumed for a number of seconds, but we may
+          eventually need to handle low-latency resume.
+        */
+       kr = task_suspend_cpumon(task);
         task_unlock(task);
+       if (kr == KERN_INVALID_ARGUMENT)        return;
  
  #ifdef MACH_BSD
         pid = proc_selfpid();
-       if (task->bsd_info != NULL)
+       if (task->bsd_info != NULL) {
                 procname = proc_name_address(task->bsd_info);
+       }
  #endif
  
         thread_get_cpulimit(&action, &percentage, &interval_ns);
@@ -1871,58 +2106,80 @@ THIS_THREAD_IS_CONSUMING_TOO_MUCH_CPU__SENDING_EXC_RESOURCE(void)
         thread_read_times(thread, &thread_user_time, &thread_system_time);
         time_value_add(&thread_total_time, &thread_user_time);
         time_value_add(&thread_total_time, &thread_system_time);
-
         ledger_get_entry_info(thread->t_threadledger, thread_ledgers.cpu_time, &lei);
  
+       /* credit/debit/balance/limit are in absolute time units;
+          the refill info is in nanoseconds. */
         absolutetime_to_nanoseconds(lei.lei_balance, &balance_ns);
-       usage_percent = (uint32_t) ((balance_ns * 100ULL) / lei.lei_last_refill);
-
-       /* Show refill period in the same units as balance, limit, etc */
-       nanoseconds_to_absolutetime(lei.lei_refill_period, &lei.lei_refill_period);
+       if (lei.lei_last_refill > 0) {
+               usage_percent = (uint32_t)((balance_ns*100ULL) / lei.lei_last_refill);
+       }
  
-       limit_percent = (uint32_t) ((lei.lei_limit * 100ULL) / lei.lei_refill_period);
+       /* TODO: show task total runtime (via TASK_ABSOLUTETIME_INFO)? */
+       printf("process %s[%d] thread %llu caught burning CPU! "
+              "It used more than %d%% CPU over %u seconds "
+              "(actual recent usage: %d%% over ~%llu seconds).  "
+              "Thread lifetime cpu usage %d.%06ds, (%d.%06d user, %d.%06d sys) "
+              "ledger balance: %lld mabs credit: %lld mabs debit: %lld mabs "
+              "limit: %llu mabs period: %llu ns last refill: %llu ns%s.\n",
+              procname, pid, tid,
+              percentage, interval_sec,
+              usage_percent,
+              (lei.lei_last_refill + NSEC_PER_SEC/2) / NSEC_PER_SEC,
+              thread_total_time.seconds, thread_total_time.microseconds,
+              thread_user_time.seconds, thread_user_time.microseconds,
+              thread_system_time.seconds,thread_system_time.microseconds,
+              lei.lei_balance, lei.lei_credit, lei.lei_debit,
+              lei.lei_limit, lei.lei_refill_period, lei.lei_last_refill,
+              (fatal ? " [fatal violation]" : ""));
  
-       /*  TODO: show task total runtime as well? see TASK_ABSOLUTETIME_INFO */
+       /*
+          For now, send RESOURCE_NOTIFY in parallel with EXC_RESOURCE.  Once
+          we have logging parity, we will stop sending EXC_RESOURCE (24508922).
+        */
  
-       if (disable_exc_resource) {
-               printf("process %s[%d] thread %llu caught burning CPU!; EXC_RESOURCE "
-                       "supressed by a boot-arg\n", procname, pid, tid);
-               return;
+       /* RESOURCE_NOTIFY MIG specifies nanoseconds of CPU time */
+       lei.lei_balance = balance_ns;
+       absolutetime_to_nanoseconds(lei.lei_limit, &lei.lei_limit);
+       trace_resource_violation(RMON_CPUUSAGE_VIOLATED, &lei);
+       kr = send_resource_violation(send_cpu_usage_violation, task, &lei,
+                                                                fatal ? kRNFatalLimitFlag : 0);
+       if (kr) {
+               printf("send_resource_violation(CPU usage, ...): error %#x\n", kr);
         }
  
-       if (audio_active) {
-               printf("process %s[%d] thread %llu caught burning CPU!; EXC_RESOURCE "
-                      "supressed due to audio playback\n", procname, pid, tid);
-               return;
+#ifdef EXC_RESOURCE_MONITORS
+       if (send_exc_resource) {
+               if (disable_exc_resource) {
+                       printf("process %s[%d] thread %llu caught burning CPU! "
+                                  "EXC_RESOURCE%s supressed by a boot-arg\n",
+                                  procname, pid, tid, fatal ? " (and termination)" : "");
+                       return;
+               }
+
+               if (audio_active) {
+                       printf("process %s[%d] thread %llu caught burning CPU! "
+                          "EXC_RESOURCE & termination supressed due to audio playback\n",
+                                  procname, pid, tid);
+                       return;
+               }
         }
-       printf("process %s[%d] thread %llu caught burning CPU! "
-              "It used more than %d%% CPU (Actual recent usage: %d%%) over %d seconds. "
-              "thread lifetime cpu usage %d.%06d seconds, (%d.%06d user, %d.%06d system) "
-              "ledger info: balance: %lld credit: %lld debit: %lld limit: %llu (%d%%) "
-              "period: %llu time since last refill (ns): %llu %s\n",
-              procname, pid, tid,
-              percentage, usage_percent,  interval_sec,
-              thread_total_time.seconds,  thread_total_time.microseconds,
-              thread_user_time.seconds,   thread_user_time.microseconds,
-              thread_system_time.seconds, thread_system_time.microseconds,
-              lei.lei_balance,
-              lei.lei_credit,             lei.lei_debit,
-              lei.lei_limit,              limit_percent,
-              lei.lei_refill_period,      lei.lei_last_refill,
-              (fatal ? "[fatal violation]" : ""));
-
-
-       code[0] = code[1] = 0;
-       EXC_RESOURCE_ENCODE_TYPE(code[0], RESOURCE_TYPE_CPU);
-       if (fatal) {
-               EXC_RESOURCE_ENCODE_FLAVOR(code[0], FLAVOR_CPU_MONITOR_FATAL);
-       }else {
-               EXC_RESOURCE_ENCODE_FLAVOR(code[0], FLAVOR_CPU_MONITOR);
+
+
+       if (send_exc_resource) {
+               code[0] = code[1] = 0;
+               EXC_RESOURCE_ENCODE_TYPE(code[0], RESOURCE_TYPE_CPU);
+               if (fatal) {
+                       EXC_RESOURCE_ENCODE_FLAVOR(code[0], FLAVOR_CPU_MONITOR_FATAL);
+               }else {
+                       EXC_RESOURCE_ENCODE_FLAVOR(code[0], FLAVOR_CPU_MONITOR);
+               }
+               EXC_RESOURCE_CPUMONITOR_ENCODE_INTERVAL(code[0], interval_sec);
+               EXC_RESOURCE_CPUMONITOR_ENCODE_PERCENTAGE(code[0], percentage);
+               EXC_RESOURCE_CPUMONITOR_ENCODE_PERCENTAGE(code[1], usage_percent);
+               exception_triage(EXC_RESOURCE, code, EXCEPTION_CODE_MAX);
         }
-       EXC_RESOURCE_CPUMONITOR_ENCODE_INTERVAL(code[0], interval_sec);
-       EXC_RESOURCE_CPUMONITOR_ENCODE_PERCENTAGE(code[0], limit_percent);
-       EXC_RESOURCE_CPUMONITOR_ENCODE_PERCENTAGE(code[1], usage_percent);
-       exception_triage(EXC_RESOURCE, code, EXCEPTION_CODE_MAX);
+#endif /* EXC_RESOURCE_MONITORS */
  
         if (fatal) {
  #if CONFIG_JETSAM
@@ -1965,9 +2222,13 @@ void thread_update_io_stats(thread_t thread, int size, int io_flags)
         UPDATE_IO_STATS(thread->thread_io_stats->total_io, size);
         UPDATE_IO_STATS_ATOMIC(thread->task->task_io_stats->total_io, size);
  
+       if (!(io_flags & DKIO_READ)) {
+               DTRACE_IO3(physical_writes, struct task *, thread->task, uint32_t, size, int, io_flags);
+               ledger_credit(thread->task->ledger, task_ledgers.physical_writes, size);
+       }
  }
  
-void
+static void
  init_thread_ledgers(void) {
         ledger_template_t t;
         int idx;
@@ -2153,6 +2414,39 @@ thread_sched_call(
         thread->sched_call = (call != NULL)? call: sched_call_null;
  }
  
+sched_call_t
+thread_disable_sched_call(
+       thread_t                thread,
+       sched_call_t    call)
+{      /* Detach `call` from `thread` if it is the callout currently installed. */
+       if (call) {     /* a NULL `call` skips the locking and is returned as-is */
+               spl_t s = splsched();
+               thread_lock(thread);    /* sched_call is compared and swapped under the thread lock */
+               if (thread->sched_call == call) {
+                       thread->sched_call = sched_call_null;   /* park the thread on the null callout */
+               } else {
+                       call = NULL;    /* a different callout is installed: nothing was disabled */
+               }
+               thread_unlock(thread);
+               splx(s);
+       }
+       return call;    /* the callout that was detached, or NULL if none was */
+}
+
+void
+thread_reenable_sched_call(
+       thread_t                thread,
+       sched_call_t    call)
+{      /* Install `call` as the thread's sched callout again; no-op when `call` is NULL. */
+       if (call) {     /* NULL is what thread_disable_sched_call() returns when it detached nothing */
+               spl_t s = splsched();
+               thread_lock(thread);
+               thread_sched_call(thread, call);        /* stores `call` into thread->sched_call */
+               thread_unlock(thread);
+               splx(s);
+       }
+}
+
  void
  thread_static_param(
         thread_t                thread,
@@ -2181,19 +2475,24 @@ uint64_t
  thread_dispatchqaddr(
         thread_t                thread)
  {
-       uint64_t        dispatchqueue_addr = 0;
-       uint64_t        thread_handle = 0;
+       uint64_t        dispatchqueue_addr;     /* assigned on every path that reaches the final return */
+       uint64_t        thread_handle;
  
-       if (thread != THREAD_NULL) {
-               thread_handle = thread->machine.cthread_self;
-               
-                if (thread->inspection == TRUE)
-                       dispatchqueue_addr = thread_handle + get_task_dispatchqueue_offset(thread->task);
-                else if (thread->task->bsd_info)
-                       dispatchqueue_addr = thread_handle + get_dispatchqueue_offset_from_proc(thread->task->bsd_info);
-       }
+       if (thread == THREAD_NULL)      /* no thread: report address 0 */
+               return 0;
+
+       thread_handle = thread->machine.cthread_self;
+       if (thread_handle == 0) /* new check: a zero handle now yields 0 instead of a bare offset */
+               return 0;
+       
+       if (thread->inspection == TRUE) /* inspection threads take the offset from the task */
+               dispatchqueue_addr = thread_handle + get_task_dispatchqueue_offset(thread->task);
+       else if (thread->task->bsd_info)        /* otherwise the offset comes from the task's bsd_info */
+               dispatchqueue_addr = thread_handle + get_dispatchqueue_offset_from_proc(thread->task->bsd_info);
+       else    /* neither source of an offset is available */
+               dispatchqueue_addr = 0;
  
-       return (dispatchqueue_addr);
+       return dispatchqueue_addr;
  }
  
  /*
@@ -2509,6 +2808,28 @@ thread_get_current_voucher_origin_pid(
         return kr;
  }
  
+boolean_t
+thread_has_thread_name(thread_t th)
+{      /* Returns bsd_hasthreadname() for th->uthread, or FALSE when th or th->uthread is NULL. */
+       if ((th) && (th->uthread)) {
+               return bsd_hasthreadname(th->uthread);  /* the answer comes from the uthread */
+       }
+
+       /*
+        * This is an odd case; clients may set the thread name based on the lack of
+        * a name, but in this context there is no uthread to attach the name to.
+        */
+       return FALSE;
+}
+
+void
+thread_set_thread_name(thread_t th, const char* name)
+{      /* Passes `name` to bsd_setthreadname() for th->uthread. */
+       if ((th) && (th->uthread) && name) {    /* silently does nothing if any of the three is NULL */
+               bsd_setthreadname(th->uthread, name);
+       }
+}
+
  /*
   * thread_enable_send_importance - set/clear the SEND_IMPORTANCE thread option bit.
   */
@@ -2633,6 +2954,8 @@ void dtrace_thread_bootstrap(void)
                 if (thread->t_dtrace_flags & TH_DTRACE_EXECSUCCESS) {
                         thread->t_dtrace_flags &= ~TH_DTRACE_EXECSUCCESS;
                         DTRACE_PROC(exec__success);
+                       KDBG(BSDDBG_CODE(DBG_BSD_PROC,BSD_PROC_EXEC),
+                            task_pid(task));
                 }
                 DTRACE_PROC(start);
         }