osfmk/kern/task_policy.c

   1 /*
   2  * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28
  29 #include <mach/mach_types.h>
  30 #include <mach/task_server.h>
  31
  32 #include <kern/sched.h>
  33 #include <kern/task.h>
  34 #include <mach/thread_policy.h>
  35 #include <sys/errno.h>
  36 #include <sys/resource.h>
  37 #include <machine/limits.h>
  38 #include <kern/ledger.h>
  39 #include <kern/thread_call.h>
  40 #include <kern/sfi.h>
  41 #include <kern/coalition.h>
  42 #if CONFIG_TELEMETRY
  43 #include <kern/telemetry.h>
  44 #endif
  45
  46 #if IMPORTANCE_INHERITANCE
  47 #include <ipc/ipc_importance.h>
  48 #if IMPORTANCE_DEBUG
  49 #include <mach/machine/sdt.h>
  50 #endif /* IMPORTANCE_DEBUG */
  51 #endif /* IMPORTANCE_INHERITANCE */
  52
  53 #include <sys/kdebug.h>
  54
  55 /*
  56  *  Task Policy
  57  *
  58  *  This subsystem manages task and thread IO priority and backgrounding,
  59  *  as well as importance inheritance, process suppression, task QoS, and apptype.
  60  *  These properties have a surprising number of complex interactions, so they are
  61  *  centralized here in one state machine to simplify the implementation of those interactions.
  62  *
  63  *  Architecture:
  64  *  Threads and tasks have three policy fields: requested, effective, and pending.
  65  *  Requested represents the wishes of each interface that influences task policy.
  66  *  Effective represents the distillation of that policy into a set of behaviors.
  67  *  Pending represents updates that haven't been applied yet.
  68  *
  69  *  Each interface that has an input into the task policy state machine controls a field in requested.
  70  *  If the interface has a getter, it returns what is in the field in requested, but that is
  71  *  not necessarily what is actually in effect.
  72  *
  73  *  All kernel subsystems that behave differently based on task policy call into
  74  *  the get_effective_policy function, which returns the decision of the task policy state machine
  75  *  for that subsystem by querying only the 'effective' field.
  76  *
  77  *  Policy change operations:
  78  *  Here are the steps to change a policy on a task or thread:
  79  *  1) Lock task
  80  *  2) Change requested field for the relevant policy
  81  *  3) Run a task policy update, which recalculates effective based on requested,
  82  *     then takes a diff between the old and new versions of requested and calls the relevant
  83  *     other subsystems to apply these changes, and updates the pending field.
  84  *  4) Unlock task
  85  *  5) Run task policy update complete, which looks at the pending field to update
  86  *     subsystems which cannot be touched while holding the task lock.
  87  *
  88  *  To add a new requested policy, add the field in the requested struct, the flavor in task.h,
  89  *  the setter and getter in proc_(set|get)_task_policy*, and dump the state in task_requested_bitfield,
  90  *  then set up the effects of that behavior in task_policy_update*. If the policy manifests
  91  *  itself as a distinct effective policy, add it to the effective struct and add it to the
  92  *  proc_get_effective_policy accessor.
  93  *
  94  *  Most policies are set via proc_set_task_policy, but policies that don't fit that interface
  95  *  roll their own lock/set/update/unlock/complete code inside this file.
  96  *
  97  *
  98  *  Suppression policy
  99  *
 100  *  These are a set of behaviors that can be requested for a task.  They currently have specific
 101  *  implied actions when they're enabled, but they may be made customizable in the future.
 102  *
 103  *  When the affected task is boosted, we temporarily disable the suppression behaviors
 104  *  so that the affected process has a chance to run so it can call the API to permanently
 105  *  disable the suppression behaviors.
 106  *
 107  *  Locking
 108  *
 109  *  Changing task policy on a task or thread takes the task lock, and not the thread lock.
 110  *  TODO: Should changing policy on a thread take the thread lock instead?
 111  *
 112  *  Querying the effective policy does not take the task lock, to prevent deadlocks or slowdown in sensitive code.
 113  *  This means that any notification of state change needs to be externally synchronized.
 114  *
 115  */
 116
/* QoS parameter table; defined outside this file. */
extern const qos_policy_params_t thread_qos_policy_params;

/* for task holds without dropping the lock */
extern void task_hold_locked(task_t task);
extern void task_release_locked(task_t task);
extern void task_wait_locked(task_t task, boolean_t until_not_runnable);

extern void thread_recompute_qos(thread_t thread);

/*
 * Task policy related helper functions
 *
 * These are file-local (static).  The "_locked" suffix follows the locking
 * rule described in the header comment of this file: policy changes are made
 * while holding the task lock.
 * NOTE(review): confirm the exact lock expectations at each definition.
 */
static void proc_set_task_policy_locked(task_t task, thread_t thread, int category, int flavor, int value);
static void proc_set_task_policy2_locked(task_t task, thread_t thread, int category, int flavor, int value1, int value2);

/* Effective-policy recalculation and propagation */
static void task_policy_update_locked(task_t task, thread_t thread, task_pend_token_t pend_token);
static void task_policy_update_internal_locked(task_t task, thread_t thread, boolean_t in_create, task_pend_token_t pend_token);
static void task_policy_update_task_locked(task_t task, boolean_t update_throttle, boolean_t update_bg_throttle, boolean_t update_sfi);
static void task_policy_update_thread_locked(thread_t thread, int update_cpu, boolean_t update_throttle, boolean_t update_sfi, boolean_t update_qos);
static boolean_t task_policy_update_coalition_focal_tasks(task_t task, int prev_role, int next_role);

static int proc_get_effective_policy(task_t task, thread_t thread, int policy);

/* Conversions between an I/O policy value and a (tier, passive) pair */
static void proc_iopol_to_tier(int iopolicy, int *tier, int *passive);
static int proc_tier_to_iopol(int tier, int passive);

/*
 * Policy-state dumpers.  The trequested_ and teffective_ helpers feed the
 * KERNEL_DEBUG trace points in this file; the bitfield helpers fill the
 * TASK_POLICY_STATE reply in task_policy_get().
 */
static uintptr_t trequested_0(task_t task, thread_t thread);
static uintptr_t trequested_1(task_t task, thread_t thread);
static uintptr_t teffective_0(task_t task, thread_t thread);
static uintptr_t teffective_1(task_t task, thread_t thread);
static uint32_t tpending(task_pend_token_t pend_token);
static uint64_t task_requested_bitfield(task_t task, thread_t thread);
static uint64_t task_effective_bitfield(task_t task, thread_t thread);

void proc_get_thread_policy(thread_t thread, thread_policy_state_t info);

/* CPU Limits related helper functions */
static int task_get_cpuusage(task_t task, uint8_t *percentagep, uint64_t *intervalp, uint64_t *deadlinep, int *scope);
int task_set_cpuusage(task_t task, uint8_t percentage, uint64_t interval, uint64_t deadline, int scope, int entitled);
static int task_clear_cpuusage_locked(task_t task, int cpumon_entitled);
int task_disable_cpumon(task_t task);
static int task_apply_resource_actions(task_t task, int type);
void task_action_cpuusage(thread_call_param_t param0, thread_call_param_t param1);
void proc_init_cpumon_params(void);

/* BSD-side entry points; only declared when the kernel is built with MACH_BSD */
#ifdef MACH_BSD
int             proc_pid(void *proc);
extern int      proc_selfpid(void);
extern char *   proc_name_address(void *p);
extern void     rethrottle_thread(void * uthread);
extern void     proc_apply_task_networkbg(void * bsd_info, thread_t thread);
#endif /* MACH_BSD */

/* Thread QoS override bookkeeping */
extern zone_t thread_qos_override_zone;
static boolean_t _proc_thread_qos_remove_override_internal(task_t task, thread_t thread, uint64_t tid, user_addr_t resource, int resource_type, boolean_t reset);
 170
 171
/* Importance Inheritance related helper functions */

#if IMPORTANCE_INHERITANCE

static void task_add_importance_watchport(task_t task, mach_port_t port, int *boostp);
static void task_importance_update_live_donor(task_t target_task);

#endif /* IMPORTANCE_INHERITANCE */

/*
 * Annotation that expands to nothing when IMPORTANCE_DEBUG is enabled and to
 * __unused otherwise.
 * NOTE(review): presumably applied to variables that are only read by
 * debug-only code, to silence unused warnings in non-debug builds; confirm at use sites.
 */
#if IMPORTANCE_DEBUG
#define __impdebug_only
#else
#define __impdebug_only __unused
#endif

/* Same idea, keyed on IMPORTANCE_INHERITANCE instead of IMPORTANCE_DEBUG */
#if IMPORTANCE_INHERITANCE
#define __imp_only
#else
#define __imp_only __unused
#endif

/* Flag values telling a callee whether the task lock is already held (assumed from the names; TODO confirm) */
#define TASK_LOCKED   1
#define TASK_UNLOCKED 0

/* Values for the update_cpu argument of task_policy_update_thread_locked() (assumed from the names; TODO confirm) */
#define DO_LOWPRI_CPU   1
#define UNDO_LOWPRI_CPU 2
 198
 199 /* Macros for making tracing simpler */
 200
 201 #define tpriority(task, thread)  ((uintptr_t)(thread == THREAD_NULL ? (task->priority)  : (thread->priority)))
 202 #define tisthread(thread) (thread == THREAD_NULL ? TASK_POLICY_TASK  : TASK_POLICY_THREAD)
 203 #define targetid(task, thread)   ((uintptr_t)(thread == THREAD_NULL ? (audit_token_pid_from_task(task)) : (thread->thread_id)))
 204
/*
 * Default parameters for certain policies
 *
 * The non-const values are plain globals.
 * NOTE(review): presumably tunable at boot or run time; confirm where they are written.
 */

/* Disk throttle tiers; all three start at THROTTLE_LEVEL_TIER1 */
int proc_standard_daemon_tier = THROTTLE_LEVEL_TIER1;
int proc_suppressed_disk_tier = THROTTLE_LEVEL_TIER1;
int proc_tal_disk_tier        = THROTTLE_LEVEL_TIER1;

/* Low byte of LATENCY_QOS_TIER_0, i.e. the same reduction qos_extract() performs */
int proc_graphics_timer_qos   = (LATENCY_QOS_TIER_0 & 0xFF);

const int proc_default_bg_iotier  = THROTTLE_LEVEL_TIER2;

/* Latency/throughput QoS fields remain zeroed, i.e. TIER_UNSPECIFIED at creation */
const struct task_requested_policy default_task_requested_policy = {
        /* NOTE(review): a const int is not a constant expression in ISO C; this initializer relies on a compiler extension */
        .bg_iotier = proc_default_bg_iotier
};
/* Effective and pended policy start out all-zero */
const struct task_effective_policy default_task_effective_policy = {};
const struct task_pended_policy default_task_pended_policy = {};
 223
/*
 * Default parameters for CPU usage monitor.
 *
 * Default setting is 50% over 3 minutes.
 */
#define         DEFAULT_CPUMON_PERCENTAGE 50
#define         DEFAULT_CPUMON_INTERVAL   (3 * 60)      /* 3 minutes expressed as 180; unit presumably seconds, TODO confirm */

/*
 * Zero-initialized globals.
 * NOTE(review): presumably filled in by proc_init_cpumon_params(); confirm.
 */
uint8_t         proc_max_cpumon_percentage;
uint64_t        proc_max_cpumon_interval;
 234
 235 kern_return_t
 236 qos_latency_policy_validate(task_latency_qos_t ltier) {
 237         if ((ltier != LATENCY_QOS_TIER_UNSPECIFIED) &&
 238             ((ltier > LATENCY_QOS_TIER_5) || (ltier < LATENCY_QOS_TIER_0)))
 239                 return KERN_INVALID_ARGUMENT;
 240
 241         return KERN_SUCCESS;
 242 }
 243
 244 kern_return_t
 245 qos_throughput_policy_validate(task_throughput_qos_t ttier) {
 246         if ((ttier != THROUGHPUT_QOS_TIER_UNSPECIFIED) &&
 247             ((ttier > THROUGHPUT_QOS_TIER_5) || (ttier < THROUGHPUT_QOS_TIER_0)))
 248                 return KERN_INVALID_ARGUMENT;
 249
 250         return KERN_SUCCESS;
 251 }
 252
 253 static kern_return_t
 254 task_qos_policy_validate(task_qos_policy_t qosinfo, mach_msg_type_number_t count) {
 255         if (count < TASK_QOS_POLICY_COUNT)
 256                 return KERN_INVALID_ARGUMENT;
 257
 258         task_latency_qos_t ltier = qosinfo->task_latency_qos_tier;
 259         task_throughput_qos_t ttier = qosinfo->task_throughput_qos_tier;
 260
 261         kern_return_t kr = qos_latency_policy_validate(ltier);
 262
 263         if (kr != KERN_SUCCESS)
 264                 return kr;
 265
 266         kr = qos_throughput_policy_validate(ttier);
 267
 268         return kr;
 269 }
 270
 271 uint32_t
 272 qos_extract(uint32_t qv) {
 273         return (qv & 0xFF);
 274 }
 275
 276 uint32_t
 277 qos_latency_policy_package(uint32_t qv) {
 278         return (qv == LATENCY_QOS_TIER_UNSPECIFIED) ? LATENCY_QOS_TIER_UNSPECIFIED : ((0xFF << 16) | qv);
 279 }
 280
 281 uint32_t
 282 qos_throughput_policy_package(uint32_t qv) {
 283         return (qv == THROUGHPUT_QOS_TIER_UNSPECIFIED) ? THROUGHPUT_QOS_TIER_UNSPECIFIED : ((0xFE << 16) | qv);
 284 }
 285
/*
 * TEMPORARY boot-arg controlling task_policy suppression (App Nap).
 *
 * When TRUE, task_policy_set() answers a TASK_SUPPRESSION_POLICY request
 * whose 'active' field is set with KERN_SUCCESS without applying it.
 * NOTE(review): the code that parses the boot-arg and sets this flag is not
 * visible here; confirm where it is written.
 */
static boolean_t task_policy_suppression_disable = FALSE;
 288
 289 kern_return_t
 290 task_policy_set(
 291         task_t                                  task,
 292         task_policy_flavor_t    flavor,
 293         task_policy_t                   policy_info,
 294         mach_msg_type_number_t  count)
 295 {
 296         kern_return_t           result = KERN_SUCCESS;
 297
 298         if (task == TASK_NULL || task == kernel_task)
 299                 return (KERN_INVALID_ARGUMENT);
 300
 301         switch (flavor) {
 302
 303         case TASK_CATEGORY_POLICY: {
 304                 task_category_policy_t info = (task_category_policy_t)policy_info;
 305
 306                 if (count < TASK_CATEGORY_POLICY_COUNT)
 307                         return (KERN_INVALID_ARGUMENT);
 308
 309
 310                 switch(info->role) {
 311                         case TASK_FOREGROUND_APPLICATION:
 312                         case TASK_BACKGROUND_APPLICATION:
 313                         case TASK_DEFAULT_APPLICATION:
 314                                 proc_set_task_policy(task, THREAD_NULL,
 315                                                      TASK_POLICY_ATTRIBUTE, TASK_POLICY_ROLE,
 316                                                      info->role);
 317                                 break;
 318
 319                         case TASK_CONTROL_APPLICATION:
 320                                 if (task != current_task() || task->sec_token.val[0] != 0)
 321                                         result = KERN_INVALID_ARGUMENT;
 322                                 else
 323                                         proc_set_task_policy(task, THREAD_NULL,
 324                                                              TASK_POLICY_ATTRIBUTE, TASK_POLICY_ROLE,
 325                                                              info->role);
 326                                 break;
 327
 328                         case TASK_GRAPHICS_SERVER:
 329                                 /* TODO: Restrict this role to FCFS <rdar://problem/12552788> */
 330                                 if (task != current_task() || task->sec_token.val[0] != 0)
 331                                         result = KERN_INVALID_ARGUMENT;
 332                                 else
 333                                         proc_set_task_policy(task, THREAD_NULL,
 334                                                              TASK_POLICY_ATTRIBUTE, TASK_POLICY_ROLE,
 335                                                              info->role);
 336                                 break;
 337                         default:
 338                                 result = KERN_INVALID_ARGUMENT;
 339                                 break;
 340                 } /* switch (info->role) */
 341
 342                 break;
 343         }
 344
 345 /* Desired energy-efficiency/performance "quality-of-service" */
 346         case TASK_BASE_QOS_POLICY:
 347         case TASK_OVERRIDE_QOS_POLICY:
 348         {
 349                 task_qos_policy_t qosinfo = (task_qos_policy_t)policy_info;
 350                 kern_return_t kr = task_qos_policy_validate(qosinfo, count);
 351
 352                 if (kr != KERN_SUCCESS)
 353                         return kr;
 354
 355
 356                 uint32_t lqos = qos_extract(qosinfo->task_latency_qos_tier);
 357                 uint32_t tqos = qos_extract(qosinfo->task_throughput_qos_tier);
 358
 359                 proc_set_task_policy2(task, THREAD_NULL, TASK_POLICY_ATTRIBUTE,
 360                                                           flavor == TASK_BASE_QOS_POLICY ? TASK_POLICY_BASE_LATENCY_AND_THROUGHPUT_QOS : TASK_POLICY_OVERRIDE_LATENCY_AND_THROUGHPUT_QOS,
 361                                                           lqos, tqos);
 362         }
 363         break;
 364
 365         case TASK_BASE_LATENCY_QOS_POLICY:
 366         {
 367                 task_qos_policy_t qosinfo = (task_qos_policy_t)policy_info;
 368                 kern_return_t kr = task_qos_policy_validate(qosinfo, count);
 369
 370                 if (kr != KERN_SUCCESS)
 371                         return kr;
 372
 373                 uint32_t lqos = qos_extract(qosinfo->task_latency_qos_tier);
 374
 375                 proc_set_task_policy(task, NULL, TASK_POLICY_ATTRIBUTE, TASK_BASE_LATENCY_QOS_POLICY, lqos);
 376         }
 377         break;
 378
 379         case TASK_BASE_THROUGHPUT_QOS_POLICY:
 380         {
 381                 task_qos_policy_t qosinfo = (task_qos_policy_t)policy_info;
 382                 kern_return_t kr = task_qos_policy_validate(qosinfo, count);
 383
 384                 if (kr != KERN_SUCCESS)
 385                         return kr;
 386
 387                 uint32_t tqos = qos_extract(qosinfo->task_throughput_qos_tier);
 388
 389                 proc_set_task_policy(task, NULL, TASK_POLICY_ATTRIBUTE, TASK_BASE_THROUGHPUT_QOS_POLICY, tqos);
 390         }
 391         break;
 392
 393         case TASK_SUPPRESSION_POLICY:
 394         {
 395
 396                 task_suppression_policy_t info = (task_suppression_policy_t)policy_info;
 397
 398                 if (count < TASK_SUPPRESSION_POLICY_COUNT)
 399                         return (KERN_INVALID_ARGUMENT);
 400
 401                 struct task_qos_policy qosinfo;
 402
 403                 qosinfo.task_latency_qos_tier = info->timer_throttle;
 404                 qosinfo.task_throughput_qos_tier = info->throughput_qos;
 405
 406                 kern_return_t kr = task_qos_policy_validate(&qosinfo, TASK_QOS_POLICY_COUNT);
 407
 408                 if (kr != KERN_SUCCESS)
 409                         return kr;
 410
 411                 /* TEMPORARY disablement of task suppression */
 412                 if (task_policy_suppression_disable && info->active)
 413                         return KERN_SUCCESS;
 414
 415                 struct task_pend_token pend_token = {};
 416
 417                 task_lock(task);
 418
 419                 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
 420                                           (IMPORTANCE_CODE(IMP_TASK_SUPPRESSION, info->active)) | DBG_FUNC_START,
 421                                           proc_selfpid(), audit_token_pid_from_task(task), trequested_0(task, THREAD_NULL),
 422                                           trequested_1(task, THREAD_NULL), 0);
 423
 424                 task->requested_policy.t_sup_active      = (info->active)         ? 1 : 0;
 425                 task->requested_policy.t_sup_lowpri_cpu  = (info->lowpri_cpu)     ? 1 : 0;
 426                 task->requested_policy.t_sup_timer       = qos_extract(info->timer_throttle);
 427                 task->requested_policy.t_sup_disk        = (info->disk_throttle)  ? 1 : 0;
 428                 task->requested_policy.t_sup_cpu_limit   = (info->cpu_limit)      ? 1 : 0;
 429                 task->requested_policy.t_sup_suspend     = (info->suspend)        ? 1 : 0;
 430                 task->requested_policy.t_sup_throughput  = qos_extract(info->throughput_qos);
 431                 task->requested_policy.t_sup_cpu         = (info->suppressed_cpu) ? 1 : 0;
 432                 task->requested_policy.t_sup_bg_sockets  = (info->background_sockets) ? 1 : 0;
 433
 434                 task_policy_update_locked(task, THREAD_NULL, &pend_token);
 435
 436                 task_unlock(task);
 437
 438                 task_policy_update_complete_unlocked(task, THREAD_NULL, &pend_token);
 439
 440                 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
 441                                           (IMPORTANCE_CODE(IMP_TASK_SUPPRESSION, info->active)) | DBG_FUNC_END,
 442                                           proc_selfpid(), audit_token_pid_from_task(task), trequested_0(task, THREAD_NULL),
 443                                           trequested_1(task, THREAD_NULL), 0);
 444
 445                 break;
 446
 447         }
 448
 449         default:
 450                 result = KERN_INVALID_ARGUMENT;
 451                 break;
 452         }
 453
 454         return (result);
 455 }
 456
 457 /* Sets BSD 'nice' value on the task */
 458 kern_return_t
 459 task_importance(
 460         task_t                          task,
 461         integer_t                       importance)
 462 {
 463         if (task == TASK_NULL || task == kernel_task)
 464                 return (KERN_INVALID_ARGUMENT);
 465
 466         task_lock(task);
 467
 468         if (!task->active) {
 469                 task_unlock(task);
 470
 471                 return (KERN_TERMINATED);
 472         }
 473
 474         if (proc_get_effective_task_policy(task, TASK_POLICY_ROLE) >= TASK_CONTROL_APPLICATION) {
 475                 task_unlock(task);
 476
 477                 return (KERN_INVALID_ARGUMENT);
 478         }
 479
 480         task->importance = importance;
 481
 482         /* TODO: tracepoint? */
 483
 484         /* Redrive only the task priority calculation */
 485         task_policy_update_task_locked(task, FALSE, FALSE, FALSE);
 486
 487         task_unlock(task);
 488
 489         return (KERN_SUCCESS);
 490 }
 491
 492 kern_return_t
 493 task_policy_get(
 494         task_t                                  task,
 495         task_policy_flavor_t    flavor,
 496         task_policy_t                   policy_info,
 497         mach_msg_type_number_t  *count,
 498         boolean_t                               *get_default)
 499 {
 500         if (task == TASK_NULL || task == kernel_task)
 501                 return (KERN_INVALID_ARGUMENT);
 502
 503         switch (flavor) {
 504
 505         case TASK_CATEGORY_POLICY:
 506         {
 507                 task_category_policy_t          info = (task_category_policy_t)policy_info;
 508
 509                 if (*count < TASK_CATEGORY_POLICY_COUNT)
 510                         return (KERN_INVALID_ARGUMENT);
 511
 512                 if (*get_default)
 513                         info->role = TASK_UNSPECIFIED;
 514                 else
 515                         info->role = proc_get_task_policy(task, THREAD_NULL, TASK_POLICY_ATTRIBUTE, TASK_POLICY_ROLE);
 516                 break;
 517         }
 518
 519         case TASK_BASE_QOS_POLICY: /* FALLTHRU */
 520         case TASK_OVERRIDE_QOS_POLICY:
 521         {
 522                 task_qos_policy_t info = (task_qos_policy_t)policy_info;
 523
 524                 if (*count < TASK_QOS_POLICY_COUNT)
 525                         return (KERN_INVALID_ARGUMENT);
 526
 527                 if (*get_default) {
 528                         info->task_latency_qos_tier = LATENCY_QOS_TIER_UNSPECIFIED;
 529                         info->task_throughput_qos_tier = THROUGHPUT_QOS_TIER_UNSPECIFIED;
 530                 } else if (flavor == TASK_BASE_QOS_POLICY) {
 531                         int value1, value2;
 532
 533                         proc_get_task_policy2(task, THREAD_NULL, TASK_POLICY_ATTRIBUTE, TASK_POLICY_BASE_LATENCY_AND_THROUGHPUT_QOS, &value1, &value2);
 534
 535                         info->task_latency_qos_tier = qos_latency_policy_package(value1);
 536                         info->task_throughput_qos_tier = qos_throughput_policy_package(value2);
 537
 538                 } else if (flavor == TASK_OVERRIDE_QOS_POLICY) {
 539                         int value1, value2;
 540
 541                         proc_get_task_policy2(task, THREAD_NULL, TASK_POLICY_ATTRIBUTE, TASK_POLICY_OVERRIDE_LATENCY_AND_THROUGHPUT_QOS, &value1, &value2);
 542
 543                         info->task_latency_qos_tier = qos_latency_policy_package(value1);
 544                         info->task_throughput_qos_tier = qos_throughput_policy_package(value2);
 545                 }
 546
 547                 break;
 548         }
 549
 550         case TASK_POLICY_STATE:
 551         {
 552                 task_policy_state_t info = (task_policy_state_t)policy_info;
 553
 554                 if (*count < TASK_POLICY_STATE_COUNT)
 555                         return (KERN_INVALID_ARGUMENT);
 556
 557                 /* Only root can get this info */
 558                 if (current_task()->sec_token.val[0] != 0)
 559                         return KERN_PROTECTION_FAILURE;
 560
 561                 if (*get_default) {
 562                         info->requested = 0;
 563                         info->effective = 0;
 564                         info->pending = 0;
 565                         info->imp_assertcnt = 0;
 566                         info->imp_externcnt = 0;
 567                         info->flags = 0;
 568                         info->imp_transitions = 0;
 569                 } else {
 570                         task_lock(task);
 571
 572                         info->requested = task_requested_bitfield(task, THREAD_NULL);
 573                         info->effective = task_effective_bitfield(task, THREAD_NULL);
 574                         info->pending   = 0;
 575
 576                         info->flags = 0;
 577                         if (task->task_imp_base != NULL) {
 578                                 info->imp_assertcnt = task->task_imp_base->iit_assertcnt;
 579                                 info->imp_externcnt = IIT_EXTERN(task->task_imp_base);
 580                                 info->flags |= (task_is_marked_importance_receiver(task) ? TASK_IMP_RECEIVER : 0);
 581                                 info->flags |= (task_is_marked_importance_denap_receiver(task) ? TASK_DENAP_RECEIVER : 0);
 582                                 info->flags |= (task_is_marked_importance_donor(task) ? TASK_IMP_DONOR : 0);
 583                                 info->flags |= (task_is_marked_live_importance_donor(task) ? TASK_IMP_LIVE_DONOR : 0);
 584                                 info->imp_transitions = task->task_imp_base->iit_transitions;
 585                         } else {
 586                                 info->imp_assertcnt = 0;
 587                                 info->imp_externcnt = 0;
 588                                 info->imp_transitions = 0;
 589                         }
 590                         task_unlock(task);
 591                 }
 592
 593                 info->reserved[0] = 0;
 594                 info->reserved[1] = 0;
 595
 596                 break;
 597         }
 598
 599         case TASK_SUPPRESSION_POLICY:
 600         {
 601                 task_suppression_policy_t info = (task_suppression_policy_t)policy_info;
 602
 603                 if (*count < TASK_SUPPRESSION_POLICY_COUNT)
 604                         return (KERN_INVALID_ARGUMENT);
 605
 606                 task_lock(task);
 607
 608                 if (*get_default) {
 609                         info->active            = 0;
 610                         info->lowpri_cpu        = 0;
 611                         info->timer_throttle    = LATENCY_QOS_TIER_UNSPECIFIED;
 612                         info->disk_throttle     = 0;
 613                         info->cpu_limit         = 0;
 614                         info->suspend           = 0;
 615                         info->throughput_qos    = 0;
 616                         info->suppressed_cpu    = 0;
 617                 } else {
 618                         info->active            = task->requested_policy.t_sup_active;
 619                         info->lowpri_cpu        = task->requested_policy.t_sup_lowpri_cpu;
 620                         info->timer_throttle    = qos_latency_policy_package(task->requested_policy.t_sup_timer);
 621                         info->disk_throttle     = task->requested_policy.t_sup_disk;
 622                         info->cpu_limit         = task->requested_policy.t_sup_cpu_limit;
 623                         info->suspend           = task->requested_policy.t_sup_suspend;
 624                         info->throughput_qos    = qos_throughput_policy_package(task->requested_policy.t_sup_throughput);
 625                         info->suppressed_cpu    = task->requested_policy.t_sup_cpu;
 626                         info->background_sockets = task->requested_policy.t_sup_bg_sockets;
 627                 }
 628
 629                 task_unlock(task);
 630                 break;
 631         }
 632
 633         default:
 634                 return (KERN_INVALID_ARGUMENT);
 635         }
 636
 637         return (KERN_SUCCESS);
 638 }
 639
 640 /*
 641  * Called at task creation
 642  * We calculate the correct effective but don't apply it to anything yet.
 643  * The threads, etc will inherit from the task as they get created.
 644  */
 645 void
 646 task_policy_create(task_t task, int parent_boosted)
 647 {
 648         if (task->requested_policy.t_apptype == TASK_APPTYPE_DAEMON_ADAPTIVE) {
 649                 if (parent_boosted) {
 650                         task->requested_policy.t_apptype = TASK_APPTYPE_DAEMON_INTERACTIVE;
 651                         task_importance_mark_donor(task, TRUE);
 652                 } else {
 653                         task->requested_policy.t_apptype = TASK_APPTYPE_DAEMON_BACKGROUND;
 654                         task_importance_mark_receiver(task, FALSE);
 655                 }
 656         }
 657
 658         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
 659                                   (IMPORTANCE_CODE(IMP_UPDATE, (IMP_UPDATE_TASK_CREATE | TASK_POLICY_TASK))) | DBG_FUNC_START,
 660                                   audit_token_pid_from_task(task), teffective_0(task, THREAD_NULL),
 661                                   teffective_1(task, THREAD_NULL), tpriority(task, THREAD_NULL), 0);
 662
 663         task_policy_update_internal_locked(task, THREAD_NULL, TRUE, NULL);
 664
 665         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
 666                                   (IMPORTANCE_CODE(IMP_UPDATE, (IMP_UPDATE_TASK_CREATE | TASK_POLICY_TASK))) | DBG_FUNC_END,
 667                                   audit_token_pid_from_task(task), teffective_0(task, THREAD_NULL),
 668                                   teffective_1(task, THREAD_NULL), tpriority(task, THREAD_NULL), 0);
 669
 670         task_importance_update_live_donor(task);
 671         task_policy_update_task_locked(task, FALSE, FALSE, FALSE);
 672 }
 673
 674 void
 675 thread_policy_create(thread_t thread)
 676 {
 677         task_t task = thread->task;
 678
 679         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
 680                                   (IMPORTANCE_CODE(IMP_UPDATE, (IMP_UPDATE_TASK_CREATE | TASK_POLICY_THREAD))) | DBG_FUNC_START,
 681                                   targetid(task, thread), teffective_0(task, thread),
 682                                   teffective_1(task, thread), tpriority(task, thread), 0);
 683
 684         task_policy_update_internal_locked(task, thread, TRUE, NULL);
 685
 686         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
 687                                   (IMPORTANCE_CODE(IMP_UPDATE, (IMP_UPDATE_TASK_CREATE | TASK_POLICY_THREAD))) | DBG_FUNC_END,
 688                                   targetid(task, thread), teffective_0(task, thread),
 689                                   teffective_1(task, thread), tpriority(task, thread), 0);
 690 }
 691
 692 static void
 693 task_policy_update_locked(task_t task, thread_t thread, task_pend_token_t pend_token)
 694 {
 695         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
 696                                   (IMPORTANCE_CODE(IMP_UPDATE, tisthread(thread)) | DBG_FUNC_START),
 697                                   targetid(task, thread), teffective_0(task, thread),
 698                                   teffective_1(task, thread), tpriority(task, thread), 0);
 699
 700         task_policy_update_internal_locked(task, thread, FALSE, pend_token);
 701
 702         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
 703                                   (IMPORTANCE_CODE(IMP_UPDATE, tisthread(thread))) | DBG_FUNC_END,
 704                                   targetid(task, thread), teffective_0(task, thread),
 705                                   teffective_1(task, thread), tpriority(task, thread), 0);
 706 }
 707
 708 /*
 709  * One state update function TO RULE THEM ALL
 710  *
 711  * This function updates the task or thread effective policy fields
 712  * and pushes the results to the relevant subsystems.
 713  *
 714  * Must call update_complete after unlocking the task,
 715  * as some subsystems cannot be updated while holding the task lock.
 716  *
 717  * Called with task locked, not thread
 718  */
 719
 720 static void
 721 task_policy_update_internal_locked(task_t task, thread_t thread, boolean_t in_create, task_pend_token_t pend_token)
 722 {
 723         boolean_t on_task = (thread == THREAD_NULL) ? TRUE : FALSE;
 724
 725         /*
 726          * Step 1:
 727          *  Gather requested policy
 728          */
 729
 730         struct task_requested_policy requested =
 731                 (on_task) ? task->requested_policy : thread->requested_policy;
 732
 733
 734         /*
 735          * Step 2:
 736          *  Calculate new effective policies from requested policy and task state
 737          *  Rules:
 738          *      If in an 'on_task' block, must only look at and set fields starting with t_
 739          *      If operating on a task, don't touch anything starting with th_
 740          *      If operating on a thread, don't touch anything starting with t_
 741          *      Don't change requested, it won't take effect
 742          */
 743
 744         struct task_effective_policy next = {};
 745         struct task_effective_policy task_effective;
 746
 747         /* Calculate QoS policies */
 748
 749         if (on_task) {
 750                 /* Update task role */
 751                 next.t_role = requested.t_role;
 752
 753                 /* Set task qos clamp and ceiling */
 754                 next.t_qos_clamp = requested.t_qos_clamp;
 755
 756                 if (requested.t_apptype == TASK_APPTYPE_APP_DEFAULT ||
 757                     requested.t_apptype == TASK_APPTYPE_APP_TAL) {
 758
 759                         switch (next.t_role) {
 760                                 case TASK_FOREGROUND_APPLICATION:
 761                                         /* Foreground apps get urgent scheduler priority */
 762                                         next.qos_ui_is_urgent = 1;
 763                                         next.t_qos_ceiling = THREAD_QOS_UNSPECIFIED;
 764                                         break;
 765
 766                                 case TASK_BACKGROUND_APPLICATION:
 767                                         /* This is really 'non-focal but on-screen' */
 768                                         next.t_qos_ceiling = THREAD_QOS_UNSPECIFIED;
 769                                         break;
 770
 771                                 case TASK_DEFAULT_APPLICATION:
 772                                         /* This is 'may render UI but we don't know if it's focal/nonfocal' */
 773                                         next.t_qos_ceiling = THREAD_QOS_UNSPECIFIED;
 774                                         break;
 775
 776                                 case TASK_NONUI_APPLICATION:
 777                                         /* i.e. 'off-screen' */
 778                                         next.t_qos_ceiling = THREAD_QOS_LEGACY;
 779                                         break;
 780
 781                                 case TASK_CONTROL_APPLICATION:
 782                                 case TASK_GRAPHICS_SERVER:
 783                                         next.qos_ui_is_urgent = 1;
 784                                         next.t_qos_ceiling = THREAD_QOS_UNSPECIFIED;
 785                                         break;
 786
 787                                 case TASK_UNSPECIFIED:
 788                                 default:
 789                                         /* Apps that don't have an application role get
 790                                          * USER_INTERACTIVE and USER_INITIATED squashed to LEGACY */
 791                                         next.t_qos_ceiling = THREAD_QOS_LEGACY;
 792                                         break;
 793                         }
 794                 } else {
 795                         /* Daemons get USER_INTERACTIVE squashed to USER_INITIATED */
 796                         next.t_qos_ceiling = THREAD_QOS_USER_INITIATED;
 797                 }
 798         } else {
 799                 /*
 800                  * Set thread qos tier
 801                  * Note that an override only overrides the QoS field, not other policy settings.
 802                  * A thread must already be participating in QoS for override to take effect
 803                  */
 804
 805                 /* Snapshot the task's effective policy */
 806                 task_effective = task->effective_policy;
 807
 808                 next.qos_ui_is_urgent = task_effective.qos_ui_is_urgent;
 809
 810                 if ((requested.thrp_qos_override != THREAD_QOS_UNSPECIFIED) && (requested.thrp_qos != THREAD_QOS_UNSPECIFIED))
 811                         next.thep_qos = MAX(requested.thrp_qos_override, requested.thrp_qos);
 812                 else
 813                         next.thep_qos = requested.thrp_qos;
 814
 815                 /* A task clamp will result in an effective QoS even when requested is UNSPECIFIED */
 816                 if (task_effective.t_qos_clamp != THREAD_QOS_UNSPECIFIED) {
 817                         if (next.thep_qos != THREAD_QOS_UNSPECIFIED)
 818                                 next.thep_qos = MIN(task_effective.t_qos_clamp, next.thep_qos);
 819                         else
 820                                 next.thep_qos = task_effective.t_qos_clamp;
 821                 }
 822
 823                 /* The ceiling only applies to threads that are in the QoS world */
 824                 if (task_effective.t_qos_ceiling != THREAD_QOS_UNSPECIFIED &&
 825                     next.thep_qos                != THREAD_QOS_UNSPECIFIED) {
 826                         next.thep_qos = MIN(task_effective.t_qos_ceiling, next.thep_qos);
 827                 }
 828
 829                 /*
 830                  * The QoS relative priority is only applicable when the original programmer's
 831                  * intended (requested) QoS is in effect. When the QoS is clamped (e.g.
 832                  * USER_INITIATED-13REL clamped to UTILITY), the relative priority is not honored,
 833                  * since otherwise it would be lower than unclamped threads. Similarly, in the
 834                  * presence of boosting, the programmer doesn't know what other actors
 835                  * are boosting the thread.
 836                  */
 837                 if ((requested.thrp_qos != THREAD_QOS_UNSPECIFIED) &&
 838                     (requested.thrp_qos == next.thep_qos) &&
 839                     (requested.thrp_qos_override == THREAD_QOS_UNSPECIFIED)) {
 840                         next.thep_qos_relprio = requested.thrp_qos_relprio;
 841                 } else {
 842                         next.thep_qos_relprio = 0;
 843                 }
 844         }
 845
 846         /* Calculate DARWIN_BG */
 847         boolean_t wants_darwinbg        = FALSE;
 848         boolean_t wants_all_sockets_bg  = FALSE; /* Do I want my existing sockets to be bg */
 849         boolean_t wants_watchersbg      = FALSE; /* Do I want my pidbound threads to be bg */
 850         boolean_t wants_tal             = FALSE; /* Do I want the effects of TAL mode */
 851
 852         /*
 853          * If DARWIN_BG has been requested at either level, it's engaged.
 854          * Only true DARWIN_BG changes cause watchers to transition.
 855          *
 856          * Backgrounding due to apptype does.
 857          */
 858         if (requested.int_darwinbg || requested.ext_darwinbg)
 859                 wants_watchersbg = wants_all_sockets_bg = wants_darwinbg = TRUE;
 860
 861         if (on_task) {
 862                 /* Background TAL apps are throttled when TAL is enabled */
 863                 if (requested.t_apptype      == TASK_APPTYPE_APP_TAL &&
 864                     requested.t_role         == TASK_BACKGROUND_APPLICATION &&
 865                     requested.t_tal_enabled  == 1) {
 866                         wants_tal = TRUE;
 867                         next.t_tal_engaged = 1;
 868                 }
 869
 870                 /* Adaptive daemons are DARWIN_BG unless boosted, and don't get network throttled. */
 871                 if (requested.t_apptype == TASK_APPTYPE_DAEMON_ADAPTIVE &&
 872                     requested.t_boosted == 0)
 873                         wants_darwinbg = TRUE;
 874
 875                 /* Background daemons are always DARWIN_BG, no exceptions, and don't get network throttled. */
 876                 if (requested.t_apptype == TASK_APPTYPE_DAEMON_BACKGROUND)
 877                         wants_darwinbg = TRUE;
 878
 879                 if (next.t_qos_clamp == THREAD_QOS_BACKGROUND || next.t_qos_clamp == THREAD_QOS_MAINTENANCE)
 880                         wants_darwinbg = TRUE;
 881         } else {
 882                 if (requested.th_pidbind_bg)
 883                         wants_all_sockets_bg = wants_darwinbg = TRUE;
 884
 885                 if (requested.th_workq_bg)
 886                         wants_darwinbg = TRUE;
 887
 888                 if (next.thep_qos == THREAD_QOS_BACKGROUND || next.thep_qos == THREAD_QOS_MAINTENANCE)
 889                         wants_darwinbg = TRUE;
 890         }
 891
 892         /* Calculate side effects of DARWIN_BG */
 893
 894         if (wants_darwinbg) {
 895                 next.darwinbg = 1;
 896                 /* darwinbg threads/tasks always create bg sockets, but we don't always loop over all sockets */
 897                 next.new_sockets_bg = 1;
 898                 next.lowpri_cpu = 1;
 899         }
 900
 901         if (wants_all_sockets_bg)
 902                 next.all_sockets_bg = 1;
 903
 904         if (on_task && wants_watchersbg)
 905                 next.t_watchers_bg = 1;
 906
 907         /* darwinbg on either task or thread implies background QOS (or lower) */
 908         if (!on_task &&
 909                 (wants_darwinbg || task_effective.darwinbg) &&
 910                 (next.thep_qos > THREAD_QOS_BACKGROUND || next.thep_qos == THREAD_QOS_UNSPECIFIED)){
 911                 next.thep_qos = THREAD_QOS_BACKGROUND;
 912                 next.thep_qos_relprio = 0;
 913         }
 914
 915         /* Calculate low CPU priority */
 916
 917         boolean_t wants_lowpri_cpu = FALSE;
 918
 919         if (wants_darwinbg || wants_tal)
 920                 wants_lowpri_cpu = TRUE;
 921
 922         if (on_task && requested.t_sup_lowpri_cpu && requested.t_boosted == 0)
 923                 wants_lowpri_cpu = TRUE;
 924
 925         if (wants_lowpri_cpu)
 926                 next.lowpri_cpu = 1;
 927
 928         /* Calculate IO policy */
 929
 930         /* Update BG IO policy (so we can see if it has changed) */
 931         next.bg_iotier = requested.bg_iotier;
 932
 933         int iopol = THROTTLE_LEVEL_TIER0;
 934
 935         if (wants_darwinbg)
 936                 iopol = MAX(iopol, requested.bg_iotier);
 937
 938         if (on_task) {
 939                 if (requested.t_apptype == TASK_APPTYPE_DAEMON_STANDARD)
 940                         iopol = MAX(iopol, proc_standard_daemon_tier);
 941
 942                 if (requested.t_sup_disk && requested.t_boosted == 0)
 943                         iopol = MAX(iopol, proc_suppressed_disk_tier);
 944
 945                 if (wants_tal)
 946                         iopol = MAX(iopol, proc_tal_disk_tier);
 947
 948                 if (next.t_qos_clamp != THREAD_QOS_UNSPECIFIED)
 949                         iopol = MAX(iopol, thread_qos_policy_params.qos_iotier[next.t_qos_clamp]);
 950
 951         } else {
 952                 /* Look up the associated IO tier value for the QoS class */
 953                 iopol = MAX(iopol, thread_qos_policy_params.qos_iotier[next.thep_qos]);
 954         }
 955
 956         iopol = MAX(iopol, requested.int_iotier);
 957         iopol = MAX(iopol, requested.ext_iotier);
 958
 959         next.io_tier = iopol;
 960
 961         /* Calculate Passive IO policy */
 962
 963         if (requested.ext_iopassive || requested.int_iopassive)
 964                 next.io_passive = 1;
 965
 966         /* Calculate miscellaneous policy */
 967
 968         if (on_task) {
 969                 /* Calculate suppression-active flag */
 970                 if (requested.t_sup_active && requested.t_boosted == 0)
 971                         next.t_sup_active = 1;
 972
 973                 /* Calculate suspend policy */
 974                 if (requested.t_sup_suspend && requested.t_boosted == 0)
 975                         next.t_suspended = 1;
 976
 977                 /* Calculate timer QOS */
 978                 int latency_qos = requested.t_base_latency_qos;
 979
 980                 if (requested.t_sup_timer && requested.t_boosted == 0)
 981                         latency_qos = requested.t_sup_timer;
 982
 983                 if (next.t_qos_clamp != THREAD_QOS_UNSPECIFIED)
 984                         latency_qos = MAX(latency_qos, (int)thread_qos_policy_params.qos_latency_qos[next.t_qos_clamp]);
 985
 986                 if (requested.t_over_latency_qos != 0)
 987                         latency_qos = requested.t_over_latency_qos;
 988
 989                 /* Treat the windowserver special */
 990                 if (requested.t_role == TASK_GRAPHICS_SERVER)
 991                         latency_qos = proc_graphics_timer_qos;
 992
 993                 next.t_latency_qos = latency_qos;
 994
 995                 /* Calculate throughput QOS */
 996                 int through_qos = requested.t_base_through_qos;
 997
 998                 if (requested.t_sup_throughput && requested.t_boosted == 0)
 999                         through_qos = requested.t_sup_throughput;
1000
1001                 if (next.t_qos_clamp != THREAD_QOS_UNSPECIFIED)
1002                         through_qos = MAX(through_qos, (int)thread_qos_policy_params.qos_through_qos[next.t_qos_clamp]);
1003
1004                 if (requested.t_over_through_qos != 0)
1005                         through_qos = requested.t_over_through_qos;
1006
1007                 next.t_through_qos = through_qos;
1008
1009                 /* Calculate suppressed CPU priority */
1010                 if (requested.t_sup_cpu && requested.t_boosted == 0)
1011                         next.t_suppressed_cpu = 1;
1012
1013                 /*
1014                  * Calculate background sockets
1015                  * Don't take into account boosting to limit transition frequency.
1016                  */
1017                 if (requested.t_sup_bg_sockets){
1018                         next.all_sockets_bg = 1;
1019                         next.new_sockets_bg = 1;
1020                 }
1021
1022                 /* Apply SFI Managed class bit */
1023                 next.t_sfi_managed = requested.t_sfi_managed;
1024
1025                 /* Calculate 'live donor' status for live importance */
1026                 switch (requested.t_apptype) {
1027                         case TASK_APPTYPE_APP_TAL:
1028                         case TASK_APPTYPE_APP_DEFAULT:
1029                                 if (requested.ext_darwinbg == 0)
1030                                         next.t_live_donor = 1;
1031                                 else
1032                                         next.t_live_donor = 0;
1033                                 break;
1034
1035                         case TASK_APPTYPE_DAEMON_INTERACTIVE:
1036                         case TASK_APPTYPE_DAEMON_STANDARD:
1037                         case TASK_APPTYPE_DAEMON_ADAPTIVE:
1038                         case TASK_APPTYPE_DAEMON_BACKGROUND:
1039                         default:
1040                                 next.t_live_donor = 0;
1041                                 break;
1042                 }
1043         }
1044
1045         if (requested.terminated) {
1046                 /*
1047                  * Shoot down the throttles that slow down exit or response to SIGTERM
1048                  * We don't need to shoot down:
1049                  * passive        (don't want to cause others to throttle)
1050                  * all_sockets_bg (don't need to iterate FDs on every exit)
1051                  * new_sockets_bg (doesn't matter for exiting process)
1052                  * pidsuspend     (jetsam-ed BG process shouldn't run again)
1053                  * watchers_bg    (watcher threads don't need to be unthrottled)
1054                  * t_latency_qos  (affects userspace timers only)
1055                  */
1056
1057                 next.terminated         = 1;
1058                 next.darwinbg           = 0;
1059                 next.lowpri_cpu         = 0;
1060                 next.io_tier            = THROTTLE_LEVEL_TIER0;
1061                 if (on_task) {
1062                         next.t_tal_engaged = 0;
1063                         next.t_role = TASK_UNSPECIFIED;
1064                         next.t_suppressed_cpu = 0;
1065
1066                         /* TODO: This should only be shot down on SIGTERM, not exit */
1067                         next.t_suspended   = 0;
1068                 } else {
1069                         next.thep_qos = 0;
1070                 }
1071         }
1072
1073         /*
1074          * Step 3:
1075          *  Swap out old policy for new policy
1076          */
1077
1078         if (!on_task) {
1079                 /* Acquire thread mutex to synchronize against
1080                  * thread_policy_set(). Consider reworking to separate qos
1081                  * fields, or locking the task in thread_policy_set.
1082                  * A more efficient model would be to make the thread bits
1083                  * authoritative.
1084                  */
1085                 thread_mtx_lock(thread);
1086         }
1087
1088         struct task_effective_policy prev =
1089                 (on_task) ? task->effective_policy : thread->effective_policy;
1090
1091         /*
1092          * Check for invalid transitions here for easier debugging
1093          * TODO: dump the structs as hex in the panic string
1094          */
1095         if (task == kernel_task && prev.all_sockets_bg != next.all_sockets_bg)
1096                 panic("unexpected network change for kernel task");
1097
1098         /* This is the point where the new values become visible to other threads */
1099         if (on_task)
1100                 task->effective_policy = next;
1101         else {
1102                 /* Preserve thread specific latency/throughput QoS modified via
1103                  * thread_policy_set(). Inelegant in the extreme, to be reworked.
1104                  *
1105                  * If thread QoS class is set, we don't need to preserve the previously set values.
1106                  * We should ensure to not accidentally preserve previous thread QoS values if you set a thread
1107                  * back to default QoS.
1108                  */
1109                 uint32_t lqos = thread->effective_policy.t_latency_qos, tqos = thread->effective_policy.t_through_qos;
1110
1111                 if (prev.thep_qos == THREAD_QOS_UNSPECIFIED && next.thep_qos == THREAD_QOS_UNSPECIFIED) {
1112                         next.t_latency_qos = lqos;
1113                         next.t_through_qos = tqos;
1114                 } else if (prev.thep_qos != THREAD_QOS_UNSPECIFIED && next.thep_qos == THREAD_QOS_UNSPECIFIED) {
1115                         next.t_latency_qos = 0;
1116                         next.t_through_qos = 0;
1117                 } else {
1118                         next.t_latency_qos = thread_qos_policy_params.qos_latency_qos[next.thep_qos];
1119                         next.t_through_qos = thread_qos_policy_params.qos_through_qos[next.thep_qos];
1120                 }
1121
1122                 thread_update_qos_cpu_time(thread, TRUE);
1123                 thread->effective_policy = next;
1124                 thread_mtx_unlock(thread);
1125         }
1126
1127         /* Don't do anything further to a half-formed task or thread */
1128         if (in_create)
1129                 return;
1130
1131         /*
1132          * Step 4:
1133          *  Pend updates that can't be done while holding the task lock
1134          */
1135
1136         if (prev.all_sockets_bg != next.all_sockets_bg)
1137                 pend_token->tpt_update_sockets = 1;
1138
1139         if (on_task) {
1140                 /* Only re-scan the timer list if the qos level is getting less strong */
1141                 if (prev.t_latency_qos > next.t_latency_qos)
1142                         pend_token->tpt_update_timers = 1;
1143
1144
1145                 if (prev.t_live_donor != next.t_live_donor)
1146                         pend_token->tpt_update_live_donor = 1;
1147         }
1148
1149         /*
1150          * Step 5:
1151          *  Update other subsystems as necessary if something has changed
1152          */
1153
1154         boolean_t update_throttle = (prev.io_tier != next.io_tier) ? TRUE : FALSE;
1155
1156         if (on_task) {
1157                 if (prev.t_suspended == 0 && next.t_suspended == 1 && task->active) {
1158                         task_hold_locked(task);
1159                         task_wait_locked(task, FALSE);
1160                 }
1161                 if (prev.t_suspended == 1 && next.t_suspended == 0 && task->active) {
1162                         task_release_locked(task);
1163                 }
1164
1165                 boolean_t update_threads = FALSE;
1166                 boolean_t update_sfi = FALSE;
1167
1168                 if (prev.bg_iotier          != next.bg_iotier        ||
1169                     prev.terminated         != next.terminated       ||
1170                     prev.t_qos_clamp        != next.t_qos_clamp      ||
1171                     prev.t_qos_ceiling      != next.t_qos_ceiling    ||
1172                     prev.qos_ui_is_urgent   != next.qos_ui_is_urgent ||
1173                     prev.darwinbg           != next.darwinbg)
1174                         update_threads = TRUE;
1175
1176                 /*
1177                  * A bit of a layering violation. We know what task policy attributes
1178                  * sfi_thread_classify() consults, so if they change, trigger SFI
1179                  * re-evaluation.
1180                  */
1181                 if ((prev.t_latency_qos != next.t_latency_qos) ||
1182                         (prev.t_role != next.t_role) ||
1183                         (prev.darwinbg != next.darwinbg) ||
1184                         (prev.t_sfi_managed != next.t_sfi_managed))
1185                         update_sfi = TRUE;
1186
1187 /* TODO: if CONFIG_SFI */
1188                 if (prev.t_role != next.t_role && task_policy_update_coalition_focal_tasks(task, prev.t_role, next.t_role)) {
1189                         update_sfi = TRUE;
1190                         pend_token->tpt_update_coal_sfi = 1;
1191                 }
1192
1193                 task_policy_update_task_locked(task, update_throttle, update_threads, update_sfi);
1194         } else {
1195                 int update_cpu = 0;
1196                 boolean_t update_sfi = FALSE;
1197                 boolean_t update_qos = FALSE;
1198
1199                 if (prev.lowpri_cpu != next.lowpri_cpu)
1200                         update_cpu = (next.lowpri_cpu ? DO_LOWPRI_CPU : UNDO_LOWPRI_CPU);
1201
1202                 if (prev.darwinbg != next.darwinbg ||
1203                     prev.thep_qos != next.thep_qos)
1204                         update_sfi = TRUE;
1205
1206                 if (prev.thep_qos           != next.thep_qos          ||
1207                     prev.thep_qos_relprio   != next.thep_qos_relprio  ||
1208                     prev.qos_ui_is_urgent   != next.qos_ui_is_urgent) {
1209                         update_qos = TRUE;
1210                 }
1211
1212                 task_policy_update_thread_locked(thread, update_cpu, update_throttle, update_sfi, update_qos);
1213         }
1214 }
1215
1216 /*
1217  * Yet another layering violation. We reach out and bang on the coalition directly.
1218  */
1219 static boolean_t
1220 task_policy_update_coalition_focal_tasks(task_t     task,
1221                                          int        prev_role,
1222                                          int        next_role)
1223 {
1224         boolean_t sfi_transition = FALSE;
1225
1226         if (prev_role != TASK_FOREGROUND_APPLICATION && next_role == TASK_FOREGROUND_APPLICATION) {
1227                 if (coalition_adjust_focal_task_count(task->coalition, 1) == 1)
1228                         sfi_transition = TRUE;
1229         } else if (prev_role == TASK_FOREGROUND_APPLICATION && next_role != TASK_FOREGROUND_APPLICATION) {
1230                 if (coalition_adjust_focal_task_count(task->coalition, -1) == 0)
1231                         sfi_transition = TRUE;
1232         }
1233
1234         if (prev_role != TASK_BACKGROUND_APPLICATION && next_role == TASK_BACKGROUND_APPLICATION) {
1235                 if (coalition_adjust_non_focal_task_count(task->coalition, 1) == 1)
1236                         sfi_transition = TRUE;
1237         } else if (prev_role == TASK_BACKGROUND_APPLICATION && next_role != TASK_BACKGROUND_APPLICATION) {
1238                 if (coalition_adjust_non_focal_task_count(task->coalition, -1) == 0)
1239                         sfi_transition = TRUE;
1240         }
1241
1242         return sfi_transition;
1243 }
1244
1245 /* Despite the name, the thread's task is locked, the thread is not */
1246 void
1247 task_policy_update_thread_locked(thread_t thread,
1248                                  int update_cpu,
1249                                  boolean_t update_throttle,
1250                                  boolean_t update_sfi,
1251                                  boolean_t update_qos)
1252 {
1253         thread_precedence_policy_data_t policy;
1254
1255         if (update_throttle) {
1256                 rethrottle_thread(thread->uthread);
1257         }
1258
1259         if (update_sfi) {
1260                 sfi_reevaluate(thread);
1261         }
1262
1263         /*
1264          * TODO: pidbind needs to stuff remembered importance into saved_importance
1265          * properly deal with bg'ed threads being pidbound and unbging while pidbound
1266          *
1267          * TODO: A BG thread's priority is 0 on desktop and 4 on embedded.  Need to reconcile this.
1268          * */
1269         if (update_cpu == DO_LOWPRI_CPU) {
1270                 thread->saved_importance = thread->importance;
1271                 policy.importance = INT_MIN;
1272         } else if (update_cpu == UNDO_LOWPRI_CPU) {
1273                 policy.importance = thread->saved_importance;
1274                 thread->saved_importance = 0;
1275         }
1276
1277         /* Takes thread lock and thread mtx lock */
1278         if (update_cpu)
1279                 thread_policy_set_internal(thread, THREAD_PRECEDENCE_POLICY,
1280                                            (thread_policy_t)&policy,
1281                                            THREAD_PRECEDENCE_POLICY_COUNT);
1282
1283         if (update_qos)
1284                 thread_recompute_qos(thread);
1285 }
1286
1287 /*
1288  * Calculate priority on a task, loop through its threads, and tell them about
1289  * priority changes and throttle changes.
1290  */
1291 void
1292 task_policy_update_task_locked(task_t    task,
1293                                boolean_t update_throttle,
1294                                boolean_t update_threads,
1295                                boolean_t update_sfi)
1296 {
1297         boolean_t update_priority = FALSE;
1298
1299         if (task == kernel_task)
1300                 panic("Attempting to set task policy on kernel_task");
1301
1302         int priority     = BASEPRI_DEFAULT;
1303         int max_priority = MAXPRI_USER;
1304
1305         if (proc_get_effective_task_policy(task, TASK_POLICY_LOWPRI_CPU)) {
1306                 priority = MAXPRI_THROTTLE;
1307                 max_priority = MAXPRI_THROTTLE;
1308         } else if (proc_get_effective_task_policy(task, TASK_POLICY_SUPPRESSED_CPU)) {
1309                 priority = MAXPRI_SUPPRESSED;
1310                 max_priority = MAXPRI_SUPPRESSED;
1311         } else {
1312                 switch (proc_get_effective_task_policy(task, TASK_POLICY_ROLE)) {
1313                         case TASK_CONTROL_APPLICATION:
1314                                 priority = BASEPRI_CONTROL;
1315                                 break;
1316                         case TASK_GRAPHICS_SERVER:
1317                                 priority = BASEPRI_GRAPHICS;
1318                                 max_priority = MAXPRI_RESERVED;
1319                                 break;
1320                         default:
1321                                 break;
1322                 }
1323
1324                 /* factor in 'nice' value */
1325                 priority += task->importance;
1326
1327                 if (task->effective_policy.t_qos_clamp != THREAD_QOS_UNSPECIFIED) {
1328                         int qos_clamp_priority = thread_qos_policy_params.qos_pri[task->effective_policy.t_qos_clamp];
1329
1330                         priority        = MIN(priority, qos_clamp_priority);
1331                         max_priority    = MIN(max_priority, qos_clamp_priority);
1332                 }
1333         }
1334
1335         /* avoid extra work if priority isn't changing */
1336         if (task->priority != priority || task->max_priority != max_priority) {
1337                 update_priority = TRUE;
1338
1339                 /* update the scheduling priority for the task */
1340                 task->max_priority = max_priority;
1341
1342                 if (priority > task->max_priority)
1343                         priority = task->max_priority;
1344                 else if (priority < MINPRI)
1345                         priority = MINPRI;
1346
1347                 task->priority = priority;
1348         }
1349
1350         /* Loop over the threads in the task only once, and only if necessary */
1351         if (update_threads || update_throttle || update_priority || update_sfi ) {
1352                 thread_t thread;
1353
1354                 queue_iterate(&task->threads, thread, thread_t, task_threads) {
1355                         if (update_priority) {
1356                                 thread_mtx_lock(thread);
1357
1358                                 thread_task_priority(thread, priority, max_priority);
1359
1360                                 thread_mtx_unlock(thread);
1361                         }
1362
1363                         if (update_throttle) {
1364                                 rethrottle_thread(thread->uthread);
1365                         }
1366
1367                         if (update_sfi) {
1368                                 sfi_reevaluate(thread);
1369                         }
1370
1371                         if (update_threads) {
1372                                 thread->requested_policy.bg_iotier  = task->effective_policy.bg_iotier;
1373                                 thread->requested_policy.terminated = task->effective_policy.terminated;
1374
1375                                 task_policy_update_internal_locked(task, thread, FALSE, NULL);
1376                                 /*  The thread policy must not emit any completion actions due to this change. */
1377                         }
1378                 }
1379         }
1380 }
1381
1382 /*
1383  * Called with task unlocked to do things that can't be done while holding the task lock
1384  */
1385 void
1386 task_policy_update_complete_unlocked(task_t task, thread_t thread, task_pend_token_t pend_token)
1387 {
1388         boolean_t on_task = (thread == THREAD_NULL) ? TRUE : FALSE;
1389
1390 #ifdef MACH_BSD
1391         if (pend_token->tpt_update_sockets)
1392                 proc_apply_task_networkbg(task->bsd_info, thread);
1393 #endif /* MACH_BSD */
1394
1395         if (on_task) {
1396                 /* The timer throttle has been removed or reduced, we need to look for expired timers and fire them */
1397                 if (pend_token->tpt_update_timers)
1398                         ml_timer_evaluate();
1399
1400
1401                 if (pend_token->tpt_update_live_donor)
1402                         task_importance_update_live_donor(task);
1403
1404                 if (pend_token->tpt_update_coal_sfi)
1405                         coalition_sfi_reevaluate(task->coalition, task);
1406         }
1407 }
1408
1409 /*
1410  * Initiate a task policy state transition
1411  *
1412  * Everything that modifies requested except functions that need to hold the task lock
1413  * should use this function
1414  *
1415  * Argument validation should be performed before reaching this point.
1416  *
1417  * TODO: Do we need to check task->active or thread->active?
1418  */
1419 void
1420 proc_set_task_policy(task_t     task,
1421                      thread_t   thread,
1422                      int        category,
1423                      int        flavor,
1424                      int        value)
1425 {
1426         struct task_pend_token pend_token = {};
1427
1428         task_lock(task);
1429
1430         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1431                                   (IMPORTANCE_CODE(flavor, (category | tisthread(thread)))) | DBG_FUNC_START,
1432                                   targetid(task, thread), trequested_0(task, thread), trequested_1(task, thread), value, 0);
1433
1434         proc_set_task_policy_locked(task, thread, category, flavor, value);
1435
1436         task_policy_update_locked(task, thread, &pend_token);
1437
1438         task_unlock(task);
1439
1440         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1441                                   (IMPORTANCE_CODE(flavor, (category | tisthread(thread)))) | DBG_FUNC_END,
1442                                   targetid(task, thread), trequested_0(task, thread), trequested_1(task, thread), tpending(&pend_token), 0);
1443
1444         task_policy_update_complete_unlocked(task, thread, &pend_token);
1445 }
1446
1447 /*
1448  * Initiate a task policy state transition on a thread with its TID
1449  * Useful if you cannot guarantee the thread won't get terminated
1450  */
1451 void
1452 proc_set_task_policy_thread(task_t     task,
1453                             uint64_t   tid,
1454                             int        category,
1455                             int        flavor,
1456                             int        value)
1457 {
1458         thread_t thread;
1459         thread_t self = current_thread();
1460         struct task_pend_token pend_token = {};
1461
1462         task_lock(task);
1463
1464         if (tid == TID_NULL || tid == self->thread_id)
1465                 thread = self;
1466         else
1467                 thread = task_findtid(task, tid);
1468
1469         if (thread == THREAD_NULL) {
1470                 task_unlock(task);
1471                 return;
1472         }
1473
1474         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1475                                   (IMPORTANCE_CODE(flavor, (category | TASK_POLICY_THREAD))) | DBG_FUNC_START,
1476                                   targetid(task, thread), trequested_0(task, thread), trequested_1(task, thread), value, 0);
1477
1478         proc_set_task_policy_locked(task, thread, category, flavor, value);
1479
1480         task_policy_update_locked(task, thread, &pend_token);
1481
1482         task_unlock(task);
1483
1484         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1485                                   (IMPORTANCE_CODE(flavor, (category | TASK_POLICY_THREAD))) | DBG_FUNC_END,
1486                                   targetid(task, thread), trequested_0(task, thread), trequested_1(task, thread), tpending(&pend_token), 0);
1487
1488         task_policy_update_complete_unlocked(task, thread, &pend_token);
1489 }
1490
1491 /*
1492  * Variant of proc_set_task_policy() that sets two scalars in the requested policy structure.
1493  * Same locking rules apply.
1494  */
1495 void
1496 proc_set_task_policy2(task_t task, thread_t thread, int category, int flavor, int value1, int value2)
1497 {
1498         struct task_pend_token pend_token = {};
1499
1500         task_lock(task);
1501
1502         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1503                                   (IMPORTANCE_CODE(flavor, (category | tisthread(thread)))) | DBG_FUNC_START,
1504                                   targetid(task, thread), trequested_0(task, thread), trequested_1(task, thread), value1, 0);
1505
1506         proc_set_task_policy2_locked(task, thread, category, flavor, value1, value2);
1507
1508         task_policy_update_locked(task, thread, &pend_token);
1509
1510         task_unlock(task);
1511
1512         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1513                                   (IMPORTANCE_CODE(flavor, (category | tisthread(thread)))) | DBG_FUNC_END,
1514                                   targetid(task, thread), trequested_0(task, thread), trequested_0(task, thread), tpending(&pend_token), 0);
1515
1516         task_policy_update_complete_unlocked(task, thread, &pend_token);
1517 }
1518
1519 /*
1520  * Set the requested state for a specific flavor to a specific value.
1521  *
1522  *  TODO:
1523  *  Verify that arguments to non iopol things are 1 or 0
1524  */
1525 static void
1526 proc_set_task_policy_locked(task_t      task,
1527                             thread_t    thread,
1528                             int         category,
1529                             int         flavor,
1530                             int         value)
1531 {
1532         boolean_t on_task = (thread == THREAD_NULL) ? TRUE : FALSE;
1533
1534         int tier, passive;
1535
1536         struct task_requested_policy requested =
1537                 (on_task) ? task->requested_policy : thread->requested_policy;
1538
1539         switch (flavor) {
1540
1541         /* Category: EXTERNAL and INTERNAL, thread and task */
1542
1543                 case TASK_POLICY_DARWIN_BG:
1544                         if (category == TASK_POLICY_EXTERNAL)
1545                                 requested.ext_darwinbg = value;
1546                         else
1547                                 requested.int_darwinbg = value;
1548                         break;
1549
1550                 case TASK_POLICY_IOPOL:
1551                         proc_iopol_to_tier(value, &tier, &passive);
1552                         if (category == TASK_POLICY_EXTERNAL) {
1553                                 requested.ext_iotier  = tier;
1554                                 requested.ext_iopassive = passive;
1555                         } else {
1556                                 requested.int_iotier  = tier;
1557                                 requested.int_iopassive = passive;
1558                         }
1559                         break;
1560
1561                 case TASK_POLICY_IO:
1562                         if (category == TASK_POLICY_EXTERNAL)
1563                                 requested.ext_iotier = value;
1564                         else
1565                                 requested.int_iotier = value;
1566                         break;
1567
1568                 case TASK_POLICY_PASSIVE_IO:
1569                         if (category == TASK_POLICY_EXTERNAL)
1570                                 requested.ext_iopassive = value;
1571                         else
1572                                 requested.int_iopassive = value;
1573                         break;
1574
1575         /* Category: INTERNAL, task only */
1576
1577                 case TASK_POLICY_DARWIN_BG_IOPOL:
1578                         assert(on_task && category == TASK_POLICY_INTERNAL);
1579                         proc_iopol_to_tier(value, &tier, &passive);
1580                         requested.bg_iotier = tier;
1581                         break;
1582
1583         /* Category: ATTRIBUTE, task only */
1584
1585                 case TASK_POLICY_TAL:
1586                         assert(on_task && category == TASK_POLICY_ATTRIBUTE);
1587                         requested.t_tal_enabled = value;
1588                         break;
1589
1590                 case TASK_POLICY_BOOST:
1591                         assert(on_task && category == TASK_POLICY_ATTRIBUTE);
1592                         requested.t_boosted = value;
1593                         break;
1594
1595                 case TASK_POLICY_ROLE:
1596                         assert(on_task && category == TASK_POLICY_ATTRIBUTE);
1597                         requested.t_role = value;
1598                         break;
1599
1600                 case TASK_POLICY_TERMINATED:
1601                         assert(on_task && category == TASK_POLICY_ATTRIBUTE);
1602                         requested.terminated = value;
1603                         break;
1604                 case TASK_BASE_LATENCY_QOS_POLICY:
1605                         assert(on_task && category == TASK_POLICY_ATTRIBUTE);
1606                         requested.t_base_latency_qos = value;
1607                         break;
1608                 case TASK_BASE_THROUGHPUT_QOS_POLICY:
1609                         assert(on_task && category == TASK_POLICY_ATTRIBUTE);
1610                         requested.t_base_through_qos = value;
1611                         break;
1612                 case TASK_POLICY_SFI_MANAGED:
1613                         assert(on_task && category == TASK_POLICY_ATTRIBUTE);
1614                         requested.t_sfi_managed = value;
1615                         break;
1616
1617         /* Category: ATTRIBUTE, thread only */
1618
1619                 case TASK_POLICY_PIDBIND_BG:
1620                         assert(!on_task && category == TASK_POLICY_ATTRIBUTE);
1621                         requested.th_pidbind_bg = value;
1622                         break;
1623
1624                 case TASK_POLICY_WORKQ_BG:
1625                         assert(!on_task && category == TASK_POLICY_ATTRIBUTE);
1626                         requested.th_workq_bg = value;
1627                         break;
1628
1629                 case TASK_POLICY_QOS:
1630                         assert(!on_task && category == TASK_POLICY_ATTRIBUTE);
1631                         requested.thrp_qos = value;
1632                         break;
1633
1634                 case TASK_POLICY_QOS_OVERRIDE:
1635                         assert(!on_task && category == TASK_POLICY_ATTRIBUTE);
1636                         requested.thrp_qos_override = value;
1637                         break;
1638
1639                 default:
1640                         panic("unknown task policy: %d %d %d", category, flavor, value);
1641                         break;
1642         }
1643
1644         if (on_task)
1645                 task->requested_policy = requested;
1646         else
1647                 thread->requested_policy = requested;
1648 }
1649
1650 /*
1651  * Variant of proc_set_task_policy_locked() that sets two scalars in the requested policy structure.
1652  */
1653 static void
1654 proc_set_task_policy2_locked(task_t      task,
1655                              thread_t    thread,
1656                              int         category,
1657                              int         flavor,
1658                              int         value1,
1659                              int         value2)
1660 {
1661         boolean_t on_task = (thread == THREAD_NULL) ? TRUE : FALSE;
1662
1663         struct task_requested_policy requested =
1664                 (on_task) ? task->requested_policy : thread->requested_policy;
1665
1666         switch (flavor) {
1667
1668         /* Category: ATTRIBUTE, task only */
1669
1670                 case TASK_POLICY_BASE_LATENCY_AND_THROUGHPUT_QOS:
1671                         assert(on_task && category == TASK_POLICY_ATTRIBUTE);
1672                         requested.t_base_latency_qos = value1;
1673                         requested.t_base_through_qos = value2;
1674                         break;
1675
1676                 case TASK_POLICY_OVERRIDE_LATENCY_AND_THROUGHPUT_QOS:
1677                         assert(on_task && category == TASK_POLICY_ATTRIBUTE);
1678                         requested.t_over_latency_qos = value1;
1679                         requested.t_over_through_qos = value2;
1680                         break;
1681
1682         /* Category: ATTRIBUTE, thread only */
1683
1684                 case TASK_POLICY_QOS_AND_RELPRIO:
1685
1686                         assert(!on_task && category == TASK_POLICY_ATTRIBUTE);
1687                         requested.thrp_qos = value1;
1688                         requested.thrp_qos_relprio = value2;
1689                         DTRACE_BOOST3(qos_set, uint64_t, thread->thread_id, int, requested.thrp_qos, int, requested.thrp_qos_relprio);
1690                         break;
1691
1692                 default:
1693                         panic("unknown task policy: %d %d %d %d", category, flavor, value1, value2);
1694                         break;
1695         }
1696
1697         if (on_task)
1698                 task->requested_policy = requested;
1699         else
1700                 thread->requested_policy = requested;
1701 }
1702
1703
1704 /*
1705  * Gets what you set. Effective values may be different.
1706  */
1707 int
1708 proc_get_task_policy(task_t     task,
1709                      thread_t   thread,
1710                      int        category,
1711                      int        flavor)
1712 {
1713         boolean_t on_task = (thread == THREAD_NULL) ? TRUE : FALSE;
1714
1715         int value = 0;
1716
1717         task_lock(task);
1718
1719         struct task_requested_policy requested =
1720                 (on_task) ? task->requested_policy : thread->requested_policy;
1721
1722         switch (flavor) {
1723                 case TASK_POLICY_DARWIN_BG:
1724                         if (category == TASK_POLICY_EXTERNAL)
1725                                 value = requested.ext_darwinbg;
1726                         else
1727                                 value = requested.int_darwinbg;
1728                         break;
1729                 case TASK_POLICY_IOPOL:
1730                         if (category == TASK_POLICY_EXTERNAL)
1731                                 value = proc_tier_to_iopol(requested.ext_iotier,
1732                                                             requested.ext_iopassive);
1733                         else
1734                                 value = proc_tier_to_iopol(requested.int_iotier,
1735                                                             requested.int_iopassive);
1736                         break;
1737                 case TASK_POLICY_IO:
1738                         if (category == TASK_POLICY_EXTERNAL)
1739                                 value = requested.ext_iotier;
1740                         else
1741                                 value = requested.int_iotier;
1742                         break;
1743                 case TASK_POLICY_PASSIVE_IO:
1744                         if (category == TASK_POLICY_EXTERNAL)
1745                                 value = requested.ext_iopassive;
1746                         else
1747                                 value = requested.int_iopassive;
1748                         break;
1749                 case TASK_POLICY_DARWIN_BG_IOPOL:
1750                         assert(on_task && category == TASK_POLICY_ATTRIBUTE);
1751                         value = proc_tier_to_iopol(requested.bg_iotier, 0);
1752                         break;
1753                 case TASK_POLICY_ROLE:
1754                         assert(on_task && category == TASK_POLICY_ATTRIBUTE);
1755                         value = requested.t_role;
1756                         break;
1757                 case TASK_POLICY_SFI_MANAGED:
1758                         assert(on_task && category == TASK_POLICY_ATTRIBUTE);
1759                         value = requested.t_sfi_managed;
1760                         break;
1761                 case TASK_POLICY_QOS:
1762                         assert(!on_task && category == TASK_POLICY_ATTRIBUTE);
1763                         value = requested.thrp_qos;
1764                         break;
1765                 case TASK_POLICY_QOS_OVERRIDE:
1766                         assert(!on_task && category == TASK_POLICY_ATTRIBUTE);
1767                         value = requested.thrp_qos_override;
1768                         break;
1769                 default:
1770                         panic("unknown policy_flavor %d", flavor);
1771                         break;
1772         }
1773
1774         task_unlock(task);
1775
1776         return value;
1777 }
1778
1779 /*
1780  * Variant of proc_get_task_policy() that returns two scalar outputs.
1781  */
1782 void
1783 proc_get_task_policy2(task_t task, thread_t thread, int category __unused, int flavor, int *value1, int *value2)
1784 {
1785         boolean_t on_task = (thread == THREAD_NULL) ? TRUE : FALSE;
1786
1787         task_lock(task);
1788
1789         struct task_requested_policy requested =
1790                 (on_task) ? task->requested_policy : thread->requested_policy;
1791
1792         switch (flavor) {
1793                 /* TASK attributes */
1794                 case TASK_POLICY_BASE_LATENCY_AND_THROUGHPUT_QOS:
1795                         assert(on_task && category == TASK_POLICY_ATTRIBUTE);
1796                         *value1 = requested.t_base_latency_qos;
1797                         *value2 = requested.t_base_through_qos;
1798                         break;
1799
1800                 case TASK_POLICY_OVERRIDE_LATENCY_AND_THROUGHPUT_QOS:
1801                         assert(on_task && category == TASK_POLICY_ATTRIBUTE);
1802                         *value1 = requested.t_over_latency_qos;
1803                         *value2 = requested.t_over_through_qos;
1804                         break;
1805
1806                 /* THREAD attributes */
1807                 case TASK_POLICY_QOS_AND_RELPRIO:
1808                         assert(!on_task && category == TASK_POLICY_ATTRIBUTE);
1809                         *value1 = requested.thrp_qos;
1810                         *value2 = requested.thrp_qos_relprio;
1811                         break;
1812
1813                 default:
1814                         panic("unknown policy_flavor %d", flavor);
1815                         break;
1816         }
1817
1818         task_unlock(task);
1819 }
1820
1821
1822 /*
1823  * Functions for querying effective state for relevant subsystems
1824  * ONLY the relevant subsystem should query these.
1825  * NEVER take a value from one of the 'effective' functions and stuff it into a setter.
1826  */
1827
1828 int
1829 proc_get_effective_task_policy(task_t task, int flavor)
1830 {
1831         return proc_get_effective_policy(task, THREAD_NULL, flavor);
1832 }
1833
1834 int
1835 proc_get_effective_thread_policy(thread_t thread, int flavor)
1836 {
1837         return proc_get_effective_policy(thread->task, thread, flavor);
1838 }
1839
1840 /*
1841  * Gets what is actually in effect, for subsystems which pull policy instead of receive updates.
1842  *
1843  * NOTE: This accessor does not take the task lock.
1844  * Notifications of state updates need to be externally synchronized with state queries.
1845  * This routine *MUST* remain interrupt safe, as it is potentially invoked
1846  * within the context of a timer interrupt.  It is also called in KDP context for stackshot.
1847  */
1848 static int
1849 proc_get_effective_policy(task_t   task,
1850                           thread_t thread,
1851                           int      flavor)
1852 {
1853         boolean_t on_task = (thread == THREAD_NULL) ? TRUE : FALSE;
1854         int value = 0;
1855
1856         switch (flavor) {
1857                 case TASK_POLICY_DARWIN_BG:
1858                         /*
1859                          * This backs the KPI call proc_pidbackgrounded to find
1860                          * out if a pid is backgrounded,
1861                          * as well as proc_get_effective_thread_policy.
1862                          * Its main use is within the timer layer, as well as
1863                          * prioritizing requests to the graphics system.
1864                          * Returns 1 for background mode, 0 for normal mode
1865                          */
1866                         if (on_task)
1867                                 value = task->effective_policy.darwinbg;
1868                         else
1869                                 value = (task->effective_policy.darwinbg ||
1870                                           thread->effective_policy.darwinbg) ? 1 : 0;
1871                         break;
1872                 case TASK_POLICY_IO:
1873                         /*
1874                          * The I/O system calls here to find out what throttling tier to apply to an operation.
1875                          * Returns THROTTLE_LEVEL_* values. Some userspace spinlock operations can apply
1876                          * a temporary iotier override to make the I/O more aggressive to get the lock
1877                          * owner to release the spinlock.
1878                          */
1879                         if (on_task)
1880                                 value = task->effective_policy.io_tier;
1881                         else {
1882                                 value = MAX(task->effective_policy.io_tier,
1883                                              thread->effective_policy.io_tier);
1884                                 if (thread->iotier_override != THROTTLE_LEVEL_NONE)
1885                                         value = MIN(value, thread->iotier_override);
1886                         }
1887                         break;
1888                 case TASK_POLICY_PASSIVE_IO:
1889                         /*
1890                          * The I/O system calls here to find out whether an operation should be passive.
1891                          * (i.e. not cause operations with lower throttle tiers to be throttled)
1892                          * Returns 1 for passive mode, 0 for normal mode.
1893                          * If a userspace spinlock has applied an override, that I/O should always
1894                          * be passive to avoid self-throttling when the override is removed and lower
1895                          * iotier I/Os are issued.
1896                          */
1897                         if (on_task)
1898                                 value = task->effective_policy.io_passive;
1899                         else {
1900                                 int io_tier = MAX(task->effective_policy.io_tier, thread->effective_policy.io_tier);
1901                                 boolean_t override_in_effect = (thread->iotier_override != THROTTLE_LEVEL_NONE) && (thread->iotier_override < io_tier);
1902
1903                                 value = (task->effective_policy.io_passive ||
1904                                           thread->effective_policy.io_passive || override_in_effect) ? 1 : 0;
1905                         }
1906                         break;
1907                 case TASK_POLICY_ALL_SOCKETS_BG:
1908                         /*
1909                          * do_background_socket() calls this to determine what it should do to the proc's sockets
1910                          * Returns 1 for background mode, 0 for normal mode
1911                          *
1912                          * This consults both thread and task so un-DBGing a thread while the task is BG
1913                          * doesn't get you out of the network throttle.
1914                          */
1915                         if (on_task)
1916                                 value = task->effective_policy.all_sockets_bg;
1917                         else
1918                                 value = (task->effective_policy.all_sockets_bg ||
1919                                          thread->effective_policy.all_sockets_bg) ? 1 : 0;
1920                         break;
1921                 case TASK_POLICY_NEW_SOCKETS_BG:
1922                         /*
1923                          * socreate() calls this to determine if it should mark a new socket as background
1924                          * Returns 1 for background mode, 0 for normal mode
1925                          */
1926                         if (on_task)
1927                                 value = task->effective_policy.new_sockets_bg;
1928                         else
1929                                 value = (task->effective_policy.new_sockets_bg ||
1930                                           thread->effective_policy.new_sockets_bg) ? 1 : 0;
1931                         break;
1932                 case TASK_POLICY_LOWPRI_CPU:
1933                         /*
1934                          * Returns 1 for low priority cpu mode, 0 for normal mode
1935                          */
1936                         if (on_task)
1937                                 value = task->effective_policy.lowpri_cpu;
1938                         else
1939                                 value = (task->effective_policy.lowpri_cpu ||
1940                                           thread->effective_policy.lowpri_cpu) ? 1 : 0;
1941                         break;
1942                 case TASK_POLICY_SUPPRESSED_CPU:
1943                         /*
1944                          * Returns 1 for suppressed cpu mode, 0 for normal mode
1945                          */
1946                         assert(on_task);
1947                         value = task->effective_policy.t_suppressed_cpu;
1948                         break;
1949                 case TASK_POLICY_LATENCY_QOS:
1950                         /*
1951                          * timer arming calls into here to find out the timer coalescing level
1952                          * Returns a QoS tier (0-6)
1953                          */
1954                         if (on_task) {
1955                                 value = task->effective_policy.t_latency_qos;
1956                         } else {
1957                                 value = MAX(task->effective_policy.t_latency_qos, thread->effective_policy.t_latency_qos);
1958                         }
1959                         break;
1960                 case TASK_POLICY_THROUGH_QOS:
1961                         /*
1962                          * Returns a QoS tier (0-6)
1963                          */
1964                         assert(on_task);
1965                         value = task->effective_policy.t_through_qos;
1966                         break;
1967                 case TASK_POLICY_ROLE:
1968                         assert(on_task);
1969                         value = task->effective_policy.t_role;
1970                         break;
1971                 case TASK_POLICY_WATCHERS_BG:
1972                         assert(on_task);
1973                         value = task->effective_policy.t_watchers_bg;
1974                         break;
1975                 case TASK_POLICY_SFI_MANAGED:
1976                         assert(on_task);
1977                         value = task->effective_policy.t_sfi_managed;
1978                         break;
1979                 case TASK_POLICY_QOS:
1980                         assert(!on_task);
1981                         value = thread->effective_policy.thep_qos;
1982                         break;
1983                 default:
1984                         panic("unknown policy_flavor %d", flavor);
1985                         break;
1986         }
1987
1988         return value;
1989 }
1990
1991 /*
1992  * Convert from IOPOL_* values to throttle tiers.
1993  *
1994  * TODO: Can this be made more compact, like an array lookup
1995  * Note that it is possible to support e.g. IOPOL_PASSIVE_STANDARD in the future
1996  */
1997
1998 static void
1999 proc_iopol_to_tier(int iopolicy, int *tier, int *passive)
2000 {
2001         *passive = 0;
2002         *tier = 0;
2003         switch (iopolicy) {
2004                 case IOPOL_IMPORTANT:
2005                         *tier = THROTTLE_LEVEL_TIER0;
2006                         break;
2007                 case IOPOL_PASSIVE:
2008                         *tier = THROTTLE_LEVEL_TIER0;
2009                         *passive = 1;
2010                         break;
2011                 case IOPOL_STANDARD:
2012                         *tier = THROTTLE_LEVEL_TIER1;
2013                         break;
2014                 case IOPOL_UTILITY:
2015                         *tier = THROTTLE_LEVEL_TIER2;
2016                         break;
2017                 case IOPOL_THROTTLE:
2018                         *tier = THROTTLE_LEVEL_TIER3;
2019                         break;
2020                 default:
2021                         panic("unknown I/O policy %d", iopolicy);
2022                         break;
2023         }
2024 }
2025
2026 static int
2027 proc_tier_to_iopol(int tier, int passive)
2028 {
2029         if (passive == 1) {
2030                 switch (tier) {
2031                         case THROTTLE_LEVEL_TIER0:
2032                                 return IOPOL_PASSIVE;
2033                                 break;
2034                         default:
2035                                 panic("unknown passive tier %d", tier);
2036                                 return IOPOL_DEFAULT;
2037                                 break;
2038                 }
2039         } else {
2040                 switch (tier) {
2041                         case THROTTLE_LEVEL_NONE:
2042                         case THROTTLE_LEVEL_TIER0:
2043                                 return IOPOL_DEFAULT;
2044                                 break;
2045                         case THROTTLE_LEVEL_TIER1:
2046                                 return IOPOL_STANDARD;
2047                                 break;
2048                         case THROTTLE_LEVEL_TIER2:
2049                                 return IOPOL_UTILITY;
2050                                 break;
2051                         case THROTTLE_LEVEL_TIER3:
2052                                 return IOPOL_THROTTLE;
2053                                 break;
2054                         default:
2055                                 panic("unknown tier %d", tier);
2056                                 return IOPOL_DEFAULT;
2057                                 break;
2058                 }
2059         }
2060 }
2061
2062 /* apply internal backgrounding for workqueue threads */
2063 int
2064 proc_apply_workq_bgthreadpolicy(thread_t thread)
2065 {
2066         if (thread == THREAD_NULL)
2067                 return ESRCH;
2068
2069         proc_set_task_policy(thread->task, thread, TASK_POLICY_ATTRIBUTE,
2070                              TASK_POLICY_WORKQ_BG, TASK_POLICY_ENABLE);
2071
2072         return(0);
2073 }
2074
2075 /*
2076  * remove internal backgrounding for workqueue threads
2077  * does NOT go find sockets created while BG and unbackground them
2078  */
2079 int
2080 proc_restore_workq_bgthreadpolicy(thread_t thread)
2081 {
2082         if (thread == THREAD_NULL)
2083                 return ESRCH;
2084
2085         proc_set_task_policy(thread->task, thread, TASK_POLICY_ATTRIBUTE,
2086                              TASK_POLICY_WORKQ_BG, TASK_POLICY_DISABLE);
2087
2088         return(0);
2089 }
2090
2091 /* here for temporary compatibility */
2092 int
2093 proc_setthread_saved_importance(__unused thread_t thread, __unused int importance)
2094 {
2095         return(0);
2096 }
2097
2098 /*
2099  * Set an override on the thread which is consulted with a
2100  * higher priority than the task/thread policy. This should
2101  * only be set for temporary grants until the thread
2102  * returns to the userspace boundary
2103  *
2104  * We use atomic operations to swap in the override, with
2105  * the assumption that the thread itself can
2106  * read the override and clear it on return to userspace.
2107  *
2108  * No locking is performed, since it is acceptable to see
2109  * a stale override for one loop through throttle_lowpri_io().
2110  * However a thread reference must be held on the thread.
2111  */
2112
2113 void set_thread_iotier_override(thread_t thread, int policy)
2114 {
2115         int current_override;
2116
2117         /* Let most aggressive I/O policy win until user boundary */
2118         do {
2119                 current_override = thread->iotier_override;
2120
2121                 if (current_override != THROTTLE_LEVEL_NONE)
2122                         policy = MIN(current_override, policy);
2123
2124                 if (current_override == policy) {
2125                         /* no effective change */
2126                         return;
2127                 }
2128         } while (!OSCompareAndSwap(current_override, policy, &thread->iotier_override));
2129
2130         /*
2131          * Since the thread may be currently throttled,
2132          * re-evaluate tiers and potentially break out
2133          * of an msleep
2134          */
2135         rethrottle_thread(thread->uthread);
2136 }
2137
2138 /*
2139  * Userspace synchronization routines (like pthread mutexes, pthread reader-writer locks,
2140  * semaphores, dispatch_sync) may result in priority inversions where a higher priority
2141  * (i.e. scheduler priority, I/O tier, QoS tier) is waiting on a resource owned by a lower
2142  * priority thread. In these cases, we attempt to propagate the priority token, as long
2143  * as the subsystem informs us of the relationships between the threads. The userspace
2144  * synchronization subsystem should maintain the information of owner->resource and
2145  * resource->waiters itself.
2146  */
2147
2148 /*
2149  * This helper canonicalizes the resource/resource_type given the current qos_override_mode
2150  * in effect. Note that wildcards (THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD) may need
2151  * to be handled specially in the future, but for now it's fine to slam
2152  * *resource to USER_ADDR_NULL even if it was previously a wildcard.
2153  */
2154 static void _canonicalize_resource_and_type(user_addr_t *resource, int *resource_type) {
2155         if (qos_override_mode == QOS_OVERRIDE_MODE_OVERHANG_PEAK || qos_override_mode == QOS_OVERRIDE_MODE_IGNORE_OVERRIDE) {
2156                 /* Map all input resource/type to a single one */
2157                 *resource = USER_ADDR_NULL;
2158                 *resource_type = THREAD_QOS_OVERRIDE_TYPE_UNKNOWN;
2159         } else if (qos_override_mode == QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE) {
2160                 /* no transform */
2161         } else if (qos_override_mode == QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_IGNORE_DISPATCH) {
2162                 /* Map all dispatch overrides to a single one, to avoid memory overhead */
2163                 if (*resource_type == THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE) {
2164                         *resource = USER_ADDR_NULL;
2165                 }
2166         } else if (qos_override_mode == QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_SINGLE_MUTEX_OVERRIDE) {
2167                 /* Map all mutex overrides to a single one, to avoid memory overhead */
2168                 if (*resource_type == THREAD_QOS_OVERRIDE_TYPE_PTHREAD_MUTEX) {
2169                         *resource = USER_ADDR_NULL;
2170                 }
2171         }
2172 }
2173
2174 /* This helper routine finds an existing override if known. Locking should be done by caller */
2175 static struct thread_qos_override *_find_qos_override(thread_t thread, user_addr_t resource, int resource_type) {
2176         struct thread_qos_override *override;
2177
2178         override = thread->overrides;
2179         while (override) {
2180                 if (override->override_resource == resource &&
2181                         override->override_resource_type == resource_type) {
2182                         return override;
2183                 }
2184
2185                 override = override->override_next;
2186         }
2187
2188         return NULL;
2189 }
2190
2191 static void _find_and_decrement_qos_override(thread_t thread, user_addr_t resource, int resource_type, boolean_t reset, struct thread_qos_override **free_override_list) {
2192         struct thread_qos_override *override, *override_prev;
2193
2194         override_prev = NULL;
2195         override = thread->overrides;
2196         while (override) {
2197                 struct thread_qos_override *override_next = override->override_next;
2198
2199                 if ((THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD == resource || override->override_resource == resource) &&
2200                         override->override_resource_type == resource_type) {
2201                         if (reset) {
2202                                 override->override_contended_resource_count = 0;
2203                         } else {
2204                                 override->override_contended_resource_count--;
2205                         }
2206
2207                         if (override->override_contended_resource_count == 0) {
2208                                 if (override_prev == NULL) {
2209                                         thread->overrides = override_next;
2210                                 } else {
2211                                         override_prev->override_next = override_next;
2212                                 }
2213
2214                                 /* Add to out-param for later zfree */
2215                                 override->override_next = *free_override_list;
2216                                 *free_override_list = override;
2217                         } else {
2218                                 override_prev = override;
2219                         }
2220
2221                         if (THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD != resource) {
2222                                 return;
2223                         }
2224                 } else {
2225                         override_prev = override;
2226                 }
2227
2228                 override = override_next;
2229         }
2230 }
2231
2232 /* This helper recalculates the current requested override using the policy selected at boot */
2233 static int _calculate_requested_qos_override(thread_t thread)
2234 {
2235         if (qos_override_mode == QOS_OVERRIDE_MODE_IGNORE_OVERRIDE) {
2236                 return THREAD_QOS_UNSPECIFIED;
2237         }
2238
2239         /* iterate over all overrides and calculate MAX */
2240         struct thread_qos_override *override;
2241         int qos_override = THREAD_QOS_UNSPECIFIED;
2242
2243         override = thread->overrides;
2244         while (override) {
2245                 if (qos_override_mode != QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_IGNORE_DISPATCH ||
2246                         override->override_resource_type != THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE) {
2247                         qos_override = MAX(qos_override, override->override_qos);
2248                 }
2249
2250                 override = override->override_next;
2251         }
2252
2253         return qos_override;
2254 }
2255
2256 boolean_t proc_thread_qos_add_override(task_t task, thread_t thread, uint64_t tid, int override_qos, boolean_t first_override_for_resource, user_addr_t resource, int resource_type)
2257 {
2258         thread_t        self = current_thread();
2259         struct task_pend_token pend_token = {};
2260
2261         /* XXX move to thread mutex when thread policy does */
2262         task_lock(task);
2263
2264         /*
2265          * If thread is passed, it is assumed to be most accurate, since the caller must have an explicit (or implicit) reference
2266          * to the thread
2267          */
2268
2269         if (thread != THREAD_NULL) {
2270                 assert(task == thread->task);
2271         } else {
2272                 if (tid == self->thread_id) {
2273                         thread = self;
2274                 } else {
2275                         thread = task_findtid(task, tid);
2276
2277                         if (thread == THREAD_NULL) {
2278                                 KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_ADD_OVERRIDE)) | DBG_FUNC_NONE,
2279                                                                           tid, 0, 0xdead, 0, 0);
2280                                 task_unlock(task);
2281                                 return FALSE;
2282                         }
2283                 }
2284         }
2285
2286         KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_ADD_OVERRIDE)) | DBG_FUNC_START,
2287                                                   thread_tid(thread), override_qos, first_override_for_resource ? 1 : 0, 0, 0);
2288
2289         DTRACE_BOOST5(qos_add_override_pre, uint64_t, tid, uint64_t, thread->requested_policy.thrp_qos,
2290                 uint64_t, thread->effective_policy.thep_qos, int, override_qos, boolean_t, first_override_for_resource);
2291
2292         struct task_requested_policy requested = thread->requested_policy;
2293         struct thread_qos_override *override;
2294         struct thread_qos_override *deferred_free_override = NULL;
2295         int new_qos_override, prev_qos_override;
2296         int new_effective_qos;
2297         boolean_t has_thread_reference = FALSE;
2298
2299         _canonicalize_resource_and_type(&resource, &resource_type);
2300
2301         if (first_override_for_resource) {
2302                 override = _find_qos_override(thread, resource, resource_type);
2303                 if (override) {
2304                         override->override_contended_resource_count++;
2305                 } else {
2306                         struct thread_qos_override *override_new;
2307
2308                         /* We need to allocate a new object. Drop the task lock and recheck afterwards in case someone else added the override */
2309                         thread_reference(thread);
2310                         has_thread_reference = TRUE;
2311                         task_unlock(task);
2312                         override_new = zalloc(thread_qos_override_zone);
2313                         task_lock(task);
2314
2315                         override = _find_qos_override(thread, resource, resource_type);
2316                         if (override) {
2317                                 /* Someone else already allocated while the task lock was dropped */
2318                                 deferred_free_override = override_new;
2319                                 override->override_contended_resource_count++;
2320                         } else {
2321                                 override = override_new;
2322                                 override->override_next = thread->overrides;
2323                                 override->override_contended_resource_count = 1 /* since first_override_for_resource was TRUE */;
2324                                 override->override_resource = resource;
2325                                 override->override_resource_type = resource_type;
2326                                 override->override_qos = THREAD_QOS_UNSPECIFIED;
2327                                 thread->overrides = override;
2328                         }
2329                 }
2330         } else {
2331                 override = _find_qos_override(thread, resource, resource_type);
2332         }
2333
2334         if (override) {
2335                 if (override->override_qos == THREAD_QOS_UNSPECIFIED)
2336                         override->override_qos = override_qos;
2337                 else
2338                         override->override_qos = MAX(override->override_qos, override_qos);
2339         }
2340
2341         /* Determine how to combine the various overrides into a single current requested override */
2342         prev_qos_override = requested.thrp_qos_override;
2343         new_qos_override = _calculate_requested_qos_override(thread);
2344
2345         if (new_qos_override != prev_qos_override) {
2346                 requested.thrp_qos_override = new_qos_override;
2347
2348                 thread->requested_policy = requested;
2349
2350                 task_policy_update_locked(task, thread, &pend_token);
2351
2352                 if (!has_thread_reference) {
2353                         thread_reference(thread);
2354                 }
2355
2356                 task_unlock(task);
2357
2358                 task_policy_update_complete_unlocked(task, thread, &pend_token);
2359
2360                 new_effective_qos = thread->effective_policy.thep_qos;
2361
2362                 thread_deallocate(thread);
2363         } else {
2364                 new_effective_qos = thread->effective_policy.thep_qos;
2365
2366                 task_unlock(task);
2367
2368                 if (has_thread_reference) {
2369                         thread_deallocate(thread);
2370                 }
2371         }
2372
2373         if (deferred_free_override) {
2374                 zfree(thread_qos_override_zone, deferred_free_override);
2375         }
2376
2377         DTRACE_BOOST3(qos_add_override_post, int, prev_qos_override, int, new_qos_override,
2378                                   int, new_effective_qos);
2379
2380         KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_ADD_OVERRIDE)) | DBG_FUNC_END,
2381                                                   new_qos_override, resource, resource_type, 0, 0);
2382
2383         return TRUE;
2384 }
2385
2386
2387 static boolean_t _proc_thread_qos_remove_override_internal(task_t task, thread_t thread, uint64_t tid, user_addr_t resource, int resource_type, boolean_t reset)
2388 {
2389         thread_t        self = current_thread();
2390         struct task_pend_token pend_token = {};
2391
2392         /* XXX move to thread mutex when thread policy does */
2393         task_lock(task);
2394
2395         /*
2396          * If thread is passed, it is assumed to be most accurate, since the caller must have an explicit (or implicit) reference
2397          * to the thread
2398          */
2399         if (thread != THREAD_NULL) {
2400                 assert(task == thread->task);
2401         } else {
2402                 if (tid == self->thread_id) {
2403                         thread = self;
2404                 } else {
2405                         thread = task_findtid(task, tid);
2406
2407                         if (thread == THREAD_NULL) {
2408                                 KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_REMOVE_OVERRIDE)) | DBG_FUNC_NONE,
2409                                                                           tid, 0, 0xdead, 0, 0);
2410                                 task_unlock(task);
2411                                 return FALSE;
2412                         }
2413                 }
2414         }
2415
2416         struct task_requested_policy requested = thread->requested_policy;
2417         struct thread_qos_override *deferred_free_override_list = NULL;
2418         int new_qos_override, prev_qos_override;
2419
2420         _canonicalize_resource_and_type(&resource, &resource_type);
2421
2422         _find_and_decrement_qos_override(thread, resource, resource_type, reset, &deferred_free_override_list);
2423
2424         KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_REMOVE_OVERRIDE)) | DBG_FUNC_START,
2425                                                   thread_tid(thread), resource, reset, 0, 0);
2426
2427         /* Determine how to combine the various overrides into a single current requested override */
2428         prev_qos_override = requested.thrp_qos_override;
2429         new_qos_override = _calculate_requested_qos_override(thread);
2430
2431         if (new_qos_override != prev_qos_override) {
2432                 requested.thrp_qos_override = new_qos_override;
2433
2434                 thread->requested_policy = requested;
2435
2436                 task_policy_update_locked(task, thread, &pend_token);
2437
2438                 thread_reference(thread);
2439
2440                 task_unlock(task);
2441
2442                 task_policy_update_complete_unlocked(task, thread, &pend_token);
2443
2444                 thread_deallocate(thread);
2445         } else {
2446                 task_unlock(task);
2447         }
2448
2449         while (deferred_free_override_list) {
2450                 struct thread_qos_override *override_next = deferred_free_override_list->override_next;
2451
2452                 zfree(thread_qos_override_zone, deferred_free_override_list);
2453                 deferred_free_override_list = override_next;
2454         }
2455
2456         KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_REMOVE_OVERRIDE)) | DBG_FUNC_END,
2457                                                   0, 0, 0, 0, 0);
2458
2459         return TRUE;
2460 }
2461
2462 boolean_t proc_thread_qos_remove_override(task_t task, thread_t thread, uint64_t tid, user_addr_t resource, int resource_type)
2463 {
2464         return _proc_thread_qos_remove_override_internal(task, thread, tid, resource, resource_type, FALSE);
2465
2466 }
2467
2468 boolean_t proc_thread_qos_reset_override(task_t task, thread_t thread, uint64_t tid, user_addr_t resource, int resource_type)
2469 {
2470         return _proc_thread_qos_remove_override_internal(task, thread, tid, resource, resource_type, TRUE);
2471 }
2472
2473 /* Deallocate before thread termination */
2474 void proc_thread_qos_deallocate(thread_t thread)
2475 {
2476         task_t task = thread->task;
2477         struct thread_qos_override *override;
2478
2479         /* XXX move to thread mutex when thread policy does */
2480         task_lock(task);
2481         override = thread->overrides;
2482         thread->overrides = NULL;               /* task policy re-evaluation needed? */
2483         thread->requested_policy.thrp_qos_override = THREAD_QOS_UNSPECIFIED;
2484         task_unlock(task);
2485
2486         while (override) {
2487                 struct thread_qos_override *override_next = override->override_next;
2488
2489                 zfree(thread_qos_override_zone, override);
2490                 override = override_next;
2491         }
2492 }
2493
2494 /* TODO: remove this variable when interactive daemon audit period is over */
2495 extern boolean_t ipc_importance_interactive_receiver;
2496
2497 /*
2498  * Called at process exec to initialize the apptype, qos clamp, and qos seed of a process
2499  *
2500  * TODO: Make this function more table-driven instead of ad-hoc
2501  */
2502 void
2503 proc_set_task_spawnpolicy(task_t task, int apptype, int qos_clamp,
2504                           ipc_port_t * portwatch_ports, int portwatch_count)
2505 {
2506         struct task_pend_token pend_token = {};
2507
2508         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2509                                   (IMPORTANCE_CODE(IMP_TASK_APPTYPE, apptype)) | DBG_FUNC_START,
2510                                   audit_token_pid_from_task(task), trequested_0(task, THREAD_NULL), trequested_1(task, THREAD_NULL),
2511                                   apptype, 0);
2512
2513         switch (apptype) {
2514                 case TASK_APPTYPE_APP_TAL:
2515                 case TASK_APPTYPE_APP_DEFAULT:
2516                         /* Apps become donors via the 'live-donor' flag instead of the static donor flag */
2517                         task_importance_mark_donor(task, FALSE);
2518                         task_importance_mark_live_donor(task, TRUE);
2519                         task_importance_mark_receiver(task, FALSE);
2520                         /* Apps are de-nap recievers on desktop for suppression behaviors */
2521                         task_importance_mark_denap_receiver(task, TRUE);
2522                         break;
2523
2524                 case TASK_APPTYPE_DAEMON_INTERACTIVE:
2525                         task_importance_mark_donor(task, TRUE);
2526                         task_importance_mark_live_donor(task, FALSE);
2527
2528                         /*
2529                          * A boot arg controls whether interactive daemons are importance receivers.
2530                          * Normally, they are not.  But for testing their behavior as an adaptive
2531                          * daemon, the boot-arg can be set.
2532                          *
2533                          * TODO: remove this when the interactive daemon audit period is over.
2534                          */
2535                         task_importance_mark_receiver(task, /* FALSE */ ipc_importance_interactive_receiver);
2536                         task_importance_mark_denap_receiver(task, FALSE);
2537                         break;
2538
2539                 case TASK_APPTYPE_DAEMON_STANDARD:
2540                         task_importance_mark_donor(task, TRUE);
2541                         task_importance_mark_live_donor(task, FALSE);
2542                         task_importance_mark_receiver(task, FALSE);
2543                         task_importance_mark_denap_receiver(task, FALSE);
2544                         break;
2545
2546                 case TASK_APPTYPE_DAEMON_ADAPTIVE:
2547                         task_importance_mark_donor(task, FALSE);
2548                         task_importance_mark_live_donor(task, FALSE);
2549                         task_importance_mark_receiver(task, TRUE);
2550                         task_importance_mark_denap_receiver(task, FALSE);
2551                         break;
2552
2553                 case TASK_APPTYPE_DAEMON_BACKGROUND:
2554                         task_importance_mark_donor(task, FALSE);
2555                         task_importance_mark_live_donor(task, FALSE);
2556                         task_importance_mark_receiver(task, FALSE);
2557                         task_importance_mark_denap_receiver(task, FALSE);
2558                         break;
2559
2560                 case TASK_APPTYPE_NONE:
2561                         break;
2562         }
2563
2564         if (portwatch_ports != NULL && apptype == TASK_APPTYPE_DAEMON_ADAPTIVE) {
2565                 int portwatch_boosts = 0;
2566
2567                 for (int i = 0; i < portwatch_count; i++) {
2568                         ipc_port_t port = NULL;
2569
2570                         if ((port = portwatch_ports[i]) != NULL) {
2571                                 int boost = 0;
2572                                 task_add_importance_watchport(task, port, &boost);
2573                                 portwatch_boosts += boost;
2574                         }
2575                 }
2576
2577                 if (portwatch_boosts > 0) {
2578                         task_importance_hold_internal_assertion(task, portwatch_boosts);
2579                 }
2580         }
2581
2582         task_lock(task);
2583
2584         if (apptype == TASK_APPTYPE_APP_TAL) {
2585                 /* TAL starts off enabled by default */
2586                 task->requested_policy.t_tal_enabled = 1;
2587         }
2588
2589         if (apptype != TASK_APPTYPE_NONE) {
2590                 task->requested_policy.t_apptype = apptype;
2591
2592         }
2593
2594         if (qos_clamp != THREAD_QOS_UNSPECIFIED) {
2595                 task->requested_policy.t_qos_clamp = qos_clamp;
2596         }
2597
2598         task_policy_update_locked(task, THREAD_NULL, &pend_token);
2599
2600         task_unlock(task);
2601
2602         /* Ensure the donor bit is updated to be in sync with the new live donor status */
2603         pend_token.tpt_update_live_donor = 1;
2604
2605         task_policy_update_complete_unlocked(task, THREAD_NULL, &pend_token);
2606
2607         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2608                                   (IMPORTANCE_CODE(IMP_TASK_APPTYPE, apptype)) | DBG_FUNC_END,
2609                                   audit_token_pid_from_task(task), trequested_0(task, THREAD_NULL), trequested_1(task, THREAD_NULL),
2610                                   task_is_importance_receiver(task), 0);
2611 }
2612
2613 /* Set up the primordial thread's QoS */
2614 void
2615 task_set_main_thread_qos(task_t task, thread_t main_thread) {
2616         struct task_pend_token pend_token = {};
2617
2618         assert(main_thread->task == task);
2619
2620         task_lock(task);
2621
2622         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2623                                   (IMPORTANCE_CODE(IMP_MAIN_THREAD_QOS, 0)) | DBG_FUNC_START,
2624                                   audit_token_pid_from_task(task), trequested_0(task, THREAD_NULL), trequested_1(task, THREAD_NULL),
2625                                   main_thread->requested_policy.thrp_qos, 0);
2626
2627         int primordial_qos = THREAD_QOS_UNSPECIFIED;
2628
2629         int qos_clamp = task->requested_policy.t_qos_clamp;
2630
2631         switch (task->requested_policy.t_apptype) {
2632                 case TASK_APPTYPE_APP_TAL:
2633                 case TASK_APPTYPE_APP_DEFAULT:
2634                         primordial_qos = THREAD_QOS_USER_INTERACTIVE;
2635                         break;
2636
2637                 case TASK_APPTYPE_DAEMON_INTERACTIVE:
2638                 case TASK_APPTYPE_DAEMON_STANDARD:
2639                 case TASK_APPTYPE_DAEMON_ADAPTIVE:
2640                         primordial_qos = THREAD_QOS_LEGACY;
2641                         break;
2642
2643                 case TASK_APPTYPE_DAEMON_BACKGROUND:
2644                         primordial_qos = THREAD_QOS_BACKGROUND;
2645                         break;
2646         }
2647
2648         if (qos_clamp != THREAD_QOS_UNSPECIFIED) {
2649                 if (primordial_qos != THREAD_QOS_UNSPECIFIED) {
2650                         primordial_qos = MIN(qos_clamp, primordial_qos);
2651                 } else {
2652                         primordial_qos = qos_clamp;
2653                 }
2654         }
2655
2656         main_thread->requested_policy.thrp_qos = primordial_qos;
2657
2658         task_policy_update_locked(task, main_thread, &pend_token);
2659
2660         task_unlock(task);
2661
2662         task_policy_update_complete_unlocked(task, main_thread, &pend_token);
2663
2664         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2665                                   (IMPORTANCE_CODE(IMP_MAIN_THREAD_QOS, 0)) | DBG_FUNC_END,
2666                                   audit_token_pid_from_task(task), trequested_0(task, THREAD_NULL), trequested_1(task, THREAD_NULL),
2667                                   primordial_qos, 0);
2668 }
2669
2670 /* for process_policy to check before attempting to set */
2671 boolean_t
2672 proc_task_is_tal(task_t task)
2673 {
2674         return (task->requested_policy.t_apptype == TASK_APPTYPE_APP_TAL) ? TRUE : FALSE;
2675 }
2676
2677 /* for telemetry */
2678 integer_t
2679 task_grab_latency_qos(task_t task)
2680 {
2681         return qos_latency_policy_package(proc_get_effective_task_policy(task, TASK_POLICY_LATENCY_QOS));
2682 }
2683
2684 /* update the darwin background action state in the flags field for libproc */
2685 int
2686 proc_get_darwinbgstate(task_t task, uint32_t * flagsp)
2687 {
2688         if (task->requested_policy.ext_darwinbg)
2689                 *flagsp |= PROC_FLAG_EXT_DARWINBG;
2690
2691         if (task->requested_policy.int_darwinbg)
2692                 *flagsp |= PROC_FLAG_DARWINBG;
2693
2694
2695         if (task->requested_policy.t_apptype == TASK_APPTYPE_APP_DEFAULT ||
2696             task->requested_policy.t_apptype == TASK_APPTYPE_APP_TAL)
2697                 *flagsp |= PROC_FLAG_APPLICATION;
2698
2699         if (task->requested_policy.t_apptype == TASK_APPTYPE_DAEMON_ADAPTIVE)
2700                 *flagsp |= PROC_FLAG_ADAPTIVE;
2701
2702         if (task->requested_policy.t_apptype == TASK_APPTYPE_DAEMON_ADAPTIVE && task->requested_policy.t_boosted == 1)
2703                 *flagsp |= PROC_FLAG_ADAPTIVE_IMPORTANT;
2704
2705         if (task_is_importance_donor(task))
2706                 *flagsp |= PROC_FLAG_IMPORTANCE_DONOR;
2707
2708         if (task->effective_policy.t_sup_active)
2709                 *flagsp |= PROC_FLAG_SUPPRESSED;
2710
2711         return(0);
2712 }
2713
2714 /* All per-thread state is in the first 32-bits of the bitfield */
2715 void
2716 proc_get_thread_policy(thread_t thread, thread_policy_state_t info)
2717 {
2718         task_t task = thread->task;
2719         task_lock(task);
2720         info->requested = (integer_t)task_requested_bitfield(task, thread);
2721         info->effective = (integer_t)task_effective_bitfield(task, thread);
2722         info->pending   = 0;
2723         task_unlock(task);
2724 }
2725
2726 /*
2727  * Tracepoint data... Reading the tracepoint data can be somewhat complicated.
2728  * The current scheme packs as much data into a single tracepoint as it can.
2729  *
2730  * Each task/thread requested/effective structure is 64 bits in size. Any
2731  * given tracepoint will emit either requested or effective data, but not both.
2732  *
2733  * A tracepoint may emit any of task, thread, or task & thread data.
2734  *
2735  * The type of data emitted varies with pointer size. Where possible, both
2736  * task and thread data are emitted. In LP32 systems, the first and second
 * halves of either the task or thread data are emitted.
2738  *
2739  * The code uses uintptr_t array indexes instead of high/low to avoid
2740  * confusion WRT big vs little endian.
2741  *
2742  * The truth table for the tracepoint data functions is below, and has the
2743  * following invariants:
2744  *
2745  * 1) task and thread are uintptr_t*
2746  * 2) task may never be NULL
2747  *
2748  *
2749  *                                     LP32            LP64
2750  * trequested_0(task, NULL)            task[0]         task[0]
2751  * trequested_1(task, NULL)            task[1]         NULL
2752  * trequested_0(task, thread)          thread[0]       task[0]
2753  * trequested_1(task, thread)          thread[1]       thread[0]
2754  *
2755  * Basically, you get a full task or thread on LP32, and both on LP64.
2756  *
2757  * The uintptr_t munging here is squicky enough to deserve a comment.
2758  *
2759  * The variables we are accessing are laid out in memory like this:
2760  *
2761  * [            LP64 uintptr_t  0          ]
2762  * [ LP32 uintptr_t 0 ] [ LP32 uintptr_t 1 ]
2763  *
2764  *      1   2   3   4     5   6   7   8
2765  *
2766  */
2767
2768 static uintptr_t
2769 trequested_0(task_t task, thread_t thread)
2770 {
2771         assert(task);
2772         _Static_assert(sizeof(struct task_requested_policy) == sizeof(uint64_t), "size invariant violated");
2773         _Static_assert(sizeof(task->requested_policy) == sizeof(thread->requested_policy), "size invariant violated");
2774
2775         uintptr_t* raw = (uintptr_t*)((thread == THREAD_NULL) ? &task->requested_policy : &thread->requested_policy);
2776         return raw[0];
2777 }
2778
2779 static uintptr_t
2780 trequested_1(task_t task, thread_t thread)
2781 {
2782         assert(task);
2783         _Static_assert(sizeof(struct task_requested_policy) == sizeof(uint64_t), "size invariant violated");
2784         _Static_assert(sizeof(task->requested_policy) == sizeof(thread->requested_policy), "size invariant violated");
2785
2786 #if defined __LP64__
2787         return (thread == NULL) ? 0 : *(uintptr_t*)&thread->requested_policy;
2788 #else
2789         uintptr_t* raw = (uintptr_t*)((thread == THREAD_NULL) ? &task->requested_policy : &thread->requested_policy);
2790         return raw[1];
2791 #endif
2792 }
2793
2794 static uintptr_t
2795 teffective_0(task_t task, thread_t thread)
2796 {
2797         assert(task);
2798         _Static_assert(sizeof(struct task_effective_policy) == sizeof(uint64_t), "size invariant violated");
2799         _Static_assert(sizeof(task->effective_policy) == sizeof(thread->effective_policy), "size invariant violated");
2800
2801         uintptr_t* raw = (uintptr_t*)((thread == THREAD_NULL) ? &task->effective_policy : &thread->effective_policy);
2802         return raw[0];
2803 }
2804
2805 static uintptr_t
2806 teffective_1(task_t task, thread_t thread)
2807 {
2808         assert(task);
2809         _Static_assert(sizeof(struct task_effective_policy) == sizeof(uint64_t), "size invariant violated");
2810         _Static_assert(sizeof(task->effective_policy) == sizeof(thread->effective_policy), "size invariant violated");
2811
2812 #if defined __LP64__
2813         return (thread == NULL) ? 0 : *(uintptr_t*)&thread->effective_policy;
2814 #else
2815         uintptr_t* raw = (uintptr_t*)((thread == THREAD_NULL) ? &task->effective_policy : &thread->effective_policy);
2816         return raw[1];
2817 #endif
2818 }
2819
2820 /* dump pending for tracepoint */
2821 static uint32_t tpending(task_pend_token_t pend_token) { return *(uint32_t*)(void*)(pend_token); }
2822
2823 uint64_t
2824 task_requested_bitfield(task_t task, thread_t thread)
2825 {
2826         uint64_t bits = 0;
2827         struct task_requested_policy requested =
2828                 (thread == THREAD_NULL) ? task->requested_policy : thread->requested_policy;
2829
2830         bits |= (requested.int_darwinbg         ? POLICY_REQ_INT_DARWIN_BG  : 0);
2831         bits |= (requested.ext_darwinbg         ? POLICY_REQ_EXT_DARWIN_BG  : 0);
2832         bits |= (requested.int_iotier           ? (((uint64_t)requested.int_iotier) << POLICY_REQ_INT_IO_TIER_SHIFT) : 0);
2833         bits |= (requested.ext_iotier           ? (((uint64_t)requested.ext_iotier) << POLICY_REQ_EXT_IO_TIER_SHIFT) : 0);
2834         bits |= (requested.int_iopassive        ? POLICY_REQ_INT_PASSIVE_IO : 0);
2835         bits |= (requested.ext_iopassive        ? POLICY_REQ_EXT_PASSIVE_IO : 0);
2836         bits |= (requested.bg_iotier            ? (((uint64_t)requested.bg_iotier)  << POLICY_REQ_BG_IOTIER_SHIFT)   : 0);
2837         bits |= (requested.terminated           ? POLICY_REQ_TERMINATED     : 0);
2838
2839         bits |= (requested.th_pidbind_bg        ? POLICY_REQ_PIDBIND_BG     : 0);
2840         bits |= (requested.th_workq_bg          ? POLICY_REQ_WORKQ_BG       : 0);
2841
2842         if (thread != THREAD_NULL) {
2843                 bits |= (requested.thrp_qos     ? (((uint64_t)requested.thrp_qos)   << POLICY_REQ_TH_QOS_SHIFT)  : 0);
2844                 bits |= (requested.thrp_qos_override     ? (((uint64_t)requested.thrp_qos_override)   << POLICY_REQ_TH_QOS_OVER_SHIFT)  : 0);
2845         }
2846
2847         bits |= (requested.t_boosted            ? POLICY_REQ_BOOSTED        : 0);
2848         bits |= (requested.t_tal_enabled        ? POLICY_REQ_TAL_ENABLED    : 0);
2849         bits |= (requested.t_apptype            ? (((uint64_t)requested.t_apptype)    << POLICY_REQ_APPTYPE_SHIFT)  : 0);
2850         bits |= (requested.t_role               ? (((uint64_t)requested.t_role)       << POLICY_REQ_ROLE_SHIFT)     : 0);
2851
2852         bits |= (requested.t_sup_active         ? POLICY_REQ_SUP_ACTIVE         : 0);
2853         bits |= (requested.t_sup_lowpri_cpu     ? POLICY_REQ_SUP_LOWPRI_CPU     : 0);
2854         bits |= (requested.t_sup_cpu            ? POLICY_REQ_SUP_CPU            : 0);
2855         bits |= (requested.t_sup_timer          ? (((uint64_t)requested.t_sup_timer)  << POLICY_REQ_SUP_TIMER_THROTTLE_SHIFT) : 0);
2856         bits |= (requested.t_sup_throughput     ? (((uint64_t)requested.t_sup_throughput)   << POLICY_REQ_SUP_THROUGHPUT_SHIFT)   : 0);
2857         bits |= (requested.t_sup_disk           ? POLICY_REQ_SUP_DISK_THROTTLE  : 0);
2858         bits |= (requested.t_sup_cpu_limit      ? POLICY_REQ_SUP_CPU_LIMIT      : 0);
2859         bits |= (requested.t_sup_suspend        ? POLICY_REQ_SUP_SUSPEND        : 0);
2860         bits |= (requested.t_sup_bg_sockets     ? POLICY_REQ_SUP_BG_SOCKETS     : 0);
2861         bits |= (requested.t_base_latency_qos   ? (((uint64_t)requested.t_base_latency_qos) << POLICY_REQ_BASE_LATENCY_QOS_SHIFT) : 0);
2862         bits |= (requested.t_over_latency_qos   ? (((uint64_t)requested.t_over_latency_qos) << POLICY_REQ_OVER_LATENCY_QOS_SHIFT) : 0);
2863         bits |= (requested.t_base_through_qos   ? (((uint64_t)requested.t_base_through_qos) << POLICY_REQ_BASE_THROUGH_QOS_SHIFT) : 0);
2864         bits |= (requested.t_over_through_qos   ? (((uint64_t)requested.t_over_through_qos) << POLICY_REQ_OVER_THROUGH_QOS_SHIFT) : 0);
2865         bits |= (requested.t_sfi_managed        ? POLICY_REQ_SFI_MANAGED        : 0);
2866         bits |= (requested.t_qos_clamp          ? (((uint64_t)requested.t_qos_clamp)        << POLICY_REQ_QOS_CLAMP_SHIFT)        : 0);
2867
2868         return bits;
2869 }
2870
2871 uint64_t
2872 task_effective_bitfield(task_t task, thread_t thread)
2873 {
2874         uint64_t bits = 0;
2875         struct task_effective_policy effective =
2876                 (thread == THREAD_NULL) ? task->effective_policy : thread->effective_policy;
2877
2878         bits |= (effective.io_tier              ? (((uint64_t)effective.io_tier) << POLICY_EFF_IO_TIER_SHIFT) : 0);
2879         bits |= (effective.io_passive           ? POLICY_EFF_IO_PASSIVE     : 0);
2880         bits |= (effective.darwinbg             ? POLICY_EFF_DARWIN_BG      : 0);
2881         bits |= (effective.lowpri_cpu           ? POLICY_EFF_LOWPRI_CPU     : 0);
2882         bits |= (effective.terminated           ? POLICY_EFF_TERMINATED     : 0);
2883         bits |= (effective.all_sockets_bg       ? POLICY_EFF_ALL_SOCKETS_BG : 0);
2884         bits |= (effective.new_sockets_bg       ? POLICY_EFF_NEW_SOCKETS_BG : 0);
2885         bits |= (effective.bg_iotier            ? (((uint64_t)effective.bg_iotier) << POLICY_EFF_BG_IOTIER_SHIFT) : 0);
2886         bits |= (effective.qos_ui_is_urgent     ? POLICY_EFF_QOS_UI_IS_URGENT : 0);
2887
2888         if (thread != THREAD_NULL)
2889                 bits |= (effective.thep_qos     ? (((uint64_t)effective.thep_qos)   << POLICY_EFF_TH_QOS_SHIFT)  : 0);
2890
2891         bits |= (effective.t_tal_engaged        ? POLICY_EFF_TAL_ENGAGED    : 0);
2892         bits |= (effective.t_suspended          ? POLICY_EFF_SUSPENDED      : 0);
2893         bits |= (effective.t_watchers_bg        ? POLICY_EFF_WATCHERS_BG    : 0);
2894         bits |= (effective.t_sup_active         ? POLICY_EFF_SUP_ACTIVE     : 0);
2895         bits |= (effective.t_suppressed_cpu     ? POLICY_EFF_SUP_CPU        : 0);
2896         bits |= (effective.t_role               ? (((uint64_t)effective.t_role)        << POLICY_EFF_ROLE_SHIFT)        : 0);
2897         bits |= (effective.t_latency_qos        ? (((uint64_t)effective.t_latency_qos) << POLICY_EFF_LATENCY_QOS_SHIFT) : 0);
2898         bits |= (effective.t_through_qos        ? (((uint64_t)effective.t_through_qos) << POLICY_EFF_THROUGH_QOS_SHIFT) : 0);
2899         bits |= (effective.t_sfi_managed        ? POLICY_EFF_SFI_MANAGED    : 0);
2900         bits |= (effective.t_qos_ceiling        ? (((uint64_t)effective.t_qos_ceiling) << POLICY_EFF_QOS_CEILING_SHIFT) : 0);
2901
2902         return bits;
2903 }
2904
2905
2906 /*
2907  * Resource usage and CPU related routines
2908  */
2909
2910 int
2911 proc_get_task_ruse_cpu(task_t task, uint32_t *policyp, uint8_t *percentagep, uint64_t *intervalp, uint64_t *deadlinep)
2912 {
2913
2914         int error = 0;
2915         int scope;
2916
2917         task_lock(task);
2918
2919
2920         error = task_get_cpuusage(task, percentagep, intervalp, deadlinep, &scope);
2921         task_unlock(task);
2922
2923         /*
2924          * Reverse-map from CPU resource limit scopes back to policies (see comment below).
2925          */
2926         if (scope == TASK_RUSECPU_FLAGS_PERTHR_LIMIT) {
2927                 *policyp = TASK_POLICY_RESOURCE_ATTRIBUTE_NOTIFY_EXC;
2928         } else if (scope == TASK_RUSECPU_FLAGS_PROC_LIMIT) {
2929                 *policyp = TASK_POLICY_RESOURCE_ATTRIBUTE_THROTTLE;
2930         } else if (scope == TASK_RUSECPU_FLAGS_DEADLINE) {
2931                 *policyp = TASK_POLICY_RESOURCE_ATTRIBUTE_NONE;
2932         }
2933
2934         return(error);
2935 }
2936
2937 /*
2938  * Configure the default CPU usage monitor parameters.
2939  *
2940  * For tasks which have this mechanism activated: if any thread in the
2941  * process consumes more CPU than this, an EXC_RESOURCE exception will be generated.
2942  */
2943 void
2944 proc_init_cpumon_params(void)
2945 {
2946         if (!PE_parse_boot_argn("max_cpumon_percentage", &proc_max_cpumon_percentage,
2947                 sizeof (proc_max_cpumon_percentage))) {
2948                 proc_max_cpumon_percentage = DEFAULT_CPUMON_PERCENTAGE;
2949         }
2950
2951         if (proc_max_cpumon_percentage > 100) {
2952                 proc_max_cpumon_percentage = 100;
2953         }
2954
2955         /* The interval should be specified in seconds. */
2956         if (!PE_parse_boot_argn("max_cpumon_interval", &proc_max_cpumon_interval,
2957                 sizeof (proc_max_cpumon_interval))) {
2958                 proc_max_cpumon_interval = DEFAULT_CPUMON_INTERVAL;
2959         }
2960
2961         proc_max_cpumon_interval *= NSEC_PER_SEC;
2962
2963         /* TEMPORARY boot arg to control App suppression */
2964         PE_parse_boot_argn("task_policy_suppression_disable",
2965                            &task_policy_suppression_disable,
2966                            sizeof(task_policy_suppression_disable));
2967 }
2968
2969 /*
2970  * Currently supported configurations for CPU limits.
2971  *
2972  * Policy                               | Deadline-based CPU limit | Percentage-based CPU limit
2973  * -------------------------------------+--------------------------+------------------------------
2974  * PROC_POLICY_RSRCACT_THROTTLE         | ENOTSUP                  | Task-wide scope only
2975  * PROC_POLICY_RSRCACT_SUSPEND          | Task-wide scope only     | ENOTSUP
2976  * PROC_POLICY_RSRCACT_TERMINATE        | Task-wide scope only     | ENOTSUP
2977  * PROC_POLICY_RSRCACT_NOTIFY_KQ        | Task-wide scope only     | ENOTSUP
2978  * PROC_POLICY_RSRCACT_NOTIFY_EXC       | ENOTSUP                  | Per-thread scope only
2979  *
2980  * A deadline-based CPU limit is actually a simple wallclock timer - the requested action is performed
2981  * after the specified amount of wallclock time has elapsed.
2982  *
2983  * A percentage-based CPU limit performs the requested action after the specified amount of actual CPU time
2984  * has been consumed -- regardless of how much wallclock time has elapsed -- by either the task as an
2985  * aggregate entity (so-called "Task-wide" or "Proc-wide" scope, whereby the CPU time consumed by all threads
2986  * in the task are added together), or by any one thread in the task (so-called "per-thread" scope).
2987  *
2988  * We support either deadline != 0 OR percentage != 0, but not both. The original intention in having them
2989  * share an API was to use actual CPU time as the basis of the deadline-based limit (as in: perform an action
2990  * after I have used some amount of CPU time; this is different than the recurring percentage/interval model)
2991  * but the potential consumer of the API at the time was insisting on wallclock time instead.
2992  *
2993  * Currently, requesting notification via an exception is the only way to get per-thread scope for a
2994  * CPU limit. All other types of notifications force task-wide scope for the limit.
2995  */
2996 int
2997 proc_set_task_ruse_cpu(task_t task, uint32_t policy, uint8_t percentage, uint64_t interval, uint64_t deadline,
2998         int cpumon_entitled)
2999 {
3000         int error = 0;
3001         int scope;
3002
3003         /*
3004          * Enforce the matrix of supported configurations for policy, percentage, and deadline.
3005          */
3006         switch (policy) {
3007         // If no policy is explicitly given, the default is to throttle.
3008         case TASK_POLICY_RESOURCE_ATTRIBUTE_NONE:
3009         case TASK_POLICY_RESOURCE_ATTRIBUTE_THROTTLE:
3010                 if (deadline != 0)
3011                         return (ENOTSUP);
3012                 scope = TASK_RUSECPU_FLAGS_PROC_LIMIT;
3013                 break;
3014         case TASK_POLICY_RESOURCE_ATTRIBUTE_SUSPEND:
3015         case TASK_POLICY_RESOURCE_ATTRIBUTE_TERMINATE:
3016         case TASK_POLICY_RESOURCE_ATTRIBUTE_NOTIFY_KQ:
3017                 if (percentage != 0)
3018                         return (ENOTSUP);
3019                 scope = TASK_RUSECPU_FLAGS_DEADLINE;
3020                 break;
3021         case TASK_POLICY_RESOURCE_ATTRIBUTE_NOTIFY_EXC:
3022                 if (deadline != 0)
3023                         return (ENOTSUP);
3024                 scope = TASK_RUSECPU_FLAGS_PERTHR_LIMIT;
3025 #ifdef CONFIG_NOMONITORS
3026                 return (error);
3027 #endif /* CONFIG_NOMONITORS */
3028                 break;
3029         default:
3030                 return (EINVAL);
3031         }
3032
3033         task_lock(task);
3034         if (task != current_task()) {
3035                 task->policy_ru_cpu_ext = policy;
3036         } else {
3037                 task->policy_ru_cpu = policy;
3038         }
3039         error = task_set_cpuusage(task, percentage, interval, deadline, scope, cpumon_entitled);
3040         task_unlock(task);
3041         return(error);
3042 }
3043
3044 int
3045 proc_clear_task_ruse_cpu(task_t task, int cpumon_entitled)
3046 {
3047         int error = 0;
3048         int action;
3049         void * bsdinfo = NULL;
3050
3051         task_lock(task);
3052         if (task != current_task()) {
3053                 task->policy_ru_cpu_ext = TASK_POLICY_RESOURCE_ATTRIBUTE_DEFAULT;
3054         } else {
3055                 task->policy_ru_cpu = TASK_POLICY_RESOURCE_ATTRIBUTE_DEFAULT;
3056         }
3057
3058         error = task_clear_cpuusage_locked(task, cpumon_entitled);
3059         if (error != 0)
3060                 goto out;
3061
3062         action = task->applied_ru_cpu;
3063         if (task->applied_ru_cpu_ext != TASK_POLICY_RESOURCE_ATTRIBUTE_NONE) {
3064                 /* reset action */
3065                 task->applied_ru_cpu_ext = TASK_POLICY_RESOURCE_ATTRIBUTE_NONE;
3066         }
3067         if (action != TASK_POLICY_RESOURCE_ATTRIBUTE_NONE) {
3068                 bsdinfo = task->bsd_info;
3069                 task_unlock(task);
3070                 proc_restore_resource_actions(bsdinfo, TASK_POLICY_CPU_RESOURCE_USAGE, action);
3071                 goto out1;
3072         }
3073
3074 out:
3075         task_unlock(task);
3076 out1:
3077         return(error);
3078
3079 }
3080
3081 /* used to apply resource limit related actions */
3082 static int
3083 task_apply_resource_actions(task_t task, int type)
3084 {
3085         int action = TASK_POLICY_RESOURCE_ATTRIBUTE_NONE;
3086         void * bsdinfo = NULL;
3087
3088         switch (type) {
3089                 case TASK_POLICY_CPU_RESOURCE_USAGE:
3090                         break;
3091                 case TASK_POLICY_WIREDMEM_RESOURCE_USAGE:
3092                 case TASK_POLICY_VIRTUALMEM_RESOURCE_USAGE:
3093                 case TASK_POLICY_DISK_RESOURCE_USAGE:
3094                 case TASK_POLICY_NETWORK_RESOURCE_USAGE:
3095                 case TASK_POLICY_POWER_RESOURCE_USAGE:
3096                         return(0);
3097
3098                 default:
3099                         return(1);
3100         };
3101
3102         /* only cpu actions for now */
3103         task_lock(task);
3104
3105         if (task->applied_ru_cpu_ext == TASK_POLICY_RESOURCE_ATTRIBUTE_NONE) {
3106                 /* apply action */
3107                 task->applied_ru_cpu_ext = task->policy_ru_cpu_ext;
3108                 action = task->applied_ru_cpu_ext;
3109         } else {
3110                 action = task->applied_ru_cpu_ext;
3111         }
3112
3113         if (action != TASK_POLICY_RESOURCE_ATTRIBUTE_NONE) {
3114                 bsdinfo = task->bsd_info;
3115                 task_unlock(task);
3116                 proc_apply_resource_actions(bsdinfo, TASK_POLICY_CPU_RESOURCE_USAGE, action);
3117         } else
3118                 task_unlock(task);
3119
3120         return(0);
3121 }
3122
3123 /*
3124  * XXX This API is somewhat broken; we support multiple simultaneous CPU limits, but the get/set API
3125  * only allows for one at a time. This means that if there is a per-thread limit active, the other
3126  * "scopes" will not be accessible via this API. We could change it to pass in the scope of interest
3127  * to the caller, and prefer that, but there's no need for that at the moment.
3128  */
3129 int
3130 task_get_cpuusage(task_t task, uint8_t *percentagep, uint64_t *intervalp, uint64_t *deadlinep, int *scope)
3131 {
3132         *percentagep = 0;
3133         *intervalp = 0;
3134         *deadlinep = 0;
3135
3136         if ((task->rusage_cpu_flags & TASK_RUSECPU_FLAGS_PERTHR_LIMIT) != 0) {
3137                 *scope = TASK_RUSECPU_FLAGS_PERTHR_LIMIT;
3138                 *percentagep = task->rusage_cpu_perthr_percentage;
3139                 *intervalp = task->rusage_cpu_perthr_interval;
3140         } else if ((task->rusage_cpu_flags & TASK_RUSECPU_FLAGS_PROC_LIMIT) != 0) {
3141                 *scope = TASK_RUSECPU_FLAGS_PROC_LIMIT;
3142                 *percentagep = task->rusage_cpu_percentage;
3143                 *intervalp = task->rusage_cpu_interval;
3144         } else if ((task->rusage_cpu_flags & TASK_RUSECPU_FLAGS_DEADLINE) != 0) {
3145                 *scope = TASK_RUSECPU_FLAGS_DEADLINE;
3146                 *deadlinep = task->rusage_cpu_deadline;
3147         } else {
3148                 *scope = 0;
3149         }
3150
3151         return(0);
3152 }
3153
3154 /*
3155  * Disable the CPU usage monitor for the task. Return value indicates
3156  * if the mechanism was actually enabled.
3157  */
3158 int
3159 task_disable_cpumon(task_t task) {
3160         thread_t thread;
3161
3162         task_lock_assert_owned(task);
3163
3164         if ((task->rusage_cpu_flags & TASK_RUSECPU_FLAGS_PERTHR_LIMIT) == 0) {
3165                 return (KERN_INVALID_ARGUMENT);
3166         }
3167
3168 #if CONFIG_TELEMETRY
3169         /*
3170          * Disable task-wide telemetry if it was ever enabled by the CPU usage
3171          * monitor's warning zone.
3172          */
3173         telemetry_task_ctl_locked(task, TF_CPUMON_WARNING, 0);
3174 #endif
3175
3176         /*
3177          * Disable the monitor for the task, and propagate that change to each thread.
3178          */
3179         task->rusage_cpu_flags &= ~(TASK_RUSECPU_FLAGS_PERTHR_LIMIT | TASK_RUSECPU_FLAGS_FATAL_CPUMON);
3180         queue_iterate(&task->threads, thread, thread_t, task_threads) {
3181                 set_astledger(thread);
3182         }
3183         task->rusage_cpu_perthr_percentage = 0;
3184         task->rusage_cpu_perthr_interval = 0;
3185
3186         return (KERN_SUCCESS);
3187 }
3188
3189 int
3190 task_set_cpuusage(task_t task, uint8_t percentage, uint64_t interval, uint64_t deadline, int scope, int cpumon_entitled)
3191 {
3192         thread_t thread;
3193         uint64_t abstime = 0;
3194         uint64_t limittime = 0;
3195
3196         lck_mtx_assert(&task->lock, LCK_MTX_ASSERT_OWNED);
3197
3198         /* By default, refill once per second */
3199         if (interval == 0)
3200                 interval = NSEC_PER_SEC;
3201
3202         if (percentage != 0) {
3203                 if (scope == TASK_RUSECPU_FLAGS_PERTHR_LIMIT) {
3204                         boolean_t warn = FALSE;
3205
3206                         /*
3207                          * A per-thread CPU limit on a task generates an exception
3208                          * (LEDGER_ACTION_EXCEPTION) if any one thread in the task
3209                          * exceeds the limit.
3210                          */
3211
3212                         if (percentage == TASK_POLICY_CPUMON_DISABLE) {
3213                                 if (cpumon_entitled) {
3214                                         task_disable_cpumon(task);
3215                                         return (0);
3216                                 }
3217
3218                                 /*
3219                                  * This task wishes to disable the CPU usage monitor, but it's
3220                                  * missing the required entitlement:
3221                                  *     com.apple.private.kernel.override-cpumon
3222                                  *
3223                                  * Instead, treat this as a request to reset its params
3224                                  * back to the defaults.
3225                                  */
3226                                 warn = TRUE;
3227                                 percentage = TASK_POLICY_CPUMON_DEFAULTS;
3228                         }
3229
3230                         if (percentage == TASK_POLICY_CPUMON_DEFAULTS) {
3231                                 percentage = proc_max_cpumon_percentage;
3232                                 interval   = proc_max_cpumon_interval;
3233                         }
3234
3235                         if (percentage > 100) {
3236                                 percentage = 100;
3237                         }
3238
3239                         /*
3240                          * Passing in an interval of -1 means either:
3241                          * - Leave the interval as-is, if there's already a per-thread
3242                          *   limit configured
3243                          * - Use the system default.
3244                          */
3245                         if (interval == -1ULL) {
3246                                 if (task->rusage_cpu_flags & TASK_RUSECPU_FLAGS_PERTHR_LIMIT) {
3247                                         interval = task->rusage_cpu_perthr_interval;
3248                                 } else {
3249                                         interval = proc_max_cpumon_interval;
3250                                 }
3251                         }
3252
3253                         /*
3254                          * Enforce global caps on CPU usage monitor here if the process is not
3255                          * entitled to escape the global caps.
3256                          */
3257                          if ((percentage > proc_max_cpumon_percentage) && (cpumon_entitled == 0)) {
3258                                 warn = TRUE;
3259                                 percentage = proc_max_cpumon_percentage;
3260                          }
3261
3262                          if ((interval > proc_max_cpumon_interval) && (cpumon_entitled == 0)) {
3263                                 warn = TRUE;
3264                                 interval = proc_max_cpumon_interval;
3265                          }
3266
3267                         if (warn) {
3268                                 int       pid = 0;
3269                                 char      *procname = (char *)"unknown";
3270
3271 #ifdef MACH_BSD
3272                                 pid = proc_selfpid();
3273                                 if (current_task()->bsd_info != NULL) {
3274                                         procname = proc_name_address(current_task()->bsd_info);
3275                                 }
3276 #endif
3277
3278                                 printf("process %s[%d] denied attempt to escape CPU monitor"
3279                                         " (missing required entitlement).\n", procname, pid);
3280                         }
3281
3282                         task->rusage_cpu_flags |= TASK_RUSECPU_FLAGS_PERTHR_LIMIT;
3283                         task->rusage_cpu_perthr_percentage = percentage;
3284                         task->rusage_cpu_perthr_interval = interval;
3285                         queue_iterate(&task->threads, thread, thread_t, task_threads) {
3286                                 set_astledger(thread);
3287                         }
3288                 } else if (scope == TASK_RUSECPU_FLAGS_PROC_LIMIT) {
3289                         /*
3290                          * Currently, a proc-wide CPU limit always blocks if the limit is
3291                          * exceeded (LEDGER_ACTION_BLOCK).
3292                          */
3293                         task->rusage_cpu_flags |= TASK_RUSECPU_FLAGS_PROC_LIMIT;
3294                         task->rusage_cpu_percentage = percentage;
3295                         task->rusage_cpu_interval = interval;
3296
3297                         limittime = (interval * percentage) / 100;
3298                         nanoseconds_to_absolutetime(limittime, &abstime);
3299
3300                         ledger_set_limit(task->ledger, task_ledgers.cpu_time, abstime, 0);
3301                         ledger_set_period(task->ledger, task_ledgers.cpu_time, interval);
3302                         ledger_set_action(task->ledger, task_ledgers.cpu_time, LEDGER_ACTION_BLOCK);
3303                 }
3304         }
3305
3306         if (deadline != 0) {
3307                 assert(scope == TASK_RUSECPU_FLAGS_DEADLINE);
3308
3309                 /* if already in use, cancel and wait for it to cleanout */
3310                 if (task->rusage_cpu_callt != NULL) {
3311                         task_unlock(task);
3312                         thread_call_cancel_wait(task->rusage_cpu_callt);
3313                         task_lock(task);
3314                 }
3315                 if (task->rusage_cpu_callt == NULL) {
3316                         task->rusage_cpu_callt = thread_call_allocate_with_priority(task_action_cpuusage, (thread_call_param_t)task, THREAD_CALL_PRIORITY_KERNEL);
3317                 }
3318                 /* setup callout */
3319                 if (task->rusage_cpu_callt != 0) {
3320                         uint64_t save_abstime = 0;
3321
3322                         task->rusage_cpu_flags |= TASK_RUSECPU_FLAGS_DEADLINE;
3323                         task->rusage_cpu_deadline = deadline;
3324
3325                         nanoseconds_to_absolutetime(deadline, &abstime);
3326                         save_abstime = abstime;
3327                         clock_absolutetime_interval_to_deadline(save_abstime, &abstime);
3328                         thread_call_enter_delayed(task->rusage_cpu_callt, abstime);
3329                 }
3330         }
3331
3332         return(0);
3333 }
3334
3335 int
3336 task_clear_cpuusage(task_t task, int cpumon_entitled)
3337 {
3338         int retval = 0;
3339
3340         task_lock(task);
3341         retval = task_clear_cpuusage_locked(task, cpumon_entitled);
3342         task_unlock(task);
3343
3344         return(retval);
3345 }
3346
3347 int
3348 task_clear_cpuusage_locked(task_t task, int cpumon_entitled)
3349 {
3350         thread_call_t savecallt;
3351
3352         /* cancel percentage handling if set */
3353         if (task->rusage_cpu_flags & TASK_RUSECPU_FLAGS_PROC_LIMIT) {
3354                 task->rusage_cpu_flags &= ~TASK_RUSECPU_FLAGS_PROC_LIMIT;
3355                 ledger_set_limit(task->ledger, task_ledgers.cpu_time, LEDGER_LIMIT_INFINITY, 0);
3356                 task->rusage_cpu_percentage = 0;
3357                 task->rusage_cpu_interval = 0;
3358         }
3359
3360         /*
3361          * Disable the CPU usage monitor.
3362          */
3363         if (cpumon_entitled) {
3364                 task_disable_cpumon(task);
3365         }
3366
3367         /* cancel deadline handling if set */
3368         if (task->rusage_cpu_flags & TASK_RUSECPU_FLAGS_DEADLINE) {
3369                 task->rusage_cpu_flags &= ~TASK_RUSECPU_FLAGS_DEADLINE;
3370                 if (task->rusage_cpu_callt != 0) {
3371                         savecallt = task->rusage_cpu_callt;
3372                         task->rusage_cpu_callt = NULL;
3373                         task->rusage_cpu_deadline = 0;
3374                         task_unlock(task);
3375                         thread_call_cancel_wait(savecallt);
3376                         thread_call_free(savecallt);
3377                         task_lock(task);
3378                 }
3379         }
3380         return(0);
3381 }
3382
3383 /* called by ledger unit to enforce action due to  resource usage criteria being met */
3384 void
3385 task_action_cpuusage(thread_call_param_t param0, __unused thread_call_param_t param1)
3386 {
3387         task_t task = (task_t)param0;
3388         (void)task_apply_resource_actions(task, TASK_POLICY_CPU_RESOURCE_USAGE);
3389         return;
3390 }
3391
3392
3393 /*
3394  * Routines for taskwatch and pidbind
3395  */
3396
3397
3398 /*
3399  * Routines for importance donation/inheritance/boosting
3400  */
3401
3402 static void
3403 task_importance_update_live_donor(task_t target_task)
3404 {
3405 #if IMPORTANCE_INHERITANCE
3406
3407         ipc_importance_task_t task_imp;
3408
3409         task_imp = ipc_importance_for_task(target_task, FALSE);
3410         if (IIT_NULL != task_imp) {
3411                 ipc_importance_task_update_live_donor(task_imp);
3412                 ipc_importance_task_release(task_imp);
3413         }
3414 #endif /* IMPORTANCE_INHERITANCE */
3415 }
3416
3417 void
3418 task_importance_mark_donor(task_t task, boolean_t donating)
3419 {
3420 #if IMPORTANCE_INHERITANCE
3421         ipc_importance_task_t task_imp;
3422
3423         task_imp = ipc_importance_for_task(task, FALSE);
3424         if (IIT_NULL != task_imp) {
3425                 ipc_importance_task_mark_donor(task_imp, donating);
3426                 ipc_importance_task_release(task_imp);
3427         }
3428 #endif /* IMPORTANCE_INHERITANCE */
3429 }
3430
3431 void
3432 task_importance_mark_live_donor(task_t task, boolean_t live_donating)
3433 {
3434 #if IMPORTANCE_INHERITANCE
3435         ipc_importance_task_t task_imp;
3436
3437         task_imp = ipc_importance_for_task(task, FALSE);
3438         if (IIT_NULL != task_imp) {
3439                 ipc_importance_task_mark_live_donor(task_imp, live_donating);
3440                 ipc_importance_task_release(task_imp);
3441         }
3442 #endif /* IMPORTANCE_INHERITANCE */
3443 }
3444
3445 void
3446 task_importance_mark_receiver(task_t task, boolean_t receiving)
3447 {
3448 #if IMPORTANCE_INHERITANCE
3449         ipc_importance_task_t task_imp;
3450
3451         task_imp = ipc_importance_for_task(task, FALSE);
3452         if (IIT_NULL != task_imp) {
3453                 ipc_importance_task_mark_receiver(task_imp, receiving);
3454                 ipc_importance_task_release(task_imp);
3455         }
3456 #endif /* IMPORTANCE_INHERITANCE */
3457 }
3458
3459 void
3460 task_importance_mark_denap_receiver(task_t task, boolean_t denap)
3461 {
3462 #if IMPORTANCE_INHERITANCE
3463         ipc_importance_task_t task_imp;
3464
3465         task_imp = ipc_importance_for_task(task, FALSE);
3466         if (IIT_NULL != task_imp) {
3467                 ipc_importance_task_mark_denap_receiver(task_imp, denap);
3468                 ipc_importance_task_release(task_imp);
3469         }
3470 #endif /* IMPORTANCE_INHERITANCE */
3471 }
3472
3473 void
3474 task_importance_reset(__imp_only task_t task)
3475 {
3476 #if IMPORTANCE_INHERITANCE
3477         ipc_importance_task_t task_imp;
3478
3479         /* TODO: Lower importance downstream before disconnect */
3480         task_imp = task->task_imp_base;
3481         ipc_importance_reset(task_imp, FALSE);
3482         task_importance_update_live_donor(task);
3483 #endif /* IMPORTANCE_INHERITANCE */
3484 }
3485
3486 #if IMPORTANCE_INHERITANCE
3487
3488 /*
3489  * Sets the task boost bit to the provided value.  Does NOT run the update function.
3490  *
3491  * Task lock must be held.
3492  */
3493 void
3494 task_set_boost_locked(task_t task, boolean_t boost_active)
3495 {
3496 #if IMPORTANCE_DEBUG
3497         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (IMPORTANCE_CODE(IMP_BOOST, (boost_active ? IMP_BOOSTED : IMP_UNBOOSTED)) | DBG_FUNC_START),
3498                                   proc_selfpid(), audit_token_pid_from_task(task), trequested_0(task, THREAD_NULL), trequested_1(task, THREAD_NULL), 0);
3499 #endif
3500
3501         task->requested_policy.t_boosted = boost_active;
3502
3503 #if IMPORTANCE_DEBUG
3504         if (boost_active == TRUE){
3505                 DTRACE_BOOST2(boost, task_t, task, int, audit_token_pid_from_task(task));
3506         } else {
3507                 DTRACE_BOOST2(unboost, task_t, task, int, audit_token_pid_from_task(task));
3508         }
3509         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (IMPORTANCE_CODE(IMP_BOOST, (boost_active ? IMP_BOOSTED : IMP_UNBOOSTED)) | DBG_FUNC_END),
3510                                   proc_selfpid(), audit_token_pid_from_task(task),
3511                                   trequested_0(task, THREAD_NULL), trequested_1(task, THREAD_NULL), 0);
3512 #endif
3513 }
3514
3515 /*
3516  * Sets the task boost bit to the provided value and applies the update.
3517  *
3518  * Task lock must be held.  Must call update complete after unlocking the task.
3519  */
3520 void
3521 task_update_boost_locked(task_t task, boolean_t boost_active, task_pend_token_t pend_token)
3522 {
3523         task_set_boost_locked(task, boost_active);
3524
3525         task_policy_update_locked(task, THREAD_NULL, pend_token);
3526 }
3527
3528 /*
3529  * Check if this task should donate importance.
3530  *
3531  * May be called without taking the task lock. In that case, donor status can change
3532  * so you must check only once for each donation event.
3533  */
3534 boolean_t
3535 task_is_importance_donor(task_t task)
3536 {
3537         if (task->task_imp_base == IIT_NULL)
3538                 return FALSE;
3539         return ipc_importance_task_is_donor(task->task_imp_base);
3540 }
3541
3542 /*
3543  * Query the status of the task's donor mark.
3544  */
3545 boolean_t
3546 task_is_marked_importance_donor(task_t task)
3547 {
3548         if (task->task_imp_base == IIT_NULL)
3549                 return FALSE;
3550         return ipc_importance_task_is_marked_donor(task->task_imp_base);
3551 }
3552
3553 /*
3554  * Query the status of the task's live donor and donor mark.
3555  */
3556 boolean_t
3557 task_is_marked_live_importance_donor(task_t task)
3558 {
3559         if (task->task_imp_base == IIT_NULL)
3560                 return FALSE;
3561         return ipc_importance_task_is_marked_live_donor(task->task_imp_base);
3562 }
3563
3564
3565 /*
3566  * This routine may be called without holding task lock
3567  * since the value of imp_receiver can never be unset.
3568  */
3569 boolean_t
3570 task_is_importance_receiver(task_t task)
3571 {
3572         if (task->task_imp_base == IIT_NULL)
3573                 return FALSE;
3574         return ipc_importance_task_is_marked_receiver(task->task_imp_base);
3575 }
3576
3577 /*
3578  * Query the task's receiver mark.
3579  */
3580 boolean_t
3581 task_is_marked_importance_receiver(task_t task)
3582 {
3583         if (task->task_imp_base == IIT_NULL)
3584                 return FALSE;
3585         return ipc_importance_task_is_marked_receiver(task->task_imp_base);
3586 }
3587
3588 /*
3589  * This routine may be called without holding task lock
3590  * since the value of de-nap receiver can never be unset.
3591  */
3592 boolean_t
3593 task_is_importance_denap_receiver(task_t task)
3594 {
3595         if (task->task_imp_base == IIT_NULL)
3596                 return FALSE;
3597         return ipc_importance_task_is_denap_receiver(task->task_imp_base);
3598 }
3599
3600 /*
3601  * Query the task's de-nap receiver mark.
3602  */
3603 boolean_t
3604 task_is_marked_importance_denap_receiver(task_t task)
3605 {
3606         if (task->task_imp_base == IIT_NULL)
3607                 return FALSE;
3608         return ipc_importance_task_is_marked_denap_receiver(task->task_imp_base);
3609 }
3610
3611 /*
3612  * This routine may be called without holding task lock
3613  * since the value of imp_receiver can never be unset.
3614  */
3615 boolean_t
3616 task_is_importance_receiver_type(task_t task)
3617 {
3618         if (task->task_imp_base == IIT_NULL)
3619                 return FALSE;
3620         return (task_is_importance_receiver(task) ||
3621                 task_is_importance_denap_receiver(task));
3622 }
3623
/*
 * External importance assertions are managed by the process in userspace.
 * Internal importance assertions are the responsibility of the kernel.
 * Assertions are changed from internal to external via
 * task_importance_externalize_assertion.
 */
3629
3630 int
3631 task_importance_hold_watchport_assertion(task_t target_task, uint32_t count)
3632 {
3633         ipc_importance_task_t task_imp;
3634         kern_return_t ret;
3635
3636         /* must already have set up an importance */
3637         task_imp = target_task->task_imp_base;
3638         assert(IIT_NULL != task_imp);
3639
3640         ret = ipc_importance_task_hold_internal_assertion(task_imp, count);
3641         return (KERN_SUCCESS != ret) ? ENOTSUP : 0;
3642 }
3643
3644 int
3645 task_importance_hold_internal_assertion(task_t target_task, uint32_t count)
3646 {
3647         ipc_importance_task_t task_imp;
3648         kern_return_t ret;
3649
3650         /* may be first time, so allow for possible importance setup */
3651         task_imp = ipc_importance_for_task(target_task, FALSE);
3652         if (IIT_NULL == task_imp) {
3653                 return EOVERFLOW;
3654         }
3655         ret = ipc_importance_task_hold_internal_assertion(task_imp, count);
3656         ipc_importance_task_release(task_imp);
3657
3658         return (KERN_SUCCESS != ret) ? ENOTSUP : 0;
3659 }
3660
3661 int
3662 task_importance_hold_file_lock_assertion(task_t target_task, uint32_t count)
3663 {
3664         ipc_importance_task_t task_imp;
3665         kern_return_t ret;
3666
3667         /* may be first time, so allow for possible importance setup */
3668         task_imp = ipc_importance_for_task(target_task, FALSE);
3669         if (IIT_NULL == task_imp) {
3670                 return EOVERFLOW;
3671         }
3672         ret = ipc_importance_task_hold_file_lock_assertion(task_imp, count);
3673         ipc_importance_task_release(task_imp);
3674
3675         return (KERN_SUCCESS != ret) ? ENOTSUP : 0;
3676 }
3677
3678 int
3679 task_importance_hold_legacy_external_assertion(task_t target_task, uint32_t count)
3680 {
3681         ipc_importance_task_t task_imp;
3682         kern_return_t ret;
3683
3684         /* must already have set up an importance */
3685         task_imp = target_task->task_imp_base;
3686         if (IIT_NULL == task_imp) {
3687                 return EOVERFLOW;
3688         }
3689         ret = ipc_importance_task_hold_legacy_external_assertion(task_imp, count);
3690         return (KERN_SUCCESS != ret) ? ENOTSUP : 0;
3691 }
3692
3693 int
3694 task_importance_drop_internal_assertion(task_t target_task, uint32_t count)
3695 {
3696         ipc_importance_task_t task_imp;
3697         kern_return_t ret;
3698
3699         /* must already have set up an importance */
3700         task_imp = target_task->task_imp_base;
3701         if (IIT_NULL == task_imp) {
3702                 return EOVERFLOW;
3703         }
3704         ret = ipc_importance_task_drop_internal_assertion(target_task->task_imp_base, count);
3705         return (KERN_SUCCESS != ret) ? ENOTSUP : 0;
3706 }
3707
3708 int
3709 task_importance_drop_file_lock_assertion(task_t target_task, uint32_t count)
3710 {
3711         ipc_importance_task_t task_imp;
3712         kern_return_t ret;
3713
3714         /* must already have set up an importance */
3715         task_imp = target_task->task_imp_base;
3716         if (IIT_NULL == task_imp) {
3717                 return EOVERFLOW;
3718         }
3719         ret = ipc_importance_task_drop_file_lock_assertion(target_task->task_imp_base, count);
3720         return (KERN_SUCCESS != ret) ? EOVERFLOW : 0;
3721 }
3722
3723 int
3724 task_importance_drop_legacy_external_assertion(task_t target_task, uint32_t count)
3725 {
3726         ipc_importance_task_t task_imp;
3727         kern_return_t ret;
3728
3729         /* must already have set up an importance */
3730         task_imp = target_task->task_imp_base;
3731         if (IIT_NULL == task_imp) {
3732                 return EOVERFLOW;
3733         }
3734         ret = ipc_importance_task_drop_legacy_external_assertion(task_imp, count);
3735         return (KERN_SUCCESS != ret) ? EOVERFLOW : 0;
3736 }
3737
3738 static void
3739 task_add_importance_watchport(task_t task, mach_port_t port, int *boostp)
3740 {
3741         int boost = 0;
3742
3743         __impdebug_only int released_pid = 0;
3744         __impdebug_only int pid = audit_token_pid_from_task(task);
3745
3746         ipc_importance_task_t release_imp_task = IIT_NULL;
3747
3748         if (IP_VALID(port) != 0) {
3749                 ipc_importance_task_t new_imp_task = ipc_importance_for_task(task, FALSE);
3750
3751                 ip_lock(port);
3752
3753                 /*
3754                  * The port must have been marked tempowner already.
3755                  * This also filters out ports whose receive rights
3756                  * are already enqueued in a message, as you can't
3757                  * change the right's destination once it's already
3758                  * on its way.
3759                  */
3760                 if (port->ip_tempowner != 0) {
3761                         assert(port->ip_impdonation != 0);
3762
3763                         boost = port->ip_impcount;
3764                         if (IIT_NULL != port->ip_imp_task) {
3765                                 /*
3766                                  * if this port is already bound to a task,
3767                                  * release the task reference and drop any
3768                                  * watchport-forwarded boosts
3769                                  */
3770                                 release_imp_task = port->ip_imp_task;
3771                                 port->ip_imp_task = IIT_NULL;
3772                         }
3773
3774                         /* mark the port is watching another task (reference held in port->ip_imp_task) */
3775                         if (ipc_importance_task_is_marked_receiver(new_imp_task)) {
3776                                 port->ip_imp_task = new_imp_task;
3777                                 new_imp_task = IIT_NULL;
3778                         }
3779                 }
3780                 ip_unlock(port);
3781
3782                 if (IIT_NULL != new_imp_task) {
3783                         ipc_importance_task_release(new_imp_task);
3784                 }
3785
3786                 if (IIT_NULL != release_imp_task) {
3787                         if (boost > 0)
3788                                 ipc_importance_task_drop_internal_assertion(release_imp_task, boost);
3789
3790                         // released_pid = audit_token_pid_from_task(release_imp_task); /* TODO: Need ref-safe way to get pid */
3791                         ipc_importance_task_release(release_imp_task);
3792                 }
3793 #if IMPORTANCE_DEBUG
3794                 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (IMPORTANCE_CODE(IMP_WATCHPORT, 0)) | DBG_FUNC_NONE,
3795                         proc_selfpid(), pid, boost, released_pid, 0);
3796 #endif /* IMPORTANCE_DEBUG */
3797         }
3798
3799         *boostp = boost;
3800         return;
3801 }
3802
3803 #endif /* IMPORTANCE_INHERITANCE */
3804
3805 /*
3806  * Routines for VM to query task importance
3807  */
3808
3809
/*
 * Weights added together when estimating a task's importance
 * for low memory notification and purging purgeable memory.
 */
3814 #define TASK_IMPORTANCE_FOREGROUND     4
3815 #define TASK_IMPORTANCE_NOTDARWINBG    1
3816
3817
3818 /*
3819  * Checks if the task is already notified.
3820  *
3821  * Condition: task lock should be held while calling this function.
3822  */
3823 boolean_t
3824 task_has_been_notified(task_t task, int pressurelevel)
3825 {
3826         if (task == NULL) {
3827                 return FALSE;
3828         }
3829
3830         if (pressurelevel == kVMPressureWarning)
3831                 return (task->low_mem_notified_warn ? TRUE : FALSE);
3832         else if (pressurelevel == kVMPressureCritical)
3833                 return (task->low_mem_notified_critical ? TRUE : FALSE);
3834         else
3835                 return TRUE;
3836 }
3837
3838
3839 /*
3840  * Checks if the task is used for purging.
3841  *
3842  * Condition: task lock should be held while calling this function.
3843  */
3844 boolean_t
3845 task_used_for_purging(task_t task, int pressurelevel)
3846 {
3847         if (task == NULL) {
3848                 return FALSE;
3849         }
3850
3851         if (pressurelevel == kVMPressureWarning)
3852                 return (task->purged_memory_warn ? TRUE : FALSE);
3853         else if (pressurelevel == kVMPressureCritical)
3854                 return (task->purged_memory_critical ? TRUE : FALSE);
3855         else
3856                 return TRUE;
3857 }
3858
3859
3860 /*
3861  * Mark the task as notified with memory notification.
3862  *
3863  * Condition: task lock should be held while calling this function.
3864  */
3865 void
3866 task_mark_has_been_notified(task_t task, int pressurelevel)
3867 {
3868         if (task == NULL) {
3869                 return;
3870         }
3871
3872         if (pressurelevel == kVMPressureWarning)
3873                 task->low_mem_notified_warn = 1;
3874         else if (pressurelevel == kVMPressureCritical)
3875                 task->low_mem_notified_critical = 1;
3876 }
3877
3878
3879 /*
3880  * Mark the task as purged.
3881  *
3882  * Condition: task lock should be held while calling this function.
3883  */
3884 void
3885 task_mark_used_for_purging(task_t task, int pressurelevel)
3886 {
3887         if (task == NULL) {
3888                 return;
3889         }
3890
3891         if (pressurelevel == kVMPressureWarning)
3892                 task->purged_memory_warn = 1;
3893         else if (pressurelevel == kVMPressureCritical)
3894                 task->purged_memory_critical = 1;
3895 }
3896
3897
3898 /*
3899  * Mark the task eligible for low memory notification.
3900  *
3901  * Condition: task lock should be held while calling this function.
3902  */
3903 void
3904 task_clear_has_been_notified(task_t task, int pressurelevel)
3905 {
3906         if (task == NULL) {
3907                 return;
3908         }
3909
3910         if (pressurelevel == kVMPressureWarning)
3911                 task->low_mem_notified_warn = 0;
3912         else if (pressurelevel == kVMPressureCritical)
3913                 task->low_mem_notified_critical = 0;
3914 }
3915
3916
3917 /*
3918  * Mark the task eligible for purging its purgeable memory.
3919  *
3920  * Condition: task lock should be held while calling this function.
3921  */
3922 void
3923 task_clear_used_for_purging(task_t task)
3924 {
3925         if (task == NULL) {
3926                 return;
3927         }
3928
3929         task->purged_memory_warn = 0;
3930         task->purged_memory_critical = 0;
3931 }
3932
3933
3934 /*
3935  * Estimate task importance for purging its purgeable memory
3936  * and low memory notification.
3937  *
3938  * Importance is calculated in the following order of criteria:
3939  * -Task role : Background vs Foreground
3940  * -Boost status: Not boosted vs Boosted
3941  * -Darwin BG status.
3942  *
3943  * Returns: Estimated task importance. Less important task will have lower
3944  *          estimated importance.
3945  */
3946 int
3947 task_importance_estimate(task_t task)
3948 {
3949         int task_importance = 0;
3950
3951         if (task == NULL) {
3952                 return 0;
3953         }
3954
3955         if (proc_get_effective_task_policy(task, TASK_POLICY_ROLE) == TASK_FOREGROUND_APPLICATION)
3956                         task_importance += TASK_IMPORTANCE_FOREGROUND;
3957
3958         if (proc_get_effective_task_policy(task, TASK_POLICY_DARWIN_BG) == 0)
3959                         task_importance += TASK_IMPORTANCE_NOTDARWINBG;
3960
3961         return task_importance;
3962 }
3963