apple/xnu.git (xnu-3789.41.3): osfmk/kern/thread_policy.c
1 /*
2 * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <mach/mach_types.h>
30 #include <mach/thread_act_server.h>
31
32 #include <kern/kern_types.h>
33 #include <kern/processor.h>
34 #include <kern/thread.h>
35 #include <kern/affinity.h>
36 #include <mach/task_policy.h>
37 #include <kern/sfi.h>
38 #include <kern/policy_internal.h>
39 #include <sys/errno.h>
40 #include <sys/ulock.h>
41
42 #include <mach/machine/sdt.h>
43
44 #ifdef MACH_BSD
45 extern int proc_selfpid(void);
46 extern char * proc_name_address(void *p);
47 extern void rethrottle_thread(void * uthread);
48 #endif /* MACH_BSD */
49
50 #define QOS_EXTRACT(q) ((q) & 0xff)
51
52 uint32_t qos_override_mode;
53 #define QOS_OVERRIDE_MODE_OVERHANG_PEAK 0
54 #define QOS_OVERRIDE_MODE_IGNORE_OVERRIDE 1
55 #define QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE 2
56 #define QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_IGNORE_DISPATCH 3
57 #define QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_SINGLE_MUTEX_OVERRIDE 4
58
59 extern zone_t thread_qos_override_zone;
60
61 static boolean_t
62 proc_thread_qos_remove_override_internal(thread_t thread, user_addr_t resource, int resource_type, boolean_t reset, boolean_t squash);
63
64 /*
65 * THREAD_QOS_UNSPECIFIED is assigned the highest tier available, so it does not provide a limit
66 * to threads that don't have a QoS class set.
67 */
68 const qos_policy_params_t thread_qos_policy_params = {
69 /*
70 * This table defines the starting base priority of the thread,
71 * which will be modified by the thread importance and the task max priority
72 * before being applied.
73 */
74 .qos_pri[THREAD_QOS_UNSPECIFIED] = 0, /* not consulted */
75 .qos_pri[THREAD_QOS_USER_INTERACTIVE] = BASEPRI_BACKGROUND, /* i.e. 46 */
76 .qos_pri[THREAD_QOS_USER_INITIATED] = BASEPRI_USER_INITIATED,
77 .qos_pri[THREAD_QOS_LEGACY] = BASEPRI_DEFAULT,
78 .qos_pri[THREAD_QOS_UTILITY] = BASEPRI_UTILITY,
79 .qos_pri[THREAD_QOS_BACKGROUND] = MAXPRI_THROTTLE,
80 .qos_pri[THREAD_QOS_MAINTENANCE] = MAXPRI_THROTTLE,
81
82 /*
83 * This table defines the highest IO priority that a thread marked with this
84 * QoS class can have.
85 */
86 .qos_iotier[THREAD_QOS_UNSPECIFIED] = THROTTLE_LEVEL_TIER0,
87 .qos_iotier[THREAD_QOS_USER_INTERACTIVE] = THROTTLE_LEVEL_TIER0,
88 .qos_iotier[THREAD_QOS_USER_INITIATED] = THROTTLE_LEVEL_TIER0,
89 .qos_iotier[THREAD_QOS_LEGACY] = THROTTLE_LEVEL_TIER0,
90 .qos_iotier[THREAD_QOS_UTILITY] = THROTTLE_LEVEL_TIER1,
91 .qos_iotier[THREAD_QOS_BACKGROUND] = THROTTLE_LEVEL_TIER2, /* possibly overridden by bg_iotier */
92 .qos_iotier[THREAD_QOS_MAINTENANCE] = THROTTLE_LEVEL_TIER3,
93
94 /*
95 * These tables define the tightest throughput and latency QoS tiers that
96 * a thread marked with this QoS class can have.
97 */
98
99 .qos_through_qos[THREAD_QOS_UNSPECIFIED] = QOS_EXTRACT(THROUGHPUT_QOS_TIER_UNSPECIFIED),
100 .qos_through_qos[THREAD_QOS_USER_INTERACTIVE] = QOS_EXTRACT(THROUGHPUT_QOS_TIER_0),
101 .qos_through_qos[THREAD_QOS_USER_INITIATED] = QOS_EXTRACT(THROUGHPUT_QOS_TIER_1),
102 .qos_through_qos[THREAD_QOS_LEGACY] = QOS_EXTRACT(THROUGHPUT_QOS_TIER_1),
103 .qos_through_qos[THREAD_QOS_UTILITY] = QOS_EXTRACT(THROUGHPUT_QOS_TIER_2),
104 .qos_through_qos[THREAD_QOS_BACKGROUND] = QOS_EXTRACT(THROUGHPUT_QOS_TIER_5),
105 .qos_through_qos[THREAD_QOS_MAINTENANCE] = QOS_EXTRACT(THROUGHPUT_QOS_TIER_5),
106
107 .qos_latency_qos[THREAD_QOS_UNSPECIFIED] = QOS_EXTRACT(LATENCY_QOS_TIER_UNSPECIFIED),
108 .qos_latency_qos[THREAD_QOS_USER_INTERACTIVE] = QOS_EXTRACT(LATENCY_QOS_TIER_0),
109 .qos_latency_qos[THREAD_QOS_USER_INITIATED] = QOS_EXTRACT(LATENCY_QOS_TIER_1),
110 .qos_latency_qos[THREAD_QOS_LEGACY] = QOS_EXTRACT(LATENCY_QOS_TIER_1),
111 .qos_latency_qos[THREAD_QOS_UTILITY] = QOS_EXTRACT(LATENCY_QOS_TIER_3),
112 .qos_latency_qos[THREAD_QOS_BACKGROUND] = QOS_EXTRACT(LATENCY_QOS_TIER_3),
113 .qos_latency_qos[THREAD_QOS_MAINTENANCE] = QOS_EXTRACT(LATENCY_QOS_TIER_3),
114 };
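/*
 * Illustrative reading of the tables above (a sketch, assuming the usual
 * constant values from osfmk/kern/sched.h): a THREAD_QOS_UTILITY thread
 *
 *   - starts from qos_pri[THREAD_QOS_UTILITY]      == BASEPRI_UTILITY (20),
 *   - can issue IO no better than qos_iotier[...]  == THROTTLE_LEVEL_TIER1,
 *   - is capped at qos_through_qos[...]            == THROUGHPUT_QOS_TIER_2, and
 *   - is capped at qos_latency_qos[...]            == LATENCY_QOS_TIER_3.
 *
 * thread_recompute_priority() and thread_policy_update_internal_spinlocked()
 * below consult these entries when deriving the thread's effective policy.
 */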
115
116 static void
117 thread_set_user_sched_mode_and_recompute_pri(thread_t thread, sched_mode_t mode);
118
119 static int
120 thread_qos_scaled_relative_priority(int qos, int qos_relprio);
121
122 static void
123 proc_get_thread_policy_bitfield(thread_t thread, thread_policy_state_t info);
124
125 static void
126 proc_set_thread_policy_locked(thread_t thread, int category, int flavor, int value, int value2, task_pend_token_t pend_token);
127
128 static void
129 proc_set_thread_policy_spinlocked(thread_t thread, int category, int flavor, int value, int value2, task_pend_token_t pend_token);
130
131 static void
132 thread_set_requested_policy_spinlocked(thread_t thread, int category, int flavor, int value, int value2);
133
134 static int
135 thread_get_requested_policy_spinlocked(thread_t thread, int category, int flavor, int* value2);
136
137 static int
138 proc_get_thread_policy_locked(thread_t thread, int category, int flavor, int* value2);
139
140 static void
141 thread_policy_update_spinlocked(thread_t thread, boolean_t recompute_priority, task_pend_token_t pend_token);
142
143 static void
144 thread_policy_update_internal_spinlocked(thread_t thread, boolean_t recompute_priority, task_pend_token_t pend_token);
145
146 void
147 thread_policy_init(void) {
148 if (PE_parse_boot_argn("qos_override_mode", &qos_override_mode, sizeof(qos_override_mode))) {
149 printf("QOS override mode: 0x%08x\n", qos_override_mode);
150 } else {
151 qos_override_mode = QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_SINGLE_MUTEX_OVERRIDE;
152 }
153 }
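/*
 * Usage note (illustrative): the override mode can be selected with the
 * "qos_override_mode" boot-arg parsed above, e.g. qos_override_mode=2 for
 * QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE. Without the boot-arg the kernel
 * defaults to QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_SINGLE_MUTEX_OVERRIDE.
 */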
154
155 boolean_t
156 thread_has_qos_policy(thread_t thread) {
157 return (proc_get_thread_policy(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS) != THREAD_QOS_UNSPECIFIED) ? TRUE : FALSE;
158 }
159
160
161 static void
162 thread_remove_qos_policy_locked(thread_t thread,
163 task_pend_token_t pend_token)
164 {
165
166 __unused int prev_qos = thread->requested_policy.thrp_qos;
167
168 DTRACE_PROC2(qos__remove, thread_t, thread, int, prev_qos);
169
170 proc_set_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_AND_RELPRIO,
171 THREAD_QOS_UNSPECIFIED, 0, pend_token);
172 }
173
174 kern_return_t
175 thread_remove_qos_policy(thread_t thread)
176 {
177 struct task_pend_token pend_token = {};
178
179 thread_mtx_lock(thread);
180 if (!thread->active) {
181 thread_mtx_unlock(thread);
182 return KERN_TERMINATED;
183 }
184
185 thread_remove_qos_policy_locked(thread, &pend_token);
186
187 thread_mtx_unlock(thread);
188
189 thread_policy_update_complete_unlocked(thread, &pend_token);
190
191 return KERN_SUCCESS;
192 }
193
194
195 boolean_t
196 thread_is_static_param(thread_t thread)
197 {
198 if (thread->static_param) {
199 DTRACE_PROC1(qos__legacy__denied, thread_t, thread);
200 return TRUE;
201 }
202 return FALSE;
203 }
204
205 /*
206 * Relative priorities can range between 0REL and -15REL. These
207 * map to QoS-specific ranges, to create non-overlapping priority
208 * ranges.
209 */
210 static int
211 thread_qos_scaled_relative_priority(int qos, int qos_relprio)
212 {
213 int next_lower_qos;
214
215 /* Fast path, since no validation or scaling is needed */
216 if (qos_relprio == 0) return 0;
217
218 switch (qos) {
219 case THREAD_QOS_USER_INTERACTIVE:
220 next_lower_qos = THREAD_QOS_USER_INITIATED;
221 break;
222 case THREAD_QOS_USER_INITIATED:
223 next_lower_qos = THREAD_QOS_LEGACY;
224 break;
225 case THREAD_QOS_LEGACY:
226 next_lower_qos = THREAD_QOS_UTILITY;
227 break;
228 case THREAD_QOS_UTILITY:
229 next_lower_qos = THREAD_QOS_BACKGROUND;
230 break;
231 case THREAD_QOS_MAINTENANCE:
232 case THREAD_QOS_BACKGROUND:
233 next_lower_qos = 0;
234 break;
235 default:
236 panic("Unrecognized QoS %d", qos);
237 return 0;
238 }
239
240 int prio_range_max = thread_qos_policy_params.qos_pri[qos];
241 int prio_range_min = next_lower_qos ? thread_qos_policy_params.qos_pri[next_lower_qos] : 0;
242
243 /*
244 * We now have the valid range that the scaled relative priority can map to. Note
245 * that the lower bound is exclusive, but the upper bound is inclusive. If the
246 * range is (21,31], 0REL should map to 31 and -15REL should map to 22. We use the
247 * fact that the max relative priority is -15 and use ">>4" to divide by 16 and discard
248 * remainder.
249 */
250 int scaled_relprio = -(((prio_range_max - prio_range_min) * (-qos_relprio)) >> 4);
251
252 return scaled_relprio;
253 }
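/*
 * Worked example of the scaling above (a sketch assuming the usual values
 * BASEPRI_DEFAULT == 31 and BASEPRI_UTILITY == 20): for THREAD_QOS_LEGACY the
 * valid range is (20, 31], so prio_range_max - prio_range_min == 11 and
 *
 *      0REL -> -((11 *  0) >> 4) ==   0   (base priority 31)
 *     -8REL -> -((11 *  8) >> 4) ==  -5   (base priority 26)
 *    -15REL -> -((11 * 15) >> 4) == -10   (base priority 21)
 *
 * so all sixteen relative-priority steps land inside the LEGACY band without
 * reaching the UTILITY base priority.
 */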
254
255 /*
256 * flag set by -qos-policy-allow boot-arg to allow
257 * testing thread qos policy from userspace
258 */
259 boolean_t allow_qos_policy_set = FALSE;
260
261 kern_return_t
262 thread_policy_set(
263 thread_t thread,
264 thread_policy_flavor_t flavor,
265 thread_policy_t policy_info,
266 mach_msg_type_number_t count)
267 {
268 thread_qos_policy_data_t req_qos;
269 kern_return_t kr;
270
271 req_qos.qos_tier = THREAD_QOS_UNSPECIFIED;
272
273 if (thread == THREAD_NULL)
274 return (KERN_INVALID_ARGUMENT);
275
276 if (allow_qos_policy_set == FALSE) {
277 if (thread_is_static_param(thread))
278 return (KERN_POLICY_STATIC);
279
280 if (flavor == THREAD_QOS_POLICY)
281 return (KERN_INVALID_ARGUMENT);
282 }
283
284 /* Threads without static_param set reset their QoS when other policies are applied. */
285 if (thread->requested_policy.thrp_qos != THREAD_QOS_UNSPECIFIED) {
286 /* Store the existing tier, if we fail this call it is used to reset back. */
287 req_qos.qos_tier = thread->requested_policy.thrp_qos;
288 req_qos.tier_importance = thread->requested_policy.thrp_qos_relprio;
289
290 kr = thread_remove_qos_policy(thread);
291 if (kr != KERN_SUCCESS) {
292 return kr;
293 }
294 }
295
296 kr = thread_policy_set_internal(thread, flavor, policy_info, count);
297
298 /* If we removed a QoS above and the new policy failed to apply, restore the original QoS. */
299 if (req_qos.qos_tier != THREAD_QOS_UNSPECIFIED) {
300 if (kr != KERN_SUCCESS) {
301 /* Reset back to our original tier as the set failed. */
302 (void)thread_policy_set_internal(thread, THREAD_QOS_POLICY, (thread_policy_t)&req_qos, THREAD_QOS_POLICY_COUNT);
303 }
304 }
305
306 return kr;
307 }
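/*
 * Sketch of how this MIG routine is typically reached from user space, via
 * the public thread_policy_set() wrapper (illustrative only; computation_abs
 * and constraint_abs stand in for caller-supplied values in
 * mach_absolute_time() units):
 *
 *   #include <mach/mach.h>
 *   #include <mach/thread_policy.h>
 *
 *   thread_time_constraint_policy_data_t policy = {
 *       .period      = 0,
 *       .computation = computation_abs,
 *       .constraint  = constraint_abs,   // must be >= computation
 *       .preemptible = TRUE,
 *   };
 *   kern_return_t kr = thread_policy_set(mach_thread_self(),
 *                                        THREAD_TIME_CONSTRAINT_POLICY,
 *                                        (thread_policy_t)&policy,
 *                                        THREAD_TIME_CONSTRAINT_POLICY_COUNT);
 *
 * Such a request lands in thread_policy_set_internal() below, which checks
 * computation against min_rt_quantum/max_rt_quantum before moving the thread
 * to TH_MODE_REALTIME.
 */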
308
309 kern_return_t
310 thread_policy_set_internal(
311 thread_t thread,
312 thread_policy_flavor_t flavor,
313 thread_policy_t policy_info,
314 mach_msg_type_number_t count)
315 {
316 kern_return_t result = KERN_SUCCESS;
317 struct task_pend_token pend_token = {};
318
319 thread_mtx_lock(thread);
320 if (!thread->active) {
321 thread_mtx_unlock(thread);
322
323 return (KERN_TERMINATED);
324 }
325
326 switch (flavor) {
327
328 case THREAD_EXTENDED_POLICY:
329 {
330 boolean_t timeshare = TRUE;
331
332 if (count >= THREAD_EXTENDED_POLICY_COUNT) {
333 thread_extended_policy_t info;
334
335 info = (thread_extended_policy_t)policy_info;
336 timeshare = info->timeshare;
337 }
338
339 sched_mode_t mode = (timeshare == TRUE) ? TH_MODE_TIMESHARE : TH_MODE_FIXED;
340
341 spl_t s = splsched();
342 thread_lock(thread);
343
344 thread_set_user_sched_mode_and_recompute_pri(thread, mode);
345
346 thread_unlock(thread);
347 splx(s);
348
349 pend_token.tpt_update_thread_sfi = 1;
350
351 break;
352 }
353
354 case THREAD_TIME_CONSTRAINT_POLICY:
355 {
356 thread_time_constraint_policy_t info;
357
358 if (count < THREAD_TIME_CONSTRAINT_POLICY_COUNT) {
359 result = KERN_INVALID_ARGUMENT;
360 break;
361 }
362
363 info = (thread_time_constraint_policy_t)policy_info;
364 if (info->constraint < info->computation ||
365 info->computation > max_rt_quantum ||
366 info->computation < min_rt_quantum ) {
367 result = KERN_INVALID_ARGUMENT;
368 break;
369 }
370
371 spl_t s = splsched();
372 thread_lock(thread);
373
374 thread->realtime.period = info->period;
375 thread->realtime.computation = info->computation;
376 thread->realtime.constraint = info->constraint;
377 thread->realtime.preemptible = info->preemptible;
378
379 thread_set_user_sched_mode_and_recompute_pri(thread, TH_MODE_REALTIME);
380
381 thread_unlock(thread);
382 splx(s);
383
384 pend_token.tpt_update_thread_sfi = 1;
385
386 break;
387 }
388
389 case THREAD_PRECEDENCE_POLICY:
390 {
391 thread_precedence_policy_t info;
392
393 if (count < THREAD_PRECEDENCE_POLICY_COUNT) {
394 result = KERN_INVALID_ARGUMENT;
395 break;
396 }
397 info = (thread_precedence_policy_t)policy_info;
398
399 spl_t s = splsched();
400 thread_lock(thread);
401
402 thread->importance = info->importance;
403
404 thread_recompute_priority(thread);
405
406 thread_unlock(thread);
407 splx(s);
408
409 break;
410 }
411
412 case THREAD_AFFINITY_POLICY:
413 {
414 thread_affinity_policy_t info;
415
416 if (!thread_affinity_is_supported()) {
417 result = KERN_NOT_SUPPORTED;
418 break;
419 }
420 if (count < THREAD_AFFINITY_POLICY_COUNT) {
421 result = KERN_INVALID_ARGUMENT;
422 break;
423 }
424
425 info = (thread_affinity_policy_t) policy_info;
426 /*
427 * Unlock the thread mutex here and
428 * return directly after calling thread_affinity_set().
429 * This is necessary for correct lock ordering because
430 * thread_affinity_set() takes the task lock.
431 */
432 thread_mtx_unlock(thread);
433 return thread_affinity_set(thread, info->affinity_tag);
434 }
435
436
437 case THREAD_THROUGHPUT_QOS_POLICY:
438 {
439 thread_throughput_qos_policy_t info = (thread_throughput_qos_policy_t) policy_info;
440 thread_throughput_qos_t tqos;
441
442 if (count < THREAD_THROUGHPUT_QOS_POLICY_COUNT) {
443 result = KERN_INVALID_ARGUMENT;
444 break;
445 }
446
447 if ((result = qos_throughput_policy_validate(info->thread_throughput_qos_tier)) != KERN_SUCCESS)
448 break;
449
450 tqos = qos_extract(info->thread_throughput_qos_tier);
451
452 proc_set_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE,
453 TASK_POLICY_THROUGH_QOS, tqos, 0, &pend_token);
454
455 break;
456 }
457
458 case THREAD_LATENCY_QOS_POLICY:
459 {
460 thread_latency_qos_policy_t info = (thread_latency_qos_policy_t) policy_info;
461 thread_latency_qos_t lqos;
462
463 if (count < THREAD_LATENCY_QOS_POLICY_COUNT) {
464 result = KERN_INVALID_ARGUMENT;
465 break;
466 }
467
468 if ((result = qos_latency_policy_validate(info->thread_latency_qos_tier)) != KERN_SUCCESS)
469 break;
470
471 lqos = qos_extract(info->thread_latency_qos_tier);
472
473 proc_set_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE,
474 TASK_POLICY_LATENCY_QOS, lqos, 0, &pend_token);
475
476 break;
477 }
478
479 case THREAD_QOS_POLICY:
480 {
481 thread_qos_policy_t info = (thread_qos_policy_t)policy_info;
482
483 if (count < THREAD_QOS_POLICY_COUNT) {
484 result = KERN_INVALID_ARGUMENT;
485 break;
486 }
487
488 if (info->qos_tier < 0 || info->qos_tier >= THREAD_QOS_LAST) {
489 result = KERN_INVALID_ARGUMENT;
490 break;
491 }
492
493 if (info->tier_importance > 0 || info->tier_importance < THREAD_QOS_MIN_TIER_IMPORTANCE) {
494 result = KERN_INVALID_ARGUMENT;
495 break;
496 }
497
498 if (info->qos_tier == THREAD_QOS_UNSPECIFIED && info->tier_importance != 0) {
499 result = KERN_INVALID_ARGUMENT;
500 break;
501 }
502
503 proc_set_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_AND_RELPRIO,
504 info->qos_tier, -info->tier_importance, &pend_token);
505
506 break;
507 }
508
509 default:
510 result = KERN_INVALID_ARGUMENT;
511 break;
512 }
513
514 thread_mtx_unlock(thread);
515
516 thread_policy_update_complete_unlocked(thread, &pend_token);
517
518 return (result);
519 }
520
521 /*
522 * Note that there is no implemented difference between POLICY_RR and POLICY_FIFO.
523 * Both result in FIXED mode scheduling.
524 */
525 static sched_mode_t
526 convert_policy_to_sched_mode(integer_t policy) {
527 switch (policy) {
528 case POLICY_TIMESHARE:
529 return TH_MODE_TIMESHARE;
530 case POLICY_RR:
531 case POLICY_FIFO:
532 return TH_MODE_FIXED;
533 default:
534 panic("unexpected sched policy: %d", policy);
535 return TH_MODE_NONE;
536 }
537 }
538
539 /*
540 * Called either with the thread mutex locked
541 * or from the pthread kext in a 'safe place'.
542 */
543 static kern_return_t
544 thread_set_mode_and_absolute_pri_internal(thread_t thread,
545 sched_mode_t mode,
546 integer_t priority,
547 task_pend_token_t pend_token)
548 {
549 kern_return_t kr = KERN_SUCCESS;
550
551 spl_t s = splsched();
552 thread_lock(thread);
553
554 /* This path isn't allowed to change a thread out of realtime. */
555 if ((thread->sched_mode == TH_MODE_REALTIME) ||
556 (thread->saved_mode == TH_MODE_REALTIME)) {
557 kr = KERN_FAILURE;
558 goto unlock;
559 }
560
561 if (thread->policy_reset) {
562 kr = KERN_SUCCESS;
563 goto unlock;
564 }
565
566 sched_mode_t old_mode = thread->sched_mode;
567
568 /*
569 * Reverse engineer and apply the correct importance value
570 * from the requested absolute priority value.
571 *
572 * TODO: Store the absolute priority value instead
573 */
574
575 if (priority >= thread->max_priority)
576 priority = thread->max_priority - thread->task_priority;
577 else if (priority >= MINPRI_KERNEL)
578 priority -= MINPRI_KERNEL;
579 else if (priority >= MINPRI_RESERVED)
580 priority -= MINPRI_RESERVED;
581 else
582 priority -= BASEPRI_DEFAULT;
583
584 priority += thread->task_priority;
585
586 if (priority > thread->max_priority)
587 priority = thread->max_priority;
588 else if (priority < MINPRI)
589 priority = MINPRI;
590
591 thread->importance = priority - thread->task_priority;
592
593 thread_set_user_sched_mode_and_recompute_pri(thread, mode);
594
595 if (mode != old_mode)
596 pend_token->tpt_update_thread_sfi = 1;
597
598 unlock:
599 thread_unlock(thread);
600 splx(s);
601
602 return kr;
603 }
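/*
 * Worked example of the reverse-engineering above (a sketch assuming
 * BASEPRI_DEFAULT == 31 and an ordinary user task with task_priority == 31,
 * max_priority == 63):
 *
 *   requested priority 40: 40 - BASEPRI_DEFAULT == 9, base priority becomes
 *                          9 + task_priority == 40, importance == 9
 *   requested priority 70: 70 >= max_priority, so importance becomes
 *                          max_priority - task_priority == 32 and the base
 *                          priority clamps to max_priority (63)
 */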
604
605 /*
606 * KPI for pthread kext
607 *
608 * Set scheduling policy & absolute priority for thread
609 * May be called from waitqueue callout context with spinlocks held
610 * Thread mutex lock is not held
611 */
612 kern_return_t
613 thread_set_workq_pri(thread_t thread,
614 integer_t priority,
615 integer_t policy)
616 {
617 struct task_pend_token pend_token = {};
618 sched_mode_t mode = convert_policy_to_sched_mode(policy);
619
620 assert(thread->static_param);
621 if (!thread->static_param)
622 return KERN_FAILURE;
623
624 /* Concern: this doesn't hold the mutex... */
625 if (!thread->active)
626 return KERN_TERMINATED;
627
628 kern_return_t kr = thread_set_mode_and_absolute_pri_internal(thread, mode, priority, &pend_token);
629
630 if (pend_token.tpt_update_thread_sfi)
631 sfi_reevaluate(thread);
632
633 return kr;
634 }
635
636 /*
637 * thread_set_mode_and_absolute_pri:
638 *
639 * Set scheduling policy & absolute priority for thread, for deprecated
640 * thread_set_policy and thread_policy interfaces.
641 *
642 * Called with nothing locked.
643 */
644 kern_return_t
645 thread_set_mode_and_absolute_pri(thread_t thread,
646 integer_t policy,
647 integer_t priority)
648 {
649 kern_return_t kr = KERN_SUCCESS;
650 struct task_pend_token pend_token = {};
651
652 sched_mode_t mode = convert_policy_to_sched_mode(policy);
653
654 thread_mtx_lock(thread);
655
656 if (!thread->active) {
657 kr = KERN_TERMINATED;
658 goto unlock;
659 }
660
661 if (thread_is_static_param(thread)) {
662 kr = KERN_POLICY_STATIC;
663 goto unlock;
664 }
665
666 /* Setting legacy policies on threads kills the current QoS */
667 if (thread->requested_policy.thrp_qos != THREAD_QOS_UNSPECIFIED)
668 thread_remove_qos_policy_locked(thread, &pend_token);
669
670 kr = thread_set_mode_and_absolute_pri_internal(thread, mode, priority, &pend_token);
671
672 unlock:
673 thread_mtx_unlock(thread);
674
675 thread_policy_update_complete_unlocked(thread, &pend_token);
676
677 return (kr);
678 }
679
680 /*
681 * Set the thread's requested mode and recompute priority
682 * Called with thread mutex and thread locked
683 *
684 * TODO: Mitigate potential problems caused by moving thread to end of runq
685 * whenever its priority is recomputed
686 * Only remove when it actually changes? Attempt to re-insert at appropriate location?
687 */
688 static void
689 thread_set_user_sched_mode_and_recompute_pri(thread_t thread, sched_mode_t mode)
690 {
691 if (thread->policy_reset)
692 return;
693
694 boolean_t removed = thread_run_queue_remove(thread);
695
696 /*
697 * TODO: Instead of having saved mode, have 'user mode' and 'true mode'.
698 * That way there's zero confusion over which the user wants
699 * and which the kernel wants.
700 */
701 if (thread->sched_flags & TH_SFLAG_DEMOTED_MASK)
702 thread->saved_mode = mode;
703 else
704 sched_set_thread_mode(thread, mode);
705
706 thread_recompute_priority(thread);
707
708 if (removed)
709 thread_run_queue_reinsert(thread, SCHED_TAILQ);
710 }
711
712 /* called at splsched with thread lock locked */
713 static void
714 thread_update_qos_cpu_time_locked(thread_t thread)
715 {
716 task_t task = thread->task;
717 uint64_t timer_sum, timer_delta;
718
719 /*
720 * This is only as accurate as the distance between
721 * last context switch (embedded) or last user/kernel boundary transition (desktop)
722 * because user_timer and system_timer are only updated then.
723 *
724 * TODO: Consider running a thread_timer_event operation here to update it first.
725 * Maybe doable with interrupts disabled from current thread.
726 * If the thread is on a different core, may not be easy to get right.
727 *
728 * TODO: There should be a function for this in timer.c
729 */
730
731 timer_sum = timer_grab(&thread->user_timer);
732 timer_sum += timer_grab(&thread->system_timer);
733 timer_delta = timer_sum - thread->vtimer_qos_save;
734
735 thread->vtimer_qos_save = timer_sum;
736
737 uint64_t* task_counter = NULL;
738
739 /* Update the task-level qos stats atomically, because we don't have the task lock. */
740 switch (thread->effective_policy.thep_qos) {
741 case THREAD_QOS_DEFAULT: task_counter = &task->cpu_time_qos_stats.cpu_time_qos_default; break;
742 case THREAD_QOS_MAINTENANCE: task_counter = &task->cpu_time_qos_stats.cpu_time_qos_maintenance; break;
743 case THREAD_QOS_BACKGROUND: task_counter = &task->cpu_time_qos_stats.cpu_time_qos_background; break;
744 case THREAD_QOS_UTILITY: task_counter = &task->cpu_time_qos_stats.cpu_time_qos_utility; break;
745 case THREAD_QOS_LEGACY: task_counter = &task->cpu_time_qos_stats.cpu_time_qos_legacy; break;
746 case THREAD_QOS_USER_INITIATED: task_counter = &task->cpu_time_qos_stats.cpu_time_qos_user_initiated; break;
747 case THREAD_QOS_USER_INTERACTIVE: task_counter = &task->cpu_time_qos_stats.cpu_time_qos_user_interactive; break;
748 default:
749 panic("unknown effective QoS: %d", thread->effective_policy.thep_qos);
750 }
751
752 OSAddAtomic64(timer_delta, task_counter);
753 }
754
755 /*
756 * called with no thread locks held
757 * may hold task lock
758 */
759 void
760 thread_update_qos_cpu_time(thread_t thread)
761 {
762 thread_mtx_lock(thread);
763
764 spl_t s = splsched();
765 thread_lock(thread);
766
767 thread_update_qos_cpu_time_locked(thread);
768
769 thread_unlock(thread);
770 splx(s);
771
772 thread_mtx_unlock(thread);
773 }
774
775 /*
776 * Calculate base priority from thread attributes, and set it on the thread
777 *
778 * Called with thread_lock and thread mutex held.
779 */
780 void
781 thread_recompute_priority(
782 thread_t thread)
783 {
784 integer_t priority;
785
786 if (thread->policy_reset)
787 return;
788
789 if (thread->sched_mode == TH_MODE_REALTIME) {
790 sched_set_thread_base_priority(thread, BASEPRI_RTQUEUES);
791 return;
792 } else if (thread->effective_policy.thep_qos != THREAD_QOS_UNSPECIFIED) {
793 int qos = thread->effective_policy.thep_qos;
794 int qos_ui_is_urgent = thread->effective_policy.thep_qos_ui_is_urgent;
795 int qos_relprio = -(thread->effective_policy.thep_qos_relprio); /* stored in task policy inverted */
796 int qos_scaled_relprio;
797
798 assert(qos >= 0 && qos < THREAD_QOS_LAST);
799 assert(qos_relprio <= 0 && qos_relprio >= THREAD_QOS_MIN_TIER_IMPORTANCE);
800
801 priority = thread_qos_policy_params.qos_pri[qos];
802 qos_scaled_relprio = thread_qos_scaled_relative_priority(qos, qos_relprio);
803
804 if (qos == THREAD_QOS_USER_INTERACTIVE && qos_ui_is_urgent == 1) {
805 /* Bump priority 46 to 47 when in a frontmost app */
806 qos_scaled_relprio += 1;
807 }
808
809 /* TODO: factor in renice priority here? */
810
811 priority += qos_scaled_relprio;
812 } else {
813 if (thread->importance > MAXPRI)
814 priority = MAXPRI;
815 else if (thread->importance < -MAXPRI)
816 priority = -MAXPRI;
817 else
818 priority = thread->importance;
819
820 priority += thread->task_priority;
821 }
822
823 priority = MAX(priority, thread->user_promotion_basepri);
824
825 /*
826 * Clamp priority back into the allowed range for this task.
827 * The initial priority value could be out of this range due to:
828 * Task clamped to BG or Utility (max-pri is 4, or 20)
829 * Task is user task (max-pri is 63)
830 * Task is kernel task (max-pri is 95)
831 * Note that thread->importance is user-settable to any integer
832 * via THREAD_PRECEDENCE_POLICY.
833 */
834 if (priority > thread->max_priority)
835 priority = thread->max_priority;
836 else if (priority < MINPRI)
837 priority = MINPRI;
838
839 if (thread->saved_mode == TH_MODE_REALTIME &&
840 thread->sched_flags & TH_SFLAG_FAILSAFE)
841 priority = DEPRESSPRI;
842
843 if (thread->effective_policy.thep_terminated == TRUE) {
844 /*
845 * We temporarily want to override the expected priority to
846 * ensure that the thread exits in a timely manner.
847 * Note that this is allowed to exceed thread->max_priority
848 * so that the thread is no longer clamped to background
849 * during the final exit phase.
850 */
851 if (priority < thread->task_priority)
852 priority = thread->task_priority;
853 if (priority < BASEPRI_DEFAULT)
854 priority = BASEPRI_DEFAULT;
855 }
856
857
858 sched_set_thread_base_priority(thread, priority);
859 }
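/*
 * Example of the QoS branch above (a sketch, usual constants assumed): a
 * THREAD_QOS_USER_INTERACTIVE thread with relprio 0 in a frontmost app gets
 * qos_pri[THREAD_QOS_USER_INTERACTIVE] (46) plus the thep_qos_ui_is_urgent
 * bump, i.e. base priority 47, before the user_promotion_basepri floor and
 * the max_priority/MINPRI clamp are applied.
 */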
860
861 /* Called with the task lock held, but not the thread mutex or spinlock */
862 void
863 thread_policy_update_tasklocked(
864 thread_t thread,
865 integer_t priority,
866 integer_t max_priority,
867 task_pend_token_t pend_token)
868 {
869 thread_mtx_lock(thread);
870
871 if (!thread->active || thread->policy_reset) {
872 thread_mtx_unlock(thread);
873 return;
874 }
875
876 spl_t s = splsched();
877 thread_lock(thread);
878
879 __unused
880 integer_t old_max_priority = thread->max_priority;
881
882 thread->task_priority = priority;
883 thread->max_priority = max_priority;
884
885
886 thread_policy_update_spinlocked(thread, TRUE, pend_token);
887
888 thread_unlock(thread);
889 splx(s);
890
891 thread_mtx_unlock(thread);
892 }
893
894 /*
895 * Reset thread to default state in preparation for termination
896 * Called with thread mutex locked
897 *
898 * Always called on current thread, so we don't need a run queue remove
899 */
900 void
901 thread_policy_reset(
902 thread_t thread)
903 {
904 spl_t s;
905
906 assert(thread == current_thread());
907
908 s = splsched();
909 thread_lock(thread);
910
911 if (thread->sched_flags & TH_SFLAG_FAILSAFE)
912 sched_thread_mode_undemote(thread, TH_SFLAG_FAILSAFE);
913
914 if (thread->sched_flags & TH_SFLAG_THROTTLED)
915 sched_thread_mode_undemote(thread, TH_SFLAG_THROTTLED);
916
917 /* At this point, the various demotions should be inactive */
918 assert(!(thread->sched_flags & TH_SFLAG_DEMOTED_MASK));
919 assert(!(thread->sched_flags & TH_SFLAG_THROTTLED));
920 assert(!(thread->sched_flags & TH_SFLAG_DEPRESSED_MASK));
921
922 /* Reset thread back to task-default basepri and mode */
923 sched_mode_t newmode = SCHED(initial_thread_sched_mode)(thread->task);
924
925 sched_set_thread_mode(thread, newmode);
926
927 thread->importance = 0;
928
929 /* Prevent further changes to thread base priority or mode */
930 thread->policy_reset = 1;
931
932 sched_set_thread_base_priority(thread, thread->task_priority);
933
934 thread_unlock(thread);
935 splx(s);
936 }
937
938 kern_return_t
939 thread_policy_get(
940 thread_t thread,
941 thread_policy_flavor_t flavor,
942 thread_policy_t policy_info,
943 mach_msg_type_number_t *count,
944 boolean_t *get_default)
945 {
946 kern_return_t result = KERN_SUCCESS;
947
948 if (thread == THREAD_NULL)
949 return (KERN_INVALID_ARGUMENT);
950
951 thread_mtx_lock(thread);
952 if (!thread->active) {
953 thread_mtx_unlock(thread);
954
955 return (KERN_TERMINATED);
956 }
957
958 switch (flavor) {
959
960 case THREAD_EXTENDED_POLICY:
961 {
962 boolean_t timeshare = TRUE;
963
964 if (!(*get_default)) {
965 spl_t s = splsched();
966 thread_lock(thread);
967
968 if ( (thread->sched_mode != TH_MODE_REALTIME) &&
969 (thread->saved_mode != TH_MODE_REALTIME) ) {
970 if (!(thread->sched_flags & TH_SFLAG_DEMOTED_MASK))
971 timeshare = (thread->sched_mode == TH_MODE_TIMESHARE) != 0;
972 else
973 timeshare = (thread->saved_mode == TH_MODE_TIMESHARE) != 0;
974 }
975 else
976 *get_default = TRUE;
977
978 thread_unlock(thread);
979 splx(s);
980 }
981
982 if (*count >= THREAD_EXTENDED_POLICY_COUNT) {
983 thread_extended_policy_t info;
984
985 info = (thread_extended_policy_t)policy_info;
986 info->timeshare = timeshare;
987 }
988
989 break;
990 }
991
992 case THREAD_TIME_CONSTRAINT_POLICY:
993 {
994 thread_time_constraint_policy_t info;
995
996 if (*count < THREAD_TIME_CONSTRAINT_POLICY_COUNT) {
997 result = KERN_INVALID_ARGUMENT;
998 break;
999 }
1000
1001 info = (thread_time_constraint_policy_t)policy_info;
1002
1003 if (!(*get_default)) {
1004 spl_t s = splsched();
1005 thread_lock(thread);
1006
1007 if ( (thread->sched_mode == TH_MODE_REALTIME) ||
1008 (thread->saved_mode == TH_MODE_REALTIME) ) {
1009 info->period = thread->realtime.period;
1010 info->computation = thread->realtime.computation;
1011 info->constraint = thread->realtime.constraint;
1012 info->preemptible = thread->realtime.preemptible;
1013 }
1014 else
1015 *get_default = TRUE;
1016
1017 thread_unlock(thread);
1018 splx(s);
1019 }
1020
1021 if (*get_default) {
1022 info->period = 0;
1023 info->computation = default_timeshare_computation;
1024 info->constraint = default_timeshare_constraint;
1025 info->preemptible = TRUE;
1026 }
1027
1028 break;
1029 }
1030
1031 case THREAD_PRECEDENCE_POLICY:
1032 {
1033 thread_precedence_policy_t info;
1034
1035 if (*count < THREAD_PRECEDENCE_POLICY_COUNT) {
1036 result = KERN_INVALID_ARGUMENT;
1037 break;
1038 }
1039
1040 info = (thread_precedence_policy_t)policy_info;
1041
1042 if (!(*get_default)) {
1043 spl_t s = splsched();
1044 thread_lock(thread);
1045
1046 info->importance = thread->importance;
1047
1048 thread_unlock(thread);
1049 splx(s);
1050 }
1051 else
1052 info->importance = 0;
1053
1054 break;
1055 }
1056
1057 case THREAD_AFFINITY_POLICY:
1058 {
1059 thread_affinity_policy_t info;
1060
1061 if (!thread_affinity_is_supported()) {
1062 result = KERN_NOT_SUPPORTED;
1063 break;
1064 }
1065 if (*count < THREAD_AFFINITY_POLICY_COUNT) {
1066 result = KERN_INVALID_ARGUMENT;
1067 break;
1068 }
1069
1070 info = (thread_affinity_policy_t)policy_info;
1071
1072 if (!(*get_default))
1073 info->affinity_tag = thread_affinity_get(thread);
1074 else
1075 info->affinity_tag = THREAD_AFFINITY_TAG_NULL;
1076
1077 break;
1078 }
1079
1080 case THREAD_POLICY_STATE:
1081 {
1082 thread_policy_state_t info;
1083
1084 if (*count < THREAD_POLICY_STATE_COUNT) {
1085 result = KERN_INVALID_ARGUMENT;
1086 break;
1087 }
1088
1089 /* Only root can get this info */
1090 if (current_task()->sec_token.val[0] != 0) {
1091 result = KERN_PROTECTION_FAILURE;
1092 break;
1093 }
1094
1095 info = (thread_policy_state_t)(void*)policy_info;
1096
1097 if (!(*get_default)) {
1098 info->flags = 0;
1099
1100 spl_t s = splsched();
1101 thread_lock(thread);
1102
1103 info->flags |= (thread->static_param ? THREAD_POLICY_STATE_FLAG_STATIC_PARAM : 0);
1104
1105 info->thps_requested_policy = *(uint64_t*)(void*)(&thread->requested_policy);
1106 info->thps_effective_policy = *(uint64_t*)(void*)(&thread->effective_policy);
1107
1108 info->thps_user_promotions = thread->user_promotions;
1109 info->thps_user_promotion_basepri = thread->user_promotion_basepri;
1110 info->thps_ipc_overrides = thread->ipc_overrides;
1111
1112 proc_get_thread_policy_bitfield(thread, info);
1113
1114 thread_unlock(thread);
1115 splx(s);
1116 } else {
1117 info->requested = 0;
1118 info->effective = 0;
1119 info->pending = 0;
1120 }
1121
1122 break;
1123 }
1124
1125 case THREAD_LATENCY_QOS_POLICY:
1126 {
1127 thread_latency_qos_policy_t info = (thread_latency_qos_policy_t) policy_info;
1128 thread_latency_qos_t plqos;
1129
1130 if (*count < THREAD_LATENCY_QOS_POLICY_COUNT) {
1131 result = KERN_INVALID_ARGUMENT;
1132 break;
1133 }
1134
1135 if (*get_default) {
1136 plqos = 0;
1137 } else {
1138 plqos = proc_get_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_LATENCY_QOS, NULL);
1139 }
1140
1141 info->thread_latency_qos_tier = qos_latency_policy_package(plqos);
1142 }
1143 break;
1144
1145 case THREAD_THROUGHPUT_QOS_POLICY:
1146 {
1147 thread_throughput_qos_policy_t info = (thread_throughput_qos_policy_t) policy_info;
1148 thread_throughput_qos_t ptqos;
1149
1150 if (*count < THREAD_THROUGHPUT_QOS_POLICY_COUNT) {
1151 result = KERN_INVALID_ARGUMENT;
1152 break;
1153 }
1154
1155 if (*get_default) {
1156 ptqos = 0;
1157 } else {
1158 ptqos = proc_get_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_THROUGH_QOS, NULL);
1159 }
1160
1161 info->thread_throughput_qos_tier = qos_throughput_policy_package(ptqos);
1162 }
1163 break;
1164
1165 case THREAD_QOS_POLICY:
1166 {
1167 thread_qos_policy_t info = (thread_qos_policy_t)policy_info;
1168
1169 if (*count < THREAD_QOS_POLICY_COUNT) {
1170 result = KERN_INVALID_ARGUMENT;
1171 break;
1172 }
1173
1174 if (!(*get_default)) {
1175 int relprio_value = 0;
1176 info->qos_tier = proc_get_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE,
1177 TASK_POLICY_QOS_AND_RELPRIO, &relprio_value);
1178
1179 info->tier_importance = -relprio_value;
1180 } else {
1181 info->qos_tier = THREAD_QOS_UNSPECIFIED;
1182 info->tier_importance = 0;
1183 }
1184
1185 break;
1186 }
1187
1188 default:
1189 result = KERN_INVALID_ARGUMENT;
1190 break;
1191 }
1192
1193 thread_mtx_unlock(thread);
1194
1195 return (result);
1196 }
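/*
 * Illustrative user-space counterpart (the public thread_policy_get()
 * wrapper; names as declared in <mach/thread_policy.h>):
 *
 *   thread_extended_policy_data_t info;
 *   mach_msg_type_number_t count = THREAD_EXTENDED_POLICY_COUNT;
 *   boolean_t get_default = FALSE;
 *   kern_return_t kr = thread_policy_get(mach_thread_self(),
 *                                        THREAD_EXTENDED_POLICY,
 *                                        (thread_policy_t)&info,
 *                                        &count, &get_default);
 *
 * On return, get_default reports whether the thread was still in its default
 * state; for example, the THREAD_EXTENDED_POLICY case above forces it to
 * TRUE for realtime threads.
 */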
1197
1198 static volatile uint64_t unique_work_interval_id = 1; /* Start at 1, 0 is not a valid work interval ID */
1199
1200 kern_return_t
1201 thread_policy_create_work_interval(
1202 thread_t thread,
1203 uint64_t *work_interval_id)
1204 {
1205 thread_mtx_lock(thread);
1206 if (thread->work_interval_id) {
1207 /* already assigned a work interval ID */
1208 thread_mtx_unlock(thread);
1209 return (KERN_INVALID_VALUE);
1210 }
1211
1212 thread->work_interval_id = OSIncrementAtomic64((volatile int64_t *)&unique_work_interval_id);
1213 *work_interval_id = thread->work_interval_id;
1214
1215 thread_mtx_unlock(thread);
1216 return KERN_SUCCESS;
1217 }
1218
1219 kern_return_t
1220 thread_policy_destroy_work_interval(
1221 thread_t thread,
1222 uint64_t work_interval_id)
1223 {
1224 thread_mtx_lock(thread);
1225 if (work_interval_id == 0 || thread->work_interval_id == 0 || thread->work_interval_id != work_interval_id) {
1226 /* work ID isn't valid or doesn't match previously assigned work interval ID */
1227 thread_mtx_unlock(thread);
1228 return (KERN_INVALID_ARGUMENT);
1229 }
1230
1231 thread->work_interval_id = 0;
1232
1233 thread_mtx_unlock(thread);
1234 return KERN_SUCCESS;
1235 }
1236
1237 void
1238 thread_policy_create(thread_t thread)
1239 {
1240 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1241 (IMPORTANCE_CODE(IMP_UPDATE, (IMP_UPDATE_TASK_CREATE | TASK_POLICY_THREAD))) | DBG_FUNC_START,
1242 thread_tid(thread), theffective_0(thread),
1243 theffective_1(thread), thread->base_pri, 0);
1244
1245 /* We pass a pend token but ignore it */
1246 struct task_pend_token pend_token = {};
1247
1248 thread_policy_update_internal_spinlocked(thread, TRUE, &pend_token);
1249
1250 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1251 (IMPORTANCE_CODE(IMP_UPDATE, (IMP_UPDATE_TASK_CREATE | TASK_POLICY_THREAD))) | DBG_FUNC_END,
1252 thread_tid(thread), theffective_0(thread),
1253 theffective_1(thread), thread->base_pri, 0);
1254 }
1255
1256 static void
1257 thread_policy_update_spinlocked(thread_t thread, boolean_t recompute_priority, task_pend_token_t pend_token)
1258 {
1259 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1260 (IMPORTANCE_CODE(IMP_UPDATE, TASK_POLICY_THREAD) | DBG_FUNC_START),
1261 thread_tid(thread), theffective_0(thread),
1262 theffective_1(thread), thread->base_pri, 0);
1263
1264 thread_policy_update_internal_spinlocked(thread, recompute_priority, pend_token);
1265
1266 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1267 (IMPORTANCE_CODE(IMP_UPDATE, TASK_POLICY_THREAD)) | DBG_FUNC_END,
1268 thread_tid(thread), theffective_0(thread),
1269 theffective_1(thread), thread->base_pri, 0);
1270 }
1271
1272
1273
1274 /*
1275 * One thread state update function TO RULE THEM ALL
1276 *
1277 * This function updates the thread effective policy fields
1278 * and pushes the results to the relevant subsystems.
1279 *
1280 * Actions that cannot be run under the thread lock are flagged in the pend_token for the caller.
1281 *
1282 * Called with thread spinlock locked, task may be locked, thread mutex may be locked
1283 */
1284 static void
1285 thread_policy_update_internal_spinlocked(thread_t thread, boolean_t recompute_priority,
1286 task_pend_token_t pend_token)
1287 {
1288 /*
1289 * Step 1:
1290 * Gather requested policy and effective task state
1291 */
1292
1293 struct thread_requested_policy requested = thread->requested_policy;
1294 struct task_effective_policy task_effective = thread->task->effective_policy;
1295
1296 /*
1297 * Step 2:
1298 * Calculate new effective policies from requested policy, task and thread state
1299 * Rules:
1300 * Don't change requested, it won't take effect
1301 */
1302
1303 struct thread_effective_policy next = {};
1304
1305 next.thep_qos_ui_is_urgent = task_effective.tep_qos_ui_is_urgent;
1306
1307 uint32_t next_qos = requested.thrp_qos;
1308
1309 if (requested.thrp_qos != THREAD_QOS_UNSPECIFIED) {
1310 if (requested.thrp_qos_override != THREAD_QOS_UNSPECIFIED)
1311 next_qos = MAX(requested.thrp_qos_override, next_qos);
1312
1313 if (requested.thrp_qos_promote != THREAD_QOS_UNSPECIFIED)
1314 next_qos = MAX(requested.thrp_qos_promote, next_qos);
1315
1316 if (requested.thrp_qos_ipc_override != THREAD_QOS_UNSPECIFIED)
1317 next_qos = MAX(requested.thrp_qos_ipc_override, next_qos);
1318 }
1319
1320 next.thep_qos = next_qos;
1321
1322 /* A task clamp will result in an effective QoS even when requested is UNSPECIFIED */
1323 if (task_effective.tep_qos_clamp != THREAD_QOS_UNSPECIFIED) {
1324 if (next.thep_qos != THREAD_QOS_UNSPECIFIED)
1325 next.thep_qos = MIN(task_effective.tep_qos_clamp, next.thep_qos);
1326 else
1327 next.thep_qos = task_effective.tep_qos_clamp;
1328 }
1329
1330 /*
1331 * Extract outbound-promotion QoS before applying task ceiling or BG clamp
1332 * This allows QoS promotions to work properly even after the process is unclamped.
1333 */
1334 next.thep_qos_promote = next.thep_qos;
1335
1336 /* The ceiling only applies to threads that are in the QoS world */
1337 if (task_effective.tep_qos_ceiling != THREAD_QOS_UNSPECIFIED &&
1338 next.thep_qos != THREAD_QOS_UNSPECIFIED) {
1339 next.thep_qos = MIN(task_effective.tep_qos_ceiling, next.thep_qos);
1340 }
1341
1342 /*
1343 * The QoS relative priority is only applicable when the original programmer's
1344 * intended (requested) QoS is in effect. When the QoS is clamped (e.g.
1345 * USER_INITIATED-13REL clamped to UTILITY), the relative priority is not honored,
1346 * since otherwise it would be lower than unclamped threads. Similarly, in the
1347 * presence of boosting, the programmer doesn't know what other actors
1348 * are boosting the thread.
1349 */
1350 if ((requested.thrp_qos != THREAD_QOS_UNSPECIFIED) &&
1351 (requested.thrp_qos == next.thep_qos) &&
1352 (requested.thrp_qos_override == THREAD_QOS_UNSPECIFIED)) {
1353 next.thep_qos_relprio = requested.thrp_qos_relprio;
1354 } else {
1355 next.thep_qos_relprio = 0;
1356 }
1357
1358 /* Calculate DARWIN_BG */
1359 boolean_t wants_darwinbg = FALSE;
1360 boolean_t wants_all_sockets_bg = FALSE; /* Do I want my existing sockets to be bg */
1361
1362 /*
1363 * If DARWIN_BG has been requested at either level, it's engaged.
1364 * darwinbg threads always create bg sockets,
1365 * but only some types of darwinbg change the sockets
1366 * after they're created
1367 */
1368 if (requested.thrp_int_darwinbg || requested.thrp_ext_darwinbg)
1369 wants_all_sockets_bg = wants_darwinbg = TRUE;
1370
1371 if (requested.thrp_pidbind_bg)
1372 wants_all_sockets_bg = wants_darwinbg = TRUE;
1373
1374 if (task_effective.tep_darwinbg)
1375 wants_darwinbg = TRUE;
1376
1377 if (next.thep_qos == THREAD_QOS_BACKGROUND ||
1378 next.thep_qos == THREAD_QOS_MAINTENANCE)
1379 wants_darwinbg = TRUE;
1380
1381 /* Calculate side effects of DARWIN_BG */
1382
1383 if (wants_darwinbg)
1384 next.thep_darwinbg = 1;
1385
1386 if (next.thep_darwinbg || task_effective.tep_new_sockets_bg)
1387 next.thep_new_sockets_bg = 1;
1388
1389 /* Don't use task_effective.tep_all_sockets_bg here */
1390 if (wants_all_sockets_bg)
1391 next.thep_all_sockets_bg = 1;
1392
1393 /* darwinbg implies background QOS (or lower) */
1394 if (next.thep_darwinbg &&
1395 (next.thep_qos > THREAD_QOS_BACKGROUND || next.thep_qos == THREAD_QOS_UNSPECIFIED)) {
1396 next.thep_qos = THREAD_QOS_BACKGROUND;
1397 next.thep_qos_relprio = 0;
1398 }
1399
1400 /* Calculate IO policy */
1401
1402 int iopol = THROTTLE_LEVEL_TIER0;
1403
1404 /* Factor in the task's IO policy */
1405 if (next.thep_darwinbg)
1406 iopol = MAX(iopol, task_effective.tep_bg_iotier);
1407
1408 iopol = MAX(iopol, task_effective.tep_io_tier);
1409
1410 /* Look up the associated IO tier value for the QoS class */
1411 iopol = MAX(iopol, thread_qos_policy_params.qos_iotier[next.thep_qos]);
1412
1413 iopol = MAX(iopol, requested.thrp_int_iotier);
1414 iopol = MAX(iopol, requested.thrp_ext_iotier);
1415
1416 next.thep_io_tier = iopol;
1417
1418 /*
1419 * If a QoS override is causing IO to go into a lower tier, we also set
1420 * the passive bit so that a thread doesn't end up stuck in its own throttle
1421 * window when the override goes away.
1422 */
1423 boolean_t qos_io_override_active = FALSE;
1424 if (thread_qos_policy_params.qos_iotier[next.thep_qos] <
1425 thread_qos_policy_params.qos_iotier[requested.thrp_qos])
1426 qos_io_override_active = TRUE;
1427
1428 /* Calculate Passive IO policy */
1429 if (requested.thrp_ext_iopassive ||
1430 requested.thrp_int_iopassive ||
1431 qos_io_override_active ||
1432 task_effective.tep_io_passive )
1433 next.thep_io_passive = 1;
1434
1435 /* Calculate timer QOS */
1436 uint32_t latency_qos = requested.thrp_latency_qos;
1437
1438 latency_qos = MAX(latency_qos, task_effective.tep_latency_qos);
1439 latency_qos = MAX(latency_qos, thread_qos_policy_params.qos_latency_qos[next.thep_qos]);
1440
1441 next.thep_latency_qos = latency_qos;
1442
1443 /* Calculate throughput QOS */
1444 uint32_t through_qos = requested.thrp_through_qos;
1445
1446 through_qos = MAX(through_qos, task_effective.tep_through_qos);
1447 through_qos = MAX(through_qos, thread_qos_policy_params.qos_through_qos[next.thep_qos]);
1448
1449 next.thep_through_qos = through_qos;
1450
1451 if (task_effective.tep_terminated || requested.thrp_terminated) {
1452 /* Shoot down the throttles that slow down exit or response to SIGTERM */
1453 next.thep_terminated = 1;
1454 next.thep_darwinbg = 0;
1455 next.thep_io_tier = THROTTLE_LEVEL_TIER0;
1456 next.thep_qos = THREAD_QOS_UNSPECIFIED;
1457 next.thep_latency_qos = LATENCY_QOS_TIER_UNSPECIFIED;
1458 next.thep_through_qos = THROUGHPUT_QOS_TIER_UNSPECIFIED;
1459 }
1460
1461 /*
1462 * Step 3:
1463 * Swap out old policy for new policy
1464 */
1465
1466 struct thread_effective_policy prev = thread->effective_policy;
1467
1468 thread_update_qos_cpu_time_locked(thread);
1469
1470 /* This is the point where the new values become visible to other threads */
1471 thread->effective_policy = next;
1472
1473 /*
1474 * Step 4:
1475 * Pend updates that can't be done while holding the thread lock
1476 */
1477
1478 if (prev.thep_all_sockets_bg != next.thep_all_sockets_bg)
1479 pend_token->tpt_update_sockets = 1;
1480
1481 /* TODO: Doesn't this only need to be done if the throttle went up? */
1482 if (prev.thep_io_tier != next.thep_io_tier)
1483 pend_token->tpt_update_throttle = 1;
1484
1485 /*
1486 * Check for the attributes that sfi_thread_classify() consults,
1487 * and trigger SFI re-evaluation.
1488 */
1489 if (prev.thep_qos != next.thep_qos ||
1490 prev.thep_darwinbg != next.thep_darwinbg )
1491 pend_token->tpt_update_thread_sfi = 1;
1492
1493 /*
1494 * Step 5:
1495 * Update other subsystems as necessary if something has changed
1496 */
1497
1498 /* Check for the attributes that thread_recompute_priority() consults */
1499 if (prev.thep_qos != next.thep_qos ||
1500 prev.thep_qos_relprio != next.thep_qos_relprio ||
1501 prev.thep_qos_ui_is_urgent != next.thep_qos_ui_is_urgent ||
1502 prev.thep_terminated != next.thep_terminated ||
1503 pend_token->tpt_force_recompute_pri == 1 ||
1504 recompute_priority) {
1505 thread_recompute_priority(thread);
1506 }
1507 }
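/*
 * Example of the resolution above (QoS tiers compare numerically, higher
 * meaning more important): requested LEGACY with a thrp_qos_override of
 * USER_INITIATED in a task clamped to UTILITY resolves as
 *
 *   next_qos = MAX(LEGACY, USER_INITIATED)  -> USER_INITIATED
 *   thep_qos = MIN(UTILITY, USER_INITIATED) -> UTILITY
 *
 * and thep_qos_relprio drops to 0 because the requested QoS is no longer the
 * one in effect. If the thread is also darwinbg, thep_qos is pulled down
 * further to THREAD_QOS_BACKGROUND.
 */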
1508
1509
1510 /*
1511 * Initiate a thread policy state transition on a thread with its TID
1512 * Useful if you cannot guarantee the thread won't get terminated
1513 * Precondition: No locks are held
1514 * Will take task lock - using the non-tid variant is faster
1515 * if you already have a thread ref.
1516 */
1517 void
1518 proc_set_thread_policy_with_tid(task_t task,
1519 uint64_t tid,
1520 int category,
1521 int flavor,
1522 int value)
1523 {
1524 /* takes task lock, returns ref'ed thread or NULL */
1525 thread_t thread = task_findtid(task, tid);
1526
1527 if (thread == THREAD_NULL)
1528 return;
1529
1530 proc_set_thread_policy(thread, category, flavor, value);
1531
1532 thread_deallocate(thread);
1533 }
1534
1535 /*
1536 * Initiate a thread policy transition on a thread
1537 * This path supports networking transitions (i.e. darwinbg transitions)
1538 * Precondition: No locks are held
1539 */
1540 void
1541 proc_set_thread_policy(thread_t thread,
1542 int category,
1543 int flavor,
1544 int value)
1545 {
1546 struct task_pend_token pend_token = {};
1547
1548 thread_mtx_lock(thread);
1549
1550 proc_set_thread_policy_locked(thread, category, flavor, value, 0, &pend_token);
1551
1552 thread_mtx_unlock(thread);
1553
1554 thread_policy_update_complete_unlocked(thread, &pend_token);
1555 }
1556
1557 /*
1558 * KPI for pthread kext to call to set thread base QoS values during a workq wakeup
1559 * May be called with interrupts disabled and workqueue/waitqueue/kqueue locks held
1560 *
1561 * Does NOT do update completion, so the thread MUST be in a safe place WRT
1562 * IO throttling and SFI.
1563 *
1564 * TODO: Can I assert 'it must be in a safe place'?
1565 */
1566 kern_return_t
1567 thread_set_workq_qos(thread_t thread,
1568 int qos_tier,
1569 int relprio) /* relprio is -16 to 0 */
1570 {
1571 assert(qos_tier >= 0 && qos_tier <= THREAD_QOS_LAST);
1572 assert(relprio <= 0 && relprio >= THREAD_QOS_MIN_TIER_IMPORTANCE);
1573
1574 if (!(qos_tier >= 0 && qos_tier <= THREAD_QOS_LAST))
1575 return KERN_FAILURE;
1576 if (!(relprio <= 0 && relprio >= THREAD_QOS_MIN_TIER_IMPORTANCE))
1577 return KERN_FAILURE;
1578
1579 if (qos_tier == THREAD_QOS_UNSPECIFIED) {
1580 assert(relprio == 0);
1581 if (relprio != 0)
1582 return KERN_FAILURE;
1583 }
1584
1585 assert(thread->static_param);
1586 if (!thread->static_param) {
1587 return KERN_FAILURE;
1588 }
1589
1590 /* Concern: this doesn't hold the mutex... */
1591 //if (!thread->active)
1592 // return KERN_TERMINATED;
1593
1594 struct task_pend_token pend_token = {};
1595
1596 proc_set_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_AND_RELPRIO, qos_tier, -relprio, &pend_token);
1597
1598 assert(pend_token.tpt_update_sockets == 0);
1599 /* we don't need to update throttle or sfi because pthread kext promises the thread is in a safe place */
1600 /* TODO: Do we need to update SFI to ensure it gets tagged with the AST? */
1601
1602 return KERN_SUCCESS;
1603 }
1604
1605
1606 /*
1607 * Do the things that can't be done while holding a thread mutex.
1608 * These are set up to call back into thread policy to get the latest value,
1609 * so they don't have to be synchronized with the update.
1610 * The only required semantic is 'call this sometime after updating effective policy'
1611 *
1612 * Precondition: Thread mutex is not held
1613 *
1614 * This may be called with the task lock held, but in that case it won't be
1615 * called with tpt_update_sockets set.
1616 */
1617 void
1618 thread_policy_update_complete_unlocked(thread_t thread, task_pend_token_t pend_token)
1619 {
1620 #ifdef MACH_BSD
1621 if (pend_token->tpt_update_sockets)
1622 proc_apply_task_networkbg(thread->task->bsd_info, thread);
1623 #endif /* MACH_BSD */
1624
1625 if (pend_token->tpt_update_throttle)
1626 rethrottle_thread(thread->uthread);
1627
1628 if (pend_token->tpt_update_thread_sfi)
1629 sfi_reevaluate(thread);
1630 }
1631
1632 /*
1633 * Set and update thread policy
1634 * Thread mutex might be held
1635 */
1636 static void
1637 proc_set_thread_policy_locked(thread_t thread,
1638 int category,
1639 int flavor,
1640 int value,
1641 int value2,
1642 task_pend_token_t pend_token)
1643 {
1644 spl_t s = splsched();
1645 thread_lock(thread);
1646
1647 proc_set_thread_policy_spinlocked(thread, category, flavor, value, value2, pend_token);
1648
1649 thread_unlock(thread);
1650 splx(s);
1651 }
1652
1653 /*
1654 * Set and update thread policy
1655 * Thread spinlock is held
1656 */
1657 static void
1658 proc_set_thread_policy_spinlocked(thread_t thread,
1659 int category,
1660 int flavor,
1661 int value,
1662 int value2,
1663 task_pend_token_t pend_token)
1664 {
1665 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1666 (IMPORTANCE_CODE(flavor, (category | TASK_POLICY_THREAD))) | DBG_FUNC_START,
1667 thread_tid(thread), threquested_0(thread),
1668 threquested_1(thread), value, 0);
1669
1670 thread_set_requested_policy_spinlocked(thread, category, flavor, value, value2);
1671
1672 thread_policy_update_spinlocked(thread, FALSE, pend_token);
1673
1674 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1675 (IMPORTANCE_CODE(flavor, (category | TASK_POLICY_THREAD))) | DBG_FUNC_END,
1676 thread_tid(thread), threquested_0(thread),
1677 threquested_1(thread), tpending(pend_token), 0);
1678 }
1679
1680 /*
1681 * Set the requested state for a specific flavor to a specific value.
1682 */
1683 static void
1684 thread_set_requested_policy_spinlocked(thread_t thread,
1685 int category,
1686 int flavor,
1687 int value,
1688 int value2)
1689 {
1690 int tier, passive;
1691
1692 struct thread_requested_policy requested = thread->requested_policy;
1693
1694 switch (flavor) {
1695
1696 /* Category: EXTERNAL and INTERNAL, thread and task */
1697
1698 case TASK_POLICY_DARWIN_BG:
1699 if (category == TASK_POLICY_EXTERNAL)
1700 requested.thrp_ext_darwinbg = value;
1701 else
1702 requested.thrp_int_darwinbg = value;
1703 break;
1704
1705 case TASK_POLICY_IOPOL:
1706 proc_iopol_to_tier(value, &tier, &passive);
1707 if (category == TASK_POLICY_EXTERNAL) {
1708 requested.thrp_ext_iotier = tier;
1709 requested.thrp_ext_iopassive = passive;
1710 } else {
1711 requested.thrp_int_iotier = tier;
1712 requested.thrp_int_iopassive = passive;
1713 }
1714 break;
1715
1716 case TASK_POLICY_IO:
1717 if (category == TASK_POLICY_EXTERNAL)
1718 requested.thrp_ext_iotier = value;
1719 else
1720 requested.thrp_int_iotier = value;
1721 break;
1722
1723 case TASK_POLICY_PASSIVE_IO:
1724 if (category == TASK_POLICY_EXTERNAL)
1725 requested.thrp_ext_iopassive = value;
1726 else
1727 requested.thrp_int_iopassive = value;
1728 break;
1729
1730 /* Category: ATTRIBUTE, thread only */
1731
1732 case TASK_POLICY_PIDBIND_BG:
1733 assert(category == TASK_POLICY_ATTRIBUTE);
1734 requested.thrp_pidbind_bg = value;
1735 break;
1736
1737 case TASK_POLICY_LATENCY_QOS:
1738 assert(category == TASK_POLICY_ATTRIBUTE);
1739 requested.thrp_latency_qos = value;
1740 break;
1741
1742 case TASK_POLICY_THROUGH_QOS:
1743 assert(category == TASK_POLICY_ATTRIBUTE);
1744 requested.thrp_through_qos = value;
1745 break;
1746
1747 case TASK_POLICY_QOS:
1748 assert(category == TASK_POLICY_ATTRIBUTE);
1749 requested.thrp_qos = value;
1750 break;
1751
1752 case TASK_POLICY_QOS_OVERRIDE:
1753 assert(category == TASK_POLICY_ATTRIBUTE);
1754 requested.thrp_qos_override = value;
1755 break;
1756
1757 case TASK_POLICY_QOS_AND_RELPRIO:
1758 assert(category == TASK_POLICY_ATTRIBUTE);
1759 requested.thrp_qos = value;
1760 requested.thrp_qos_relprio = value2;
1761 DTRACE_BOOST3(qos_set, uint64_t, thread->thread_id, int, requested.thrp_qos, int, requested.thrp_qos_relprio);
1762 break;
1763
1764 case TASK_POLICY_QOS_PROMOTE:
1765 assert(category == TASK_POLICY_ATTRIBUTE);
1766 requested.thrp_qos_promote = value;
1767 break;
1768
1769 case TASK_POLICY_QOS_IPC_OVERRIDE:
1770 assert(category == TASK_POLICY_ATTRIBUTE);
1771 requested.thrp_qos_ipc_override = value;
1772 break;
1773
1774 case TASK_POLICY_TERMINATED:
1775 assert(category == TASK_POLICY_ATTRIBUTE);
1776 requested.thrp_terminated = value;
1777 break;
1778
1779 default:
1780 panic("unknown task policy: %d %d %d", category, flavor, value);
1781 break;
1782 }
1783
1784 thread->requested_policy = requested;
1785 }
1786
1787 /*
1788 * Gets what you set. Effective values may be different.
1789 * Precondition: No locks are held
1790 */
1791 int
1792 proc_get_thread_policy(thread_t thread,
1793 int category,
1794 int flavor)
1795 {
1796 int value = 0;
1797 thread_mtx_lock(thread);
1798 value = proc_get_thread_policy_locked(thread, category, flavor, NULL);
1799 thread_mtx_unlock(thread);
1800 return value;
1801 }
1802
1803 static int
1804 proc_get_thread_policy_locked(thread_t thread,
1805 int category,
1806 int flavor,
1807 int* value2)
1808 {
1809 int value = 0;
1810
1811 spl_t s = splsched();
1812 thread_lock(thread);
1813
1814 value = thread_get_requested_policy_spinlocked(thread, category, flavor, value2);
1815
1816 thread_unlock(thread);
1817 splx(s);
1818
1819 return value;
1820 }
1821
1822 /*
1823 * Gets what you set. Effective values may be different.
1824 */
1825 static int
1826 thread_get_requested_policy_spinlocked(thread_t thread,
1827 int category,
1828 int flavor,
1829 int* value2)
1830 {
1831 int value = 0;
1832
1833 struct thread_requested_policy requested = thread->requested_policy;
1834
1835 switch (flavor) {
1836 case TASK_POLICY_DARWIN_BG:
1837 if (category == TASK_POLICY_EXTERNAL)
1838 value = requested.thrp_ext_darwinbg;
1839 else
1840 value = requested.thrp_int_darwinbg;
1841 break;
1842 case TASK_POLICY_IOPOL:
1843 if (category == TASK_POLICY_EXTERNAL)
1844 value = proc_tier_to_iopol(requested.thrp_ext_iotier,
1845 requested.thrp_ext_iopassive);
1846 else
1847 value = proc_tier_to_iopol(requested.thrp_int_iotier,
1848 requested.thrp_int_iopassive);
1849 break;
1850 case TASK_POLICY_IO:
1851 if (category == TASK_POLICY_EXTERNAL)
1852 value = requested.thrp_ext_iotier;
1853 else
1854 value = requested.thrp_int_iotier;
1855 break;
1856 case TASK_POLICY_PASSIVE_IO:
1857 if (category == TASK_POLICY_EXTERNAL)
1858 value = requested.thrp_ext_iopassive;
1859 else
1860 value = requested.thrp_int_iopassive;
1861 break;
1862 case TASK_POLICY_QOS:
1863 assert(category == TASK_POLICY_ATTRIBUTE);
1864 value = requested.thrp_qos;
1865 break;
1866 case TASK_POLICY_QOS_OVERRIDE:
1867 assert(category == TASK_POLICY_ATTRIBUTE);
1868 value = requested.thrp_qos_override;
1869 break;
1870 case TASK_POLICY_LATENCY_QOS:
1871 assert(category == TASK_POLICY_ATTRIBUTE);
1872 value = requested.thrp_latency_qos;
1873 break;
1874 case TASK_POLICY_THROUGH_QOS:
1875 assert(category == TASK_POLICY_ATTRIBUTE);
1876 value = requested.thrp_through_qos;
1877 break;
1878 case TASK_POLICY_QOS_AND_RELPRIO:
1879 assert(category == TASK_POLICY_ATTRIBUTE);
1880 assert(value2 != NULL);
1881 value = requested.thrp_qos;
1882 *value2 = requested.thrp_qos_relprio;
1883 break;
1884 case TASK_POLICY_QOS_PROMOTE:
1885 assert(category == TASK_POLICY_ATTRIBUTE);
1886 value = requested.thrp_qos_promote;
1887 break;
1888 case TASK_POLICY_QOS_IPC_OVERRIDE:
1889 assert(category == TASK_POLICY_ATTRIBUTE);
1890 value = requested.thrp_qos_ipc_override;
1891 break;
1892 case TASK_POLICY_TERMINATED:
1893 assert(category == TASK_POLICY_ATTRIBUTE);
1894 value = requested.thrp_terminated;
1895 break;
1896
1897 default:
1898 panic("unknown policy_flavor %d", flavor);
1899 break;
1900 }
1901
1902 return value;
1903 }
1904
1905 /*
1906 * Gets what is actually in effect, for subsystems which pull policy instead of receiving updates.
1907 *
1908 * NOTE: This accessor does not take the task or thread lock.
1909 * Notifications of state updates need to be externally synchronized with state queries.
1910 * This routine *MUST* remain interrupt safe, as it is potentially invoked
1911 * within the context of a timer interrupt.
1912 *
1913 * TODO: I think we can get away with architecting this such that we don't need to look at the task ever.
1914 * Is that a good idea? Maybe it's best to avoid evaluate-all-the-threads updates.
1915 * I don't think that cost is worth not having the right answer.
1916 */
1917 int
1918 proc_get_effective_thread_policy(thread_t thread,
1919 int flavor)
1920 {
1921 int value = 0;
1922
1923 switch (flavor) {
1924 case TASK_POLICY_DARWIN_BG:
1925 /*
1926 * This call is used within the timer layer, as well as
1927 * for prioritizing requests to the graphics system.
1928 * It also informs SFI and originator-bg-state.
1929 * Returns 1 for background mode, 0 for normal mode
1930 */
1931
1932 value = thread->effective_policy.thep_darwinbg ? 1 : 0;
1933 break;
1934 case TASK_POLICY_IO:
1935 /*
1936 * The I/O system calls here to find out what throttling tier to apply to an operation.
1937 * Returns THROTTLE_LEVEL_* values
1938 */
1939 value = thread->effective_policy.thep_io_tier;
1940 if (thread->iotier_override != THROTTLE_LEVEL_NONE)
1941 value = MIN(value, thread->iotier_override);
1942 break;
1943 case TASK_POLICY_PASSIVE_IO:
1944 /*
1945 * The I/O system calls here to find out whether an operation should be passive.
1946 * (i.e. not cause operations with lower throttle tiers to be throttled)
1947 * Returns 1 for passive mode, 0 for normal mode
1948 *
1949 * If an override is causing IO to go into a lower tier, we also set
1950 * the passive bit so that a thread doesn't end up stuck in its own throttle
1951 * window when the override goes away.
1952 */
1953 value = thread->effective_policy.thep_io_passive ? 1 : 0;
1954 if (thread->iotier_override != THROTTLE_LEVEL_NONE &&
1955 thread->iotier_override < thread->effective_policy.thep_io_tier)
1956 value = 1;
1957 break;
1958 case TASK_POLICY_ALL_SOCKETS_BG:
1959 /*
1960 * do_background_socket() calls this to determine whether
1961 * it should change the thread's sockets
1962 * Returns 1 for background mode, 0 for normal mode
1963 * This consults both thread and task so un-DBGing a thread while the task is BG
1964 * doesn't get you out of the network throttle.
1965 */
1966 value = (thread->effective_policy.thep_all_sockets_bg ||
1967 thread->task->effective_policy.tep_all_sockets_bg) ? 1 : 0;
1968 break;
1969 case TASK_POLICY_NEW_SOCKETS_BG:
1970 /*
1971 * socreate() calls this to determine if it should mark a new socket as background
1972 * Returns 1 for background mode, 0 for normal mode
1973 */
1974 value = thread->effective_policy.thep_new_sockets_bg ? 1 : 0;
1975 break;
1976 case TASK_POLICY_LATENCY_QOS:
1977 /*
1978 * Timer arming calls into here to find out the timer coalescing level
1979 * Returns a latency QoS tier (0-6)
1980 */
1981 value = thread->effective_policy.thep_latency_qos;
1982 break;
1983 case TASK_POLICY_THROUGH_QOS:
1984 /*
1985 * This value is passed into the urgency callout from the scheduler
1986 * to the performance management subsystem.
1987 *
1988 * Returns a throughput QoS tier (0-6)
1989 */
1990 value = thread->effective_policy.thep_through_qos;
1991 break;
1992 case TASK_POLICY_QOS:
1993 /*
1994 * This is communicated to the performance management layer and SFI.
1995 *
1996 * Returns a QoS policy tier
1997 */
1998 value = thread->effective_policy.thep_qos;
1999 break;
2000 default:
2001 panic("unknown thread policy flavor %d", flavor);
2002 break;
2003 }
2004
2005 return value;
2006 }
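
/*
 * Illustrative sketch (not part of the build): how a pull-based subsystem such
 * as the I/O layer might consult the effective policy. This is a minimal
 * example assuming the caller can guarantee the thread stays valid; no locks
 * are taken, per the note above.
 *
 *     int io_tier    = proc_get_effective_thread_policy(thread, TASK_POLICY_IO);
 *     int io_passive = proc_get_effective_thread_policy(thread, TASK_POLICY_PASSIVE_IO);
 *
 *     if (!io_passive) {
 *         // throttle the operation according to io_tier (a THROTTLE_LEVEL_* value)
 *     }
 */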
2007
2008
2009 /*
2010 * The (integer_t) casts limit the number of bits we can fit here;
2011 * this interface is deprecated and replaced by the _EXT struct(?)
2012 */
2013 static void
2014 proc_get_thread_policy_bitfield(thread_t thread, thread_policy_state_t info)
2015 {
2016 uint64_t bits = 0;
2017 struct thread_requested_policy requested = thread->requested_policy;
2018
2019 bits |= (requested.thrp_int_darwinbg ? POLICY_REQ_INT_DARWIN_BG : 0);
2020 bits |= (requested.thrp_ext_darwinbg ? POLICY_REQ_EXT_DARWIN_BG : 0);
2021 bits |= (requested.thrp_int_iotier ? (((uint64_t)requested.thrp_int_iotier) << POLICY_REQ_INT_IO_TIER_SHIFT) : 0);
2022 bits |= (requested.thrp_ext_iotier ? (((uint64_t)requested.thrp_ext_iotier) << POLICY_REQ_EXT_IO_TIER_SHIFT) : 0);
2023 bits |= (requested.thrp_int_iopassive ? POLICY_REQ_INT_PASSIVE_IO : 0);
2024 bits |= (requested.thrp_ext_iopassive ? POLICY_REQ_EXT_PASSIVE_IO : 0);
2025
2026 bits |= (requested.thrp_qos ? (((uint64_t)requested.thrp_qos) << POLICY_REQ_TH_QOS_SHIFT) : 0);
2027 bits |= (requested.thrp_qos_override ? (((uint64_t)requested.thrp_qos_override) << POLICY_REQ_TH_QOS_OVER_SHIFT) : 0);
2028
2029 bits |= (requested.thrp_pidbind_bg ? POLICY_REQ_PIDBIND_BG : 0);
2030
2031 bits |= (requested.thrp_latency_qos ? (((uint64_t)requested.thrp_latency_qos) << POLICY_REQ_BASE_LATENCY_QOS_SHIFT) : 0);
2032 bits |= (requested.thrp_through_qos ? (((uint64_t)requested.thrp_through_qos) << POLICY_REQ_BASE_THROUGH_QOS_SHIFT) : 0);
2033
2034 info->requested = (integer_t) bits;
2035 bits = 0;
2036
2037 struct thread_effective_policy effective = thread->effective_policy;
2038
2039 bits |= (effective.thep_darwinbg ? POLICY_EFF_DARWIN_BG : 0);
2040
2041 bits |= (effective.thep_io_tier ? (((uint64_t)effective.thep_io_tier) << POLICY_EFF_IO_TIER_SHIFT) : 0);
2042 bits |= (effective.thep_io_passive ? POLICY_EFF_IO_PASSIVE : 0);
2043 bits |= (effective.thep_all_sockets_bg ? POLICY_EFF_ALL_SOCKETS_BG : 0);
2044 bits |= (effective.thep_new_sockets_bg ? POLICY_EFF_NEW_SOCKETS_BG : 0);
2045
2046 bits |= (effective.thep_qos ? (((uint64_t)effective.thep_qos) << POLICY_EFF_TH_QOS_SHIFT) : 0);
2047
2048 bits |= (effective.thep_latency_qos ? (((uint64_t)effective.thep_latency_qos) << POLICY_EFF_LATENCY_QOS_SHIFT) : 0);
2049 bits |= (effective.thep_through_qos ? (((uint64_t)effective.thep_through_qos) << POLICY_EFF_THROUGH_QOS_SHIFT) : 0);
2050
2051 info->effective = (integer_t)bits;
2052 bits = 0;
2053
2054 info->pending = 0;
2055 }
2056
2057 /*
2058 * Sneakily trace either the task and thread requested
2059 * or just the thread requested, depending on whether we have enough room.
2060 * We do have room on LP64. On LP32, we have to split it between two uintptr_t's.
2061 *
2062 * LP32 LP64
2063 * threquested_0(thread) thread[0] thread[0]
2064 * threquested_1(thread) thread[1] task[0]
2065 *
2066 */
2067
2068 uintptr_t
2069 threquested_0(thread_t thread)
2070 {
2071 static_assert(sizeof(struct thread_requested_policy) == sizeof(uint64_t), "size invariant violated");
2072
2073 uintptr_t* raw = (uintptr_t*)(void*)&thread->requested_policy;
2074
2075 return raw[0];
2076 }
2077
2078 uintptr_t
2079 threquested_1(thread_t thread)
2080 {
2081 #if defined __LP64__
2082 return *(uintptr_t*)&thread->task->requested_policy;
2083 #else
2084 uintptr_t* raw = (uintptr_t*)(void*)&thread->requested_policy;
2085 return raw[1];
2086 #endif
2087 }
2088
2089 uintptr_t
2090 theffective_0(thread_t thread)
2091 {
2092 static_assert(sizeof(struct thread_effective_policy) == sizeof(uint64_t), "size invariant violated");
2093
2094 uintptr_t* raw = (uintptr_t*)(void*)&thread->effective_policy;
2095 return raw[0];
2096 }
2097
2098 uintptr_t
2099 theffective_1(thread_t thread)
2100 {
2101 #if defined __LP64__
2102 return *(uintptr_t*)&thread->task->effective_policy;
2103 #else
2104 uintptr_t* raw = (uintptr_t*)(void*)&thread->effective_policy;
2105 return raw[1];
2106 #endif
2107 }
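
/*
 * Illustrative sketch (not part of the build): these accessors are sized so
 * the policy words can be dropped straight into tracepoint arguments, e.g.
 * (hypothetical trace site, mirroring the IMP_MAIN_THREAD_QOS tracepoints
 * later in this file; 'tracepoint_code' is a placeholder):
 *
 *     KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, tracepoint_code,
 *         thread_tid(thread),
 *         threquested_0(thread), threquested_1(thread),
 *         theffective_0(thread), 0);
 *
 * On LP64 the first requested word is the thread's policy and the second the
 * task's; on LP32 the thread's 64-bit policy is split across the two words.
 */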
2108
2109
2110 /*
2111 * Set an override on the thread which is consulted with a
2112 * higher priority than the task/thread policy. This should
2113 * only be set for temporary grants until the thread
2114 * returns to the userspace boundary.
2115 *
2116 * We use atomic operations to swap in the override, with
2117 * the assumption that the thread itself can
2118 * read the override and clear it on return to userspace.
2119 *
2120 * No locking is performed, since it is acceptable to see
2121 * a stale override for one loop through throttle_lowpri_io().
2122 * However a thread reference must be held on the thread.
2123 */
2124
2125 void set_thread_iotier_override(thread_t thread, int policy)
2126 {
2127 int current_override;
2128
2129 /* Let most aggressive I/O policy win until user boundary */
2130 do {
2131 current_override = thread->iotier_override;
2132
2133 if (current_override != THROTTLE_LEVEL_NONE)
2134 policy = MIN(current_override, policy);
2135
2136 if (current_override == policy) {
2137 /* no effective change */
2138 return;
2139 }
2140 } while (!OSCompareAndSwap(current_override, policy, &thread->iotier_override));
2141
2142 /*
2143 * Since the thread may be currently throttled,
2144 * re-evaluate tiers and potentially break out
2145 * of an msleep
2146 */
2147 rethrottle_thread(thread->uthread);
2148 }
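
/*
 * Illustrative sketch (not part of the build): a subsystem granting a
 * temporary I/O boost to a thread it holds a reference on. THROTTLE_LEVEL_TIER0
 * is assumed here to be the most favorable tier of the THROTTLE_LEVEL_*
 * enumeration.
 *
 *     // boost until the thread next crosses the user boundary
 *     set_thread_iotier_override(thread, THROTTLE_LEVEL_TIER0);
 *
 * Repeated calls are safe: the CAS loop above keeps the most aggressive
 * (numerically lowest) tier and returns early when nothing would change.
 */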
2149
2150 /*
2151 * Userspace synchronization routines (like pthread mutexes, pthread reader-writer locks,
2152 * semaphores, dispatch_sync) may result in priority inversions where a higher priority thread
2153 * (i.e. by scheduler priority, I/O tier, or QoS tier) is waiting on a resource owned by a lower
2154 * priority thread. In these cases, we attempt to propagate the priority token, as long
2155 * as the subsystem informs us of the relationships between the threads. The userspace
2156 * synchronization subsystem should maintain the information of owner->resource and
2157 * resource->waiters itself.
2158 */
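
/*
 * Illustrative sketch (not part of the build): the shape of the calls such a
 * subsystem might make when a high-QoS waiter blocks on a mutex owned by a
 * lower-QoS thread. 'owner_task', 'owner_thread', 'waiter_qos' and 'mutex_addr'
 * are hypothetical names for state the subsystem already tracks; the tid
 * argument is unused when an explicit thread is passed.
 *
 *     // waiter blocks: push its QoS onto the owner for this resource
 *     proc_thread_qos_add_override(owner_task, owner_thread, 0, waiter_qos,
 *         TRUE, mutex_addr, THREAD_QOS_OVERRIDE_TYPE_PTHREAD_MUTEX);
 *
 *     // owner unlocks: drop the override granted for that resource
 *     proc_thread_qos_remove_override(owner_task, owner_thread, 0,
 *         mutex_addr, THREAD_QOS_OVERRIDE_TYPE_PTHREAD_MUTEX);
 */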
2159
2160 /*
2161 * This helper canonicalizes the resource/resource_type given the current qos_override_mode
2162 * in effect. Note that wildcards (THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD) may need
2163 * to be handled specially in the future, but for now it's fine to slam
2164 * *resource to USER_ADDR_NULL even if it was previously a wildcard.
2165 */
2166 static void canonicalize_resource_and_type(user_addr_t *resource, int *resource_type) {
2167 if (qos_override_mode == QOS_OVERRIDE_MODE_OVERHANG_PEAK || qos_override_mode == QOS_OVERRIDE_MODE_IGNORE_OVERRIDE) {
2168 /* Map all input resource/type to a single one */
2169 *resource = USER_ADDR_NULL;
2170 *resource_type = THREAD_QOS_OVERRIDE_TYPE_UNKNOWN;
2171 } else if (qos_override_mode == QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE) {
2172 /* no transform */
2173 } else if (qos_override_mode == QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_IGNORE_DISPATCH) {
2174 /* Map all dispatch overrides to a single one, to avoid memory overhead */
2175 if (*resource_type == THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE) {
2176 *resource = USER_ADDR_NULL;
2177 }
2178 } else if (qos_override_mode == QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_SINGLE_MUTEX_OVERRIDE) {
2179 /* Map all mutex overrides to a single one, to avoid memory overhead */
2180 if (*resource_type == THREAD_QOS_OVERRIDE_TYPE_PTHREAD_MUTEX) {
2181 *resource = USER_ADDR_NULL;
2182 }
2183 }
2184 }
2185
2186 /* This helper routine finds an existing override for the resource, if any. Locking should be done by the caller. */
2187 static struct thread_qos_override *
2188 find_qos_override(thread_t thread,
2189 user_addr_t resource,
2190 int resource_type)
2191 {
2192 struct thread_qos_override *override;
2193
2194 override = thread->overrides;
2195 while (override) {
2196 if (override->override_resource == resource &&
2197 override->override_resource_type == resource_type) {
2198 return override;
2199 }
2200
2201 override = override->override_next;
2202 }
2203
2204 return NULL;
2205 }
2206
2207 static void
2208 find_and_decrement_qos_override(thread_t thread,
2209 user_addr_t resource,
2210 int resource_type,
2211 boolean_t reset,
2212 struct thread_qos_override **free_override_list)
2213 {
2214 struct thread_qos_override *override, *override_prev;
2215
2216 override_prev = NULL;
2217 override = thread->overrides;
2218 while (override) {
2219 struct thread_qos_override *override_next = override->override_next;
2220
2221 if ((THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD == resource || override->override_resource == resource) &&
2222 (THREAD_QOS_OVERRIDE_TYPE_WILDCARD == resource_type || override->override_resource_type == resource_type)) {
2223
2224 if (reset) {
2225 override->override_contended_resource_count = 0;
2226 } else {
2227 override->override_contended_resource_count--;
2228 }
2229
2230 if (override->override_contended_resource_count == 0) {
2231 if (override_prev == NULL) {
2232 thread->overrides = override_next;
2233 } else {
2234 override_prev->override_next = override_next;
2235 }
2236
2237 /* Add to out-param for later zfree */
2238 override->override_next = *free_override_list;
2239 *free_override_list = override;
2240 } else {
2241 override_prev = override;
2242 }
2243
2244 if (THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD != resource) {
2245 return;
2246 }
2247 } else {
2248 override_prev = override;
2249 }
2250
2251 override = override_next;
2252 }
2253 }
2254
2255 /* This helper recalculates the current requested override using the policy selected at boot */
2256 static int
2257 calculate_requested_qos_override(thread_t thread)
2258 {
2259 if (qos_override_mode == QOS_OVERRIDE_MODE_IGNORE_OVERRIDE) {
2260 return THREAD_QOS_UNSPECIFIED;
2261 }
2262
2263 /* iterate over all overrides and calculate MAX */
2264 struct thread_qos_override *override;
2265 int qos_override = THREAD_QOS_UNSPECIFIED;
2266
2267 override = thread->overrides;
2268 while (override) {
2269 if (qos_override_mode != QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_IGNORE_DISPATCH ||
2270 override->override_resource_type != THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE) {
2271 qos_override = MAX(qos_override, override->override_qos);
2272 }
2273
2274 override = override->override_next;
2275 }
2276
2277 return qos_override;
2278 }
2279
2280 /*
2281 * Returns:
2282 * - 0 on success
2283 * - EINVAL if some invalid input was passed
2284 * - EFAULT if user_lock_addr != NULL and needs to be faulted (userland has to
2285 * fault and retry)
2286 * - ESTALE if user_lock_addr != NULL &&
2287 * ulock_owner_value_to_port_name(*user_lock_addr) != user_lock_owner
2288 */
2289 static int
2290 proc_thread_qos_add_override_internal(thread_t thread,
2291 int override_qos,
2292 boolean_t first_override_for_resource,
2293 user_addr_t resource,
2294 int resource_type,
2295 user_addr_t user_lock_addr,
2296 mach_port_name_t user_lock_owner)
2297 {
2298 struct task_pend_token pend_token = {};
2299 int rc = 0;
2300
2301 thread_mtx_lock(thread);
2302
2303 KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_ADD_OVERRIDE)) | DBG_FUNC_START,
2304 thread_tid(thread), override_qos, first_override_for_resource ? 1 : 0, 0, 0);
2305
2306 DTRACE_BOOST5(qos_add_override_pre, uint64_t, thread_tid(thread),
2307 uint64_t, thread->requested_policy.thrp_qos,
2308 uint64_t, thread->effective_policy.thep_qos,
2309 int, override_qos, boolean_t, first_override_for_resource);
2310
2311 struct thread_qos_override *override;
2312 struct thread_qos_override *override_new = NULL;
2313 int new_qos_override, prev_qos_override;
2314 int new_effective_qos;
2315
2316 canonicalize_resource_and_type(&resource, &resource_type);
2317
2318 override = find_qos_override(thread, resource, resource_type);
2319 if (first_override_for_resource && !override) {
2320 /* We need to allocate a new object. Drop the thread lock and
2321 * recheck afterwards in case someone else added the override
2322 */
2323 thread_mtx_unlock(thread);
2324 override_new = zalloc(thread_qos_override_zone);
2325 thread_mtx_lock(thread);
2326 override = find_qos_override(thread, resource, resource_type);
2327 }
2328 if (user_lock_addr) {
2329 uint64_t val;
2330 /* Work around the lack of explicit support for 'no-fault copyin'
2331 * <rdar://problem/24999882>, as disabling preemption prevents paging in
2332 */
2333 disable_preemption();
2334 rc = copyin_word(user_lock_addr, &val, sizeof(user_lock_owner));
2335 enable_preemption();
2336 if (rc == 0 && ulock_owner_value_to_port_name((uint32_t)val) != user_lock_owner) {
2337 rc = ESTALE;
2338 }
2339 if (rc) {
2340 prev_qos_override = proc_get_thread_policy_locked(thread,
2341 TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_OVERRIDE, NULL);
2342 new_qos_override = prev_qos_override;
2343 new_effective_qos = proc_get_effective_thread_policy(thread, TASK_POLICY_QOS);
2344 thread_mtx_unlock(thread);
2345 goto out;
2346 }
2347 }
2348 if (first_override_for_resource && override) {
2349 /* Someone else already allocated while the thread lock was dropped */
2350 override->override_contended_resource_count++;
2351 } else if (!override && override_new) {
2352 override = override_new;
2353 override_new = NULL;
2354 override->override_next = thread->overrides;
2355 /* since first_override_for_resource was TRUE */
2356 override->override_contended_resource_count = 1;
2357 override->override_resource = resource;
2358 override->override_resource_type = resource_type;
2359 override->override_qos = THREAD_QOS_UNSPECIFIED;
2360 thread->overrides = override;
2361 }
2362
2363 if (override) {
2364 if (override->override_qos == THREAD_QOS_UNSPECIFIED)
2365 override->override_qos = override_qos;
2366 else
2367 override->override_qos = MAX(override->override_qos, override_qos);
2368 }
2369
2370 /* Determine how to combine the various overrides into a single current
2371 * requested override
2372 */
2373 new_qos_override = calculate_requested_qos_override(thread);
2374
2375 prev_qos_override = proc_get_thread_policy_locked(thread,
2376 TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_OVERRIDE, NULL);
2377
2378 if (new_qos_override != prev_qos_override) {
2379 proc_set_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE,
2380 TASK_POLICY_QOS_OVERRIDE,
2381 new_qos_override, 0, &pend_token);
2382 }
2383
2384 new_effective_qos = proc_get_effective_thread_policy(thread, TASK_POLICY_QOS);
2385
2386 thread_mtx_unlock(thread);
2387
2388 thread_policy_update_complete_unlocked(thread, &pend_token);
2389
2390 out:
2391 if (override_new) {
2392 zfree(thread_qos_override_zone, override_new);
2393 }
2394
2395 DTRACE_BOOST4(qos_add_override_post, int, prev_qos_override,
2396 int, new_qos_override, int, new_effective_qos, int, rc);
2397
2398 KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_ADD_OVERRIDE)) | DBG_FUNC_END,
2399 new_qos_override, resource, resource_type, 0, 0);
2400
2401 return rc;
2402 }
2403
2404 int
2405 proc_thread_qos_add_override_check_owner(thread_t thread,
2406 int override_qos,
2407 boolean_t first_override_for_resource,
2408 user_addr_t resource,
2409 int resource_type,
2410 user_addr_t user_lock_addr,
2411 mach_port_name_t user_lock_owner)
2412 {
2413 return proc_thread_qos_add_override_internal(thread, override_qos,
2414 first_override_for_resource, resource, resource_type,
2415 user_lock_addr, user_lock_owner);
2416 }
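
/*
 * Illustrative sketch (not part of the build): how a ulock-style caller might
 * consume the return codes documented above. 'owner_thread', 'waiter_qos',
 * 'lock_addr' and 'owner_name' are hypothetical values supplied by the caller.
 *
 *     int err = proc_thread_qos_add_override_check_owner(owner_thread,
 *         waiter_qos, TRUE, lock_addr,
 *         THREAD_QOS_OVERRIDE_TYPE_PTHREAD_MUTEX, lock_addr, owner_name);
 *
 *     if (err == EFAULT) {
 *         // lock word not resident: userland must fault it in and retry
 *     } else if (err == ESTALE) {
 *         // the lock changed owners underneath us: re-read the lock word,
 *         // re-resolve the owner, and retry against the new owner
 *     }
 */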
2417
2418 boolean_t
2419 proc_thread_qos_add_override(task_t task,
2420 thread_t thread,
2421 uint64_t tid,
2422 int override_qos,
2423 boolean_t first_override_for_resource,
2424 user_addr_t resource,
2425 int resource_type)
2426 {
2427 boolean_t has_thread_reference = FALSE;
2428 int rc = 0;
2429
2430 if (thread == THREAD_NULL) {
2431 thread = task_findtid(task, tid);
2432 /* returns referenced thread */
2433
2434 if (thread == THREAD_NULL) {
2435 KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_ADD_OVERRIDE)) | DBG_FUNC_NONE,
2436 tid, 0, 0xdead, 0, 0);
2437 return FALSE;
2438 }
2439 has_thread_reference = TRUE;
2440 } else {
2441 assert(thread->task == task);
2442 }
2443 rc = proc_thread_qos_add_override_internal(thread, override_qos,
2444 first_override_for_resource, resource, resource_type, 0, 0);
2445 if (has_thread_reference) {
2446 thread_deallocate(thread);
2447 }
2448
2449 return rc == 0;
2450 }
2451
2452 static int
2453 proc_thread_qos_remove_override_internal(thread_t thread,
2454 user_addr_t resource,
2455 int resource_type,
2456 boolean_t reset,
2457 boolean_t squash)
2458 {
2459 struct task_pend_token pend_token = {};
2460
2461 struct thread_qos_override *deferred_free_override_list = NULL;
2462 int new_qos_override, prev_qos_override, new_effective_qos, prev_qos;
2463 int new_qos = THREAD_QOS_UNSPECIFIED;
2464
2465 thread_mtx_lock(thread);
2466
2467 canonicalize_resource_and_type(&resource, &resource_type);
2468
2469 find_and_decrement_qos_override(thread, resource, resource_type, reset, &deferred_free_override_list);
2470
2471 KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_REMOVE_OVERRIDE)) | DBG_FUNC_START,
2472 thread_tid(thread), resource, reset, 0, 0);
2473
2474 DTRACE_BOOST3(qos_remove_override_pre, uint64_t, thread_tid(thread),
2475 uint64_t, thread->requested_policy.thrp_qos,
2476 uint64_t, thread->effective_policy.thep_qos);
2477
2478 /* Determine how to combine the various overrides into a single current requested override */
2479 new_qos_override = calculate_requested_qos_override(thread);
2480
2481 spl_t s = splsched();
2482 thread_lock(thread);
2483
2484 /*
2485 * The override chain, and therefore the value of the current override, is protected by the thread mutex,
2486 * so we can do a get/set without races. However, the rest of thread policy is locked under the spinlock.
2487 * This means you can't change the current override from a spinlock-only setter.
2488 */
2489 prev_qos_override = thread_get_requested_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_OVERRIDE, NULL);
2490
2491 if (squash) {
2492 /*
2493 * Remove the specified overrides, and set the current override as the new base QoS.
2494 * Return the new QoS value.
2495 */
2496 prev_qos = thread_get_requested_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS, NULL);
2497
2498 new_qos = MAX(prev_qos, prev_qos_override);
2499 if (new_qos != prev_qos)
2500 proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS, new_qos, 0, &pend_token);
2501 }
2502
2503 if (new_qos_override != prev_qos_override)
2504 proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_OVERRIDE, new_qos_override, 0, &pend_token);
2505
2506 new_effective_qos = proc_get_effective_thread_policy(thread, TASK_POLICY_QOS);
2507
2508 thread_unlock(thread);
2509 splx(s);
2510
2511 thread_mtx_unlock(thread);
2512
2513 thread_policy_update_complete_unlocked(thread, &pend_token);
2514
2515 while (deferred_free_override_list) {
2516 struct thread_qos_override *override_next = deferred_free_override_list->override_next;
2517
2518 zfree(thread_qos_override_zone, deferred_free_override_list);
2519 deferred_free_override_list = override_next;
2520 }
2521
2522 DTRACE_BOOST3(qos_remove_override_post, int, prev_qos_override,
2523 int, new_qos_override, int, new_effective_qos);
2524
2525 KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_REMOVE_OVERRIDE)) | DBG_FUNC_END,
2526 thread_tid(thread), squash, 0, 0, 0);
2527
2528 return new_qos;
2529 }
2530
2531 boolean_t
2532 proc_thread_qos_remove_override(task_t task,
2533 thread_t thread,
2534 uint64_t tid,
2535 user_addr_t resource,
2536 int resource_type)
2537 {
2538 boolean_t has_thread_reference = FALSE;
2539
2540 if (thread == THREAD_NULL) {
2541 thread = task_findtid(task, tid);
2542 /* returns referenced thread */
2543
2544 if (thread == THREAD_NULL) {
2545 KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_REMOVE_OVERRIDE)) | DBG_FUNC_NONE,
2546 tid, 0, 0xdead, 0, 0);
2547 return FALSE;
2548 }
2549 has_thread_reference = TRUE;
2550 } else {
2551 assert(task == thread->task);
2552 }
2553
2554 proc_thread_qos_remove_override_internal(thread, resource, resource_type, FALSE, FALSE);
2555
2556 if (has_thread_reference)
2557 thread_deallocate(thread);
2558
2559 return TRUE;
2560 }
2561
2562 boolean_t
2563 proc_thread_qos_reset_override(task_t task,
2564 thread_t thread,
2565 uint64_t tid,
2566 user_addr_t resource,
2567 int resource_type)
2568
2569 {
2570 boolean_t has_thread_reference = FALSE;
2571
2572 if (thread == THREAD_NULL) {
2573 thread = task_findtid(task, tid);
2574 /* returns referenced thread */
2575
2576 if (thread == THREAD_NULL) {
2577 KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_REMOVE_OVERRIDE)) | DBG_FUNC_NONE,
2578 tid, 0, 0xdead, 0, 0);
2579 return FALSE;
2580 }
2581 has_thread_reference = TRUE;
2582 } else {
2583 assert(task == thread->task);
2584 }
2585
2586 proc_thread_qos_remove_override_internal(thread, resource, resource_type, TRUE, FALSE);
2587
2588 if (has_thread_reference)
2589 thread_deallocate(thread);
2590
2591 return TRUE;
2592 }
2593
2594 /*
2595 * Clears the requested overrides, and replaces the current QoS with the max
2596 * of the current QoS and the current override, then returns the new QoS.
2597 *
2598 * This is useful for resetting overrides before parking a workqueue thread,
2599 * while avoiding a priority drop (and preemption) right before the park.
2600 *
2601 * Called without any locks held.
2602 */
2603 int
2604 proc_thread_qos_squash_override(thread_t thread, user_addr_t resource, int resource_type)
2605 {
2606 return proc_thread_qos_remove_override_internal(thread, resource, resource_type, TRUE, TRUE);
2607 }
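
/*
 * Illustrative sketch (not part of the build): the intended use right before a
 * workqueue thread parks. Using the wildcard resource/type here is an
 * assumption about how a caller would clear everything at once.
 *
 *     int new_qos = proc_thread_qos_squash_override(current_thread(),
 *         THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD,
 *         THREAD_QOS_OVERRIDE_TYPE_WILDCARD);
 *
 *     // report new_qos back to userspace so it knows the thread's base QoS
 *     // after the overrides were folded in
 */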
2608
2609 /* Deallocate before thread termination */
2610 void proc_thread_qos_deallocate(thread_t thread)
2611 {
2612 /*
2613 * There are no more references to this thread,
2614 * therefore this thread must not own any more locks,
2615 * therefore there must not be any more user promotions.
2616 */
2617 assert(thread->user_promotions == 0);
2618 assert(thread->requested_policy.thrp_qos_promote == THREAD_QOS_UNSPECIFIED);
2619 assert(thread->user_promotion_basepri == 0);
2620
2621 /* This thread must have no more IPC overrides. */
2622 assert(thread->ipc_overrides == 0);
2623 assert(thread->requested_policy.thrp_qos_ipc_override == THREAD_QOS_UNSPECIFIED);
2624
2625 /*
2626 * Clear out any lingering override objects.
2627 */
2628 struct thread_qos_override *override;
2629
2630 thread_mtx_lock(thread);
2631 override = thread->overrides;
2632 thread->overrides = NULL;
2633 thread->requested_policy.thrp_qos_override = THREAD_QOS_UNSPECIFIED;
2634 /* We don't need to re-evaluate thread policy here because the thread has already exited */
2635 thread_mtx_unlock(thread);
2636
2637 while (override) {
2638 struct thread_qos_override *override_next = override->override_next;
2639
2640 zfree(thread_qos_override_zone, override);
2641 override = override_next;
2642 }
2643 }
2644
2645 /*
2646 * Set up the primordial thread's QoS
2647 */
2648 void
2649 task_set_main_thread_qos(task_t task, thread_t thread) {
2650 struct task_pend_token pend_token = {};
2651
2652 assert(thread->task == task);
2653
2654 thread_mtx_lock(thread);
2655
2656 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2657 (IMPORTANCE_CODE(IMP_MAIN_THREAD_QOS, 0)) | DBG_FUNC_START,
2658 thread_tid(thread), threquested_0(thread), threquested_1(thread),
2659 thread->requested_policy.thrp_qos, 0);
2660
2661 int primordial_qos = task_compute_main_thread_qos(task);
2662
2663 proc_set_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS,
2664 primordial_qos, 0, &pend_token);
2665
2666 thread_mtx_unlock(thread);
2667
2668 thread_policy_update_complete_unlocked(thread, &pend_token);
2669
2670 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2671 (IMPORTANCE_CODE(IMP_MAIN_THREAD_QOS, 0)) | DBG_FUNC_END,
2672 thread_tid(thread), threquested_0(thread), threquested_1(thread),
2673 primordial_qos, 0);
2674 }
2675
2676 /*
2677 * KPI for pthread kext
2678 *
2679 * Return a good guess at what the initial manager QoS will be
2680 * Dispatch can override this in userspace if it so chooses
2681 */
2682 int
2683 task_get_default_manager_qos(task_t task)
2684 {
2685 int primordial_qos = task_compute_main_thread_qos(task);
2686
2687 if (primordial_qos == THREAD_QOS_LEGACY)
2688 primordial_qos = THREAD_QOS_USER_INITIATED;
2689
2690 return primordial_qos;
2691 }
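
/*
 * Illustrative sketch (not part of the build): the pthread kext asking for a
 * starting QoS for a task's workqueue manager thread. How the result maps to
 * a pthread priority is up to the caller.
 *
 *     int manager_qos = task_get_default_manager_qos(task);
 *
 *     // manager_qos is a THREAD_QOS_* value; LEGACY has already been bumped
 *     // to USER_INITIATED, so the manager never starts below that
 */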
2692
2693
2694 /*
2695 * Promote thread with the user level properties of 'promoter'
2696 * Mutexes may be held, but it's OK to take the throttle lock
2697 *
2698 * If 'new_promotion' is TRUE, this is a new promotion.
2699 * If FALSE, we are updating an existing promotion.
2700 */
2701 static void
2702 thread_user_promotion_promote(thread_t thread,
2703 thread_t promoter,
2704 struct promote_token* promote_token,
2705 boolean_t new_promotion)
2706 {
2707 struct task_pend_token pend_token = {};
2708
2709 uint32_t promoter_base_pri = 0, promoter_qos = THREAD_QOS_UNSPECIFIED;
2710
2711 spl_t s = splsched();
2712 thread_lock(promoter);
2713
2714 /*
2715 * We capture the 'promotion qos' here, which is the QoS as it stands
2716 * before task-level clamping.
2717 *
2718 * This means that if the process gets unclamped while a promotion
2719 * is in effect, the owning thread ends up with the correct QoS.
2720 *
2721 * This does NOT work correctly across processes, as the correct QoS
2722 * in one is not necessarily the correct QoS in another.
2723 * When we add support for multi-process ulock boosting, we need to
2724 * do something more complex.
2725 */
2726 promoter_qos = promoter->effective_policy.thep_qos_promote;
2727
2728 /* TODO: extract 'effective unclamped base pri' instead */
2729 promoter_base_pri = promoter->base_pri;
2730
2731 thread_unlock(promoter);
2732 splx(s);
2733
2734 /* clamp out realtime to max user pri */
2735 promoter_base_pri = MIN(promoter_base_pri, MAXPRI_USER);
2736
2737 /* add in the saved promotion token */
2738 assert(promote_token->pt_basepri <= MAXPRI_USER);
2739
2740 promoter_base_pri = MAX(promoter_base_pri, promote_token->pt_basepri);
2741 promoter_qos = MAX(promoter_qos, promote_token->pt_qos);
2742
2743 /* save the max for later */
2744 promote_token->pt_basepri = promoter_base_pri;
2745 promote_token->pt_qos = promoter_qos;
2746
2747 s = splsched();
2748 thread_lock(thread);
2749
2750 if (new_promotion) {
2751 if (thread->user_promotions == 0) {
2752 assert(thread->requested_policy.thrp_qos_promote == THREAD_QOS_UNSPECIFIED);
2753 assert(thread->user_promotion_basepri == 0);
2754 }
2755
2756 thread->user_promotions++;
2757 } else {
2758 assert(thread->user_promotions > 0);
2759 }
2760
2761 uint32_t thread_qos = thread->requested_policy.thrp_qos_promote;
2762 uint32_t thread_basepri = thread->user_promotion_basepri;
2763
2764 uint32_t new_qos = MAX(thread_qos, promoter_qos);
2765 uint32_t new_basepri = MAX(thread_basepri, promoter_base_pri);
2766
2767 /* TODO: Fast path the 'new is lower than effective' case to avoid full reevaluation */
2768 if (thread_qos != new_qos || thread_basepri != new_basepri) {
2769
2770 thread->user_promotion_basepri = new_basepri;
2771
2772 pend_token.tpt_force_recompute_pri = 1;
2773
2774 proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE,
2775 TASK_POLICY_QOS_PROMOTE, new_qos,
2776 0, &pend_token);
2777 }
2778
2779 thread_unlock(thread);
2780 splx(s);
2781
2782 thread_policy_update_complete_unlocked(thread, &pend_token);
2783 }
2784
2785 /* Add a user promotion to thread */
2786 void
2787 thread_user_promotion_add(thread_t thread,
2788 thread_t promoter,
2789 struct promote_token* promote_token)
2790 {
2791 thread_user_promotion_promote(thread, promoter, promote_token, TRUE);
2792 }
2793
2794 /* Update an existing user promotion on thread */
2795 void
2796 thread_user_promotion_update(thread_t thread,
2797 thread_t promoter,
2798 struct promote_token* promote_token)
2799 {
2800 thread_user_promotion_promote(thread, promoter, promote_token, FALSE);
2801 }
2802
2803 /*
2804 * Drop a user promotion on thread
2805 * Mutexes may be held, but it's OK to take the throttle lock
2806 */
2807 void
2808 thread_user_promotion_drop(thread_t thread)
2809 {
2810 struct task_pend_token pend_token = {};
2811
2812 spl_t s = splsched();
2813 thread_lock(thread);
2814
2815 assert(thread->user_promotions > 0);
2816
2817 if (--thread->user_promotions == 0) {
2818 thread->requested_policy.thrp_qos_promote = THREAD_QOS_UNSPECIFIED;
2819 thread->user_promotion_basepri = 0;
2820
2821 pend_token.tpt_force_recompute_pri = 1;
2822
2823 proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE,
2824 TASK_POLICY_QOS_PROMOTE, THREAD_QOS_UNSPECIFIED,
2825 0, &pend_token);
2826 }
2827
2828 thread_unlock(thread);
2829 splx(s);
2830
2831 thread_policy_update_complete_unlocked(thread, &pend_token);
2832 }
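
/*
 * Illustrative sketch (not part of the build): the add/update/drop lifecycle a
 * lock implementation might drive while a thread owns a contended resource.
 * 'owner_thread', 'waiter_thread' and 'higher_waiter' are hypothetical; the
 * promote_token lives in the lock object, and the field names follow the
 * pt_basepri/pt_qos usage above.
 *
 *     struct promote_token token = { .pt_basepri = 0,
 *                                    .pt_qos = THREAD_QOS_UNSPECIFIED };
 *
 *     // first waiter arrives: start promoting the owner
 *     thread_user_promotion_add(owner_thread, waiter_thread, &token);
 *
 *     // a more important waiter arrives later: refresh the same promotion
 *     thread_user_promotion_update(owner_thread, higher_waiter, &token);
 *
 *     // owner releases the resource: drop its promotion
 *     thread_user_promotion_drop(owner_thread);
 */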
2833
2834
2835 /*
2836 * Set the thread's QoS IPC override
2837 * Owned by the IPC subsystem
2838 *
2839 * May be called with spinlocks held, but not spinlocks
2840 * that may deadlock against the thread lock, the throttle lock, or the SFI lock.
2841 *
2842 * One 'add' must be balanced by one 'drop'.
2843 * Between 'add' and 'drop', the override QoS value may be updated with an 'update'.
2844 * Before the thread is deallocated, there must be 0 remaining overrides.
2845 */
2846 static void
2847 thread_ipc_override(thread_t thread,
2848 uint32_t qos_override,
2849 boolean_t is_new_override)
2850 {
2851 struct task_pend_token pend_token = {};
2852
2853 spl_t s = splsched();
2854 thread_lock(thread);
2855
2856 uint32_t old_override = thread->requested_policy.thrp_qos_ipc_override;
2857
2858 if (is_new_override) {
2859 if (thread->ipc_overrides++ == 0) {
2860 /* This add is the first override for this thread */
2861 assert(old_override == THREAD_QOS_UNSPECIFIED);
2862 } else {
2863 /* There are already other overrides in effect for this thread */
2864 assert(old_override > THREAD_QOS_UNSPECIFIED);
2865 }
2866 } else {
2867 /* There must be at least one override (the previous add call) in effect */
2868 assert(thread->ipc_overrides > 0);
2869 assert(old_override > THREAD_QOS_UNSPECIFIED);
2870 }
2871
2872 uint32_t new_override = MAX(old_override, qos_override);
2873
2874 proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE,
2875 TASK_POLICY_QOS_IPC_OVERRIDE,
2876 new_override, 0, &pend_token);
2877
2878 assert(pend_token.tpt_update_sockets == 0);
2879
2880 thread_unlock(thread);
2881 splx(s);
2882
2883 /*
2884 * this is only safe after rethrottle_thread supports
2885 * being called from spinlock context
2886 */
2887 thread_policy_update_complete_unlocked(thread, &pend_token);
2888 }
2889
2890 void
2891 thread_add_ipc_override(thread_t thread,
2892 uint32_t qos_override)
2893 {
2894 thread_ipc_override(thread, qos_override, TRUE);
2895 }
2896
2897 void
2898 thread_update_ipc_override(thread_t thread,
2899 uint32_t qos_override)
2900 {
2901 thread_ipc_override(thread, qos_override, FALSE);
2902 }
2903
2904 void
2905 thread_drop_ipc_override(thread_t thread)
2906 {
2907 struct task_pend_token pend_token = {};
2908
2909 spl_t s = splsched();
2910 thread_lock(thread);
2911
2912 assert(thread->ipc_overrides > 0);
2913
2914 if (--thread->ipc_overrides == 0) {
2915 /*
2916 * There are no more overrides for this thread, so we should
2917 * clear out the saturated override value
2918 */
2919
2920 proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE,
2921 TASK_POLICY_QOS_IPC_OVERRIDE, THREAD_QOS_UNSPECIFIED,
2922 0, &pend_token);
2923 }
2924
2925 thread_unlock(thread);
2926 splx(s);
2927
2928 /*
2929 * this is only safe after rethrottle_thread supports
2930 * being called from spinlock context
2931 */
2932 thread_policy_update_complete_unlocked(thread, &pend_token);
2933 }
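
/*
 * Illustrative sketch (not part of the build): the balanced add/update/drop
 * sequence expected of the IPC subsystem, per the rules above. 'dest_thread'
 * and the particular QoS values are hypothetical.
 *
 *     // first boosting message arrives for the destination thread
 *     thread_add_ipc_override(dest_thread, THREAD_QOS_USER_INITIATED);
 *
 *     // the required override rises while the boost is still outstanding
 *     thread_update_ipc_override(dest_thread, THREAD_QOS_USER_INTERACTIVE);
 *
 *     // the boost is no longer needed
 *     thread_drop_ipc_override(dest_thread);
 *
 * The override saturates at the maximum QoS seen between 'add' and 'drop',
 * and every 'add' must be matched by a 'drop' before the thread is
 * deallocated.
 */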
2934
2935 /* Get current IPC override, may be called from spinlock context */
2936 uint32_t
2937 thread_get_ipc_override(thread_t thread)
2938 {
2939 return proc_get_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_IPC_OVERRIDE, NULL);
2940 }
2941