osfmk/kern/sched_prim.c

   1 /*
   2  * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * @OSF_FREE_COPYRIGHT@
  30  */
  31 /*
  32  * Mach Operating System
  33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
  34  * All Rights Reserved.
  35  *
  36  * Permission to use, copy, modify and distribute this software and its
  37  * documentation is hereby granted, provided that both the copyright
  38  * notice and this permission notice appear in all copies of the
  39  * software, derivative works or modified versions, and any portions
  40  * thereof, and that both notices appear in supporting documentation.
  41  *
  42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  45  *
  46  * Carnegie Mellon requests users of this software to return to
  47  *
  48  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  49  *  School of Computer Science
  50  *  Carnegie Mellon University
  51  *  Pittsburgh PA 15213-3890
  52  *
  53  * any improvements or extensions that they make and grant Carnegie Mellon
  54  * the rights to redistribute these changes.
  55  */
  56 /*
  57  */
  58 /*
  59  *      File:   sched_prim.c
  60  *      Author: Avadis Tevanian, Jr.
  61  *      Date:   1986
  62  *
  63  *      Scheduling primitives
  64  *
  65  */
  66
  67 #include <debug.h>
  68
  69 #include <mach/mach_types.h>
  70 #include <mach/machine.h>
  71 #include <mach/policy.h>
  72 #include <mach/sync_policy.h>
  73 #include <mach/thread_act.h>
  74
  75 #include <machine/machine_routines.h>
  76 #include <machine/sched_param.h>
  77 #include <machine/machine_cpu.h>
  78 #include <machine/machlimits.h>
  79
  80 #ifdef CONFIG_MACH_APPROXIMATE_TIME
  81 #include <machine/commpage.h>
  82 #endif
  83
  84 #include <kern/kern_types.h>
  85 #include <kern/clock.h>
  86 #include <kern/counters.h>
  87 #include <kern/cpu_number.h>
  88 #include <kern/cpu_data.h>
  89 #include <kern/debug.h>
  90 #include <kern/macro_help.h>
  91 #include <kern/machine.h>
  92 #include <kern/misc_protos.h>
  93 #include <kern/processor.h>
  94 #include <kern/queue.h>
  95 #include <kern/sched.h>
  96 #include <kern/sched_prim.h>
  97 #include <kern/sfi.h>
  98 #include <kern/syscall_subr.h>
  99 #include <kern/task.h>
 100 #include <kern/thread.h>
 101 #include <kern/wait_queue.h>
 102 #include <kern/ledger.h>
 103 #include <kern/timer_queue.h>
 104
 105 #include <vm/pmap.h>
 106 #include <vm/vm_kern.h>
 107 #include <vm/vm_map.h>
 108
 109 #include <mach/sdt.h>
 110
 111 #include <sys/kdebug.h>
 112
 113 #include <kern/pms.h>
 114
 115 #if defined(CONFIG_TELEMETRY) && defined(CONFIG_SCHED_TIMESHARE_CORE)
 116 #include <kern/telemetry.h>
 117 #endif
 118
 119 struct rt_queue rt_runq;
 120 #define RT_RUNQ         ((processor_t)-1)
 121 decl_simple_lock_data(static,rt_lock);
 122
 123 #if defined(CONFIG_SCHED_FAIRSHARE_CORE)
 124 static struct fairshare_queue   fs_runq;
 125 #define FS_RUNQ         ((processor_t)-2)
 126 decl_simple_lock_data(static,fs_lock);
 127 #endif /* CONFIG_SCHED_FAIRSHARE_CORE */
 128
 129 #define         DEFAULT_PREEMPTION_RATE         100             /* (1/s) */
 130 int                     default_preemption_rate = DEFAULT_PREEMPTION_RATE;
 131
 132 #define         DEFAULT_BG_PREEMPTION_RATE      400             /* (1/s) */
 133 int                     default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
 134
 135 #define         MAX_UNSAFE_QUANTA                       800
 136 int                     max_unsafe_quanta = MAX_UNSAFE_QUANTA;
 137
 138 #define         MAX_POLL_QUANTA                         2
 139 int                     max_poll_quanta = MAX_POLL_QUANTA;
 140
 141 #define         SCHED_POLL_YIELD_SHIFT          4               /* 1/16 */
 142 int                     sched_poll_yield_shift = SCHED_POLL_YIELD_SHIFT;
 143
 144 uint64_t        max_poll_computation;
 145
 146 uint64_t        max_unsafe_computation;
 147 uint64_t        sched_safe_duration;
 148
 149 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
 150
 151 uint32_t        std_quantum;
 152 uint32_t        min_std_quantum;
 153 uint32_t        bg_quantum;
 154
 155 uint32_t        std_quantum_us;
 156 uint32_t        bg_quantum_us;
 157
 158 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
 159
 160 uint32_t        thread_depress_time;
 161 uint32_t        default_timeshare_computation;
 162 uint32_t        default_timeshare_constraint;
 163
 164 uint32_t        max_rt_quantum;
 165 uint32_t        min_rt_quantum;
 166
 167 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
 168
 169 unsigned        sched_tick;
 170 uint32_t        sched_tick_interval;
 171 #if defined(CONFIG_TELEMETRY)
 172 uint32_t        sched_telemetry_interval;
 173 #endif /* CONFIG_TELEMETRY */
 174
 175 uint32_t        sched_pri_shift = INT8_MAX;
 176 uint32_t        sched_background_pri_shift = INT8_MAX;
 177 uint32_t        sched_combined_fgbg_pri_shift = INT8_MAX;
 178 uint32_t        sched_fixed_shift;
 179 uint32_t        sched_use_combined_fgbg_decay = 0;
 180
 181 uint32_t        sched_decay_usage_age_factor = 1; /* accelerate 5/8^n usage aging */
 182
 183 /* Allow foreground to decay past default to resolve inversions */
 184 #define DEFAULT_DECAY_BAND_LIMIT ((BASEPRI_FOREGROUND - BASEPRI_DEFAULT) + 2)
 185 int             sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
 186
 187 /* Defaults for timer deadline profiling */
 188 #define TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT 2000000 /* Timers with deadlines <=
 189                                                         * 2ms */
 190 #define TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT 5000000 /* Timers with deadlines
 191                                                           <= 5ms */
 192
 193 uint64_t timer_deadline_tracking_bin_1;
 194 uint64_t timer_deadline_tracking_bin_2;
 195
 196 thread_t sched_maintenance_thread;
 197
 198 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
 199
 200 #if defined(CONFIG_SCHED_TRADITIONAL)
 201
 202 static boolean_t sched_traditional_use_pset_runqueue = FALSE;
 203
 204 __attribute__((always_inline))
 205 static inline run_queue_t runq_for_processor(processor_t processor)
 206 {
 207         if (sched_traditional_use_pset_runqueue)
 208                 return &processor->processor_set->pset_runq;
 209         else
 210                 return &processor->runq;
 211 }
 212
 213 __attribute__((always_inline))
 214 static inline void runq_consider_incr_bound_count(processor_t processor, thread_t thread)
 215 {
 216         if (thread->bound_processor == PROCESSOR_NULL)
 217                 return;
 218
 219         assert(thread->bound_processor == processor);
 220
 221         if (sched_traditional_use_pset_runqueue)
 222                 processor->processor_set->pset_runq_bound_count++;
 223
 224         processor->runq_bound_count++;
 225 }
 226
 227 __attribute__((always_inline))
 228 static inline void runq_consider_decr_bound_count(processor_t processor, thread_t thread)
 229 {
 230         if (thread->bound_processor == PROCESSOR_NULL)
 231                 return;
 232
 233         assert(thread->bound_processor == processor);
 234
 235         if (sched_traditional_use_pset_runqueue)
 236                 processor->processor_set->pset_runq_bound_count--;
 237
 238         processor->runq_bound_count--;
 239 }
 240
 241 #endif /* CONFIG_SCHED_TRADITIONAL */
 242
 243 uint64_t        sched_one_second_interval;
 244
 245 uint32_t        sched_run_count, sched_share_count, sched_background_count;
 246 uint32_t        sched_load_average, sched_mach_factor;
 247
 248 /* Forwards */
 249
 250 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
 251
 252 static void load_shift_init(void);
 253 static void preempt_pri_init(void);
 254
 255 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
 256
 257 static thread_t thread_select(
 258                                         thread_t                        thread,
 259                                         processor_t                     processor,
 260                                         ast_t                           reason);
 261
 262 #if CONFIG_SCHED_IDLE_IN_PLACE
 263 static thread_t thread_select_idle(
 264                                         thread_t                        thread,
 265                                         processor_t                     processor);
 266 #endif
 267
 268 thread_t        processor_idle(
 269                                         thread_t                        thread,
 270                                         processor_t                     processor);
 271
 272 ast_t
 273 csw_check_locked(       processor_t             processor,
 274                                         processor_set_t pset,
 275                                         ast_t                   check_reason);
 276
 277 #if defined(CONFIG_SCHED_TRADITIONAL)
 278
 279 static thread_t steal_thread(
 280                                         processor_set_t         pset);
 281
 282 static thread_t steal_thread_disabled(
 283                                         processor_set_t         pset) __attribute__((unused));
 284
 285
 286 static thread_t steal_processor_thread(
 287                                         processor_t                     processor);
 288
 289 static void             thread_update_scan(void);
 290
 291 static void processor_setrun(
 292                                  processor_t                    processor,
 293                                  thread_t                       thread,
 294                                  integer_t                      options);
 295
 296 static boolean_t
 297 processor_enqueue(
 298                                   processor_t           processor,
 299                                   thread_t              thread,
 300                                   integer_t             options);
 301
 302 static boolean_t
 303 processor_queue_remove(
 304                                            processor_t                  processor,
 305                                            thread_t             thread);
 306
 307 static boolean_t        processor_queue_empty(processor_t               processor);
 308
 309 static ast_t            processor_csw_check(processor_t processor);
 310
 311 static boolean_t        processor_queue_has_priority(processor_t                processor,
 312                                                                                         int                             priority,
 313                                                                                         boolean_t               gte);
 314
 315 static boolean_t        should_current_thread_rechoose_processor(processor_t                    processor);
 316
 317 static int     sched_traditional_processor_runq_count(processor_t   processor);
 318
 319 static boolean_t        sched_traditional_with_pset_runqueue_processor_queue_empty(processor_t          processor);
 320
 321 static uint64_t     sched_traditional_processor_runq_stats_count_sum(processor_t   processor);
 322
 323 static uint64_t         sched_traditional_with_pset_runqueue_processor_runq_stats_count_sum(processor_t   processor);
 324
 325 static int      sched_traditional_processor_bound_count(processor_t processor);
 326
 327 #endif
 328
 329
 330 #if defined(CONFIG_SCHED_TRADITIONAL)
 331
 332 static void
 333 sched_traditional_processor_init(processor_t processor);
 334
 335 static void
 336 sched_traditional_pset_init(processor_set_t pset);
 337
 338 static void
 339 sched_traditional_with_pset_runqueue_init(void);
 340
 341 #endif
 342
 343 static void
 344 sched_realtime_init(void);
 345
 346 static void
 347 sched_realtime_timebase_init(void);
 348
 349 static void
 350 sched_timer_deadline_tracking_init(void);
 351
 352 #if defined(CONFIG_SCHED_TRADITIONAL)
 353
 354 static sched_mode_t
 355 sched_traditional_initial_thread_sched_mode(task_t parent_task);
 356
 357 static thread_t
 358 sched_traditional_choose_thread(
 359                                 processor_t     processor,
 360                                 int             priority,
 361                        __unused ast_t           reason);
 362
 363 #endif
 364
 365 #if     DEBUG
 366 extern int debug_task;
 367 #define TLOG(a, fmt, args...) if(debug_task & a) kprintf(fmt, ## args)
 368 #else
 369 #define TLOG(a, fmt, args...) do {} while (0)
 370 #endif
 371
 372 __assert_only static
 373 boolean_t       thread_runnable(
 374                                 thread_t                thread);
 375
 376 /*
 377  *      State machine
 378  *
 379  * states are combinations of:
 380  *  R   running
 381  *  W   waiting (or on wait queue)
 382  *  N   non-interruptible
 383  *  O   swapped out
 384  *  I   being swapped in
 385  *
 386  * init action
 387  *      assert_wait thread_block    clear_wait          swapout swapin
 388  *
 389  * R    RW, RWN     R;   setrun     -                   -
 390  * RN   RWN         RN;  setrun     -                   -
 391  *
 392  * RW               W               R                   -
 393  * RWN              WN              RN                  -
 394  *
 395  * W                                R;   setrun         WO
 396  * WN                               RN;  setrun         -
 397  *
 398  * RO                               -                   -       R
 399  *
 400  */
 401
 402 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
 403 int8_t          sched_load_shifts[NRQS];
 404 int             sched_preempt_pri[NRQBM];
 405 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
 406
 407
 408 #if defined(CONFIG_SCHED_TRADITIONAL)
 409
 410 const struct sched_dispatch_table sched_traditional_dispatch = {
 411         .init                                           = sched_traditional_init,
 412         .timebase_init                                  = sched_traditional_timebase_init,
 413         .processor_init                                 = sched_traditional_processor_init,
 414         .pset_init                                      = sched_traditional_pset_init,
 415         .maintenance_continuation                       = sched_traditional_maintenance_continue,
 416         .choose_thread                                  = sched_traditional_choose_thread,
 417         .steal_thread                                   = steal_thread,
 418         .compute_priority                               = compute_priority,
 419         .choose_processor                               = choose_processor,
 420         .processor_enqueue                              = processor_enqueue,
 421         .processor_queue_shutdown                       = processor_queue_shutdown,
 422         .processor_queue_remove                         = processor_queue_remove,
 423         .processor_queue_empty                          = processor_queue_empty,
 424         .priority_is_urgent                             = priority_is_urgent,
 425         .processor_csw_check                            = processor_csw_check,
 426         .processor_queue_has_priority                   = processor_queue_has_priority,
 427         .initial_quantum_size                           = sched_traditional_initial_quantum_size,
 428         .initial_thread_sched_mode                      = sched_traditional_initial_thread_sched_mode,
 429         .can_update_priority                            = can_update_priority,
 430         .update_priority                                = update_priority,
 431         .lightweight_update_priority                    = lightweight_update_priority,
 432         .quantum_expire                                 = sched_traditional_quantum_expire,
 433         .should_current_thread_rechoose_processor       = should_current_thread_rechoose_processor,
 434         .processor_runq_count                           = sched_traditional_processor_runq_count,
 435         .processor_runq_stats_count_sum                 = sched_traditional_processor_runq_stats_count_sum,
 436         .fairshare_init                                 = sched_traditional_fairshare_init,
 437         .fairshare_runq_count                           = sched_traditional_fairshare_runq_count,
 438         .fairshare_runq_stats_count_sum                 = sched_traditional_fairshare_runq_stats_count_sum,
 439         .fairshare_enqueue                              = sched_traditional_fairshare_enqueue,
 440         .fairshare_dequeue                              = sched_traditional_fairshare_dequeue,
 441         .fairshare_queue_remove                         = sched_traditional_fairshare_queue_remove,
 442         .processor_bound_count                          = sched_traditional_processor_bound_count,
 443         .thread_update_scan                             = thread_update_scan,
 444         .direct_dispatch_to_idle_processors             = TRUE,
 445 };
 446
 447 const struct sched_dispatch_table sched_traditional_with_pset_runqueue_dispatch = {
 448         .init                                           = sched_traditional_with_pset_runqueue_init,
 449         .timebase_init                                  = sched_traditional_timebase_init,
 450         .processor_init                                 = sched_traditional_processor_init,
 451         .pset_init                                      = sched_traditional_pset_init,
 452         .maintenance_continuation                       = sched_traditional_maintenance_continue,
 453         .choose_thread                                  = sched_traditional_choose_thread,
 454         .steal_thread                                   = steal_thread,
 455         .compute_priority                               = compute_priority,
 456         .choose_processor                               = choose_processor,
 457         .processor_enqueue                              = processor_enqueue,
 458         .processor_queue_shutdown                       = processor_queue_shutdown,
 459         .processor_queue_remove                         = processor_queue_remove,
 460         .processor_queue_empty                          = sched_traditional_with_pset_runqueue_processor_queue_empty,
 461         .priority_is_urgent                             = priority_is_urgent,
 462         .processor_csw_check                            = processor_csw_check,
 463         .processor_queue_has_priority                   = processor_queue_has_priority,
 464         .initial_quantum_size                           = sched_traditional_initial_quantum_size,
 465         .initial_thread_sched_mode                      = sched_traditional_initial_thread_sched_mode,
 466         .can_update_priority                            = can_update_priority,
 467         .update_priority                                = update_priority,
 468         .lightweight_update_priority                    = lightweight_update_priority,
 469         .quantum_expire                                 = sched_traditional_quantum_expire,
 470         .should_current_thread_rechoose_processor       = should_current_thread_rechoose_processor,
 471         .processor_runq_count                           = sched_traditional_processor_runq_count,
 472         .processor_runq_stats_count_sum                 = sched_traditional_with_pset_runqueue_processor_runq_stats_count_sum,
 473         .fairshare_init                                 = sched_traditional_fairshare_init,
 474         .fairshare_runq_count                           = sched_traditional_fairshare_runq_count,
 475         .fairshare_runq_stats_count_sum                 = sched_traditional_fairshare_runq_stats_count_sum,
 476         .fairshare_enqueue                              = sched_traditional_fairshare_enqueue,
 477         .fairshare_dequeue                              = sched_traditional_fairshare_dequeue,
 478         .fairshare_queue_remove                         = sched_traditional_fairshare_queue_remove,
 479         .processor_bound_count                          = sched_traditional_processor_bound_count,
 480         .thread_update_scan                             = thread_update_scan,
 481         .direct_dispatch_to_idle_processors             = FALSE,
 482 };
 483
 484 #endif
 485
 486 const struct sched_dispatch_table *sched_current_dispatch = NULL;
 487
 488 /*
 489  * Statically allocate a buffer to hold the longest possible
 490  * scheduler description string, as currently implemented.
 491  * bsd/kern/kern_sysctl.c has a corresponding definition in bsd/
 492  * to export to userspace via sysctl(3). If either version
 493  * changes, update the other.
 494  *
 495  * Note that in addition to being an upper bound on the strings
 496  * in the kernel, it's also an exact parameter to PE_get_default(),
 497  * which interrogates the device tree on some platforms. That
 498  * API requires the caller know the exact size of the device tree
 499  * property, so we need both a legacy size (32) and the current size
 500  * (48) to deal with old and new device trees. The device tree property
 501  * is similarly padded to a fixed size so that the same kernel image
 502  * can run on multiple devices with different schedulers configured
 503  * in the device tree.
 504  */
 505 #define SCHED_STRING_MAX_LENGTH (48)
 506
 507 char sched_string[SCHED_STRING_MAX_LENGTH];
 508 static enum sched_enum _sched_enum __attribute__((used)) = sched_enum_unknown;
 509
 510 /* Global flag which indicates whether Background Stepper Context is enabled */
 511 static int cpu_throttle_enabled = 1;
 512
 513 void
 514 sched_init(void)
 515 {
 516         char sched_arg[SCHED_STRING_MAX_LENGTH] = { '\0' };
 517
 518         /* Check for runtime selection of the scheduler algorithm */
 519         if (!PE_parse_boot_argn("sched", sched_arg, sizeof (sched_arg))) {
 520                 /* If no boot-args override, look in device tree */
 521                 if (!PE_get_default("kern.sched", sched_arg,
 522                                                         SCHED_STRING_MAX_LENGTH)) {
 523                         sched_arg[0] = '\0';
 524                 }
 525         }
 526
 527
 528         if (!PE_parse_boot_argn("sched_pri_decay_limit", &sched_pri_decay_band_limit, sizeof(sched_pri_decay_band_limit))) {
 529                 /* No boot-args, check in device tree */
 530                 if (!PE_get_default("kern.sched_pri_decay_limit",
 531                                                         &sched_pri_decay_band_limit,
 532                                                         sizeof(sched_pri_decay_band_limit))) {
 533                         /* Allow decay all the way to normal limits */
 534                         sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
 535                 }
 536         }
 537
 538         kprintf("Setting scheduler priority decay band limit %d\n", sched_pri_decay_band_limit);
 539
 540         if (strlen(sched_arg) > 0) {
 541                 if (0) {
 542                         /* Allow pattern below */
 543 #if defined(CONFIG_SCHED_TRADITIONAL)
 544                 } else if (0 == strcmp(sched_arg, kSchedTraditionalString)) {
 545                         sched_current_dispatch = &sched_traditional_dispatch;
 546                         _sched_enum = sched_enum_traditional;
 547                         strlcpy(sched_string, kSchedTraditionalString, sizeof(sched_string));
 548                 } else if (0 == strcmp(sched_arg, kSchedTraditionalWithPsetRunqueueString)) {
 549                         sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
 550                         _sched_enum = sched_enum_traditional_with_pset_runqueue;
 551                         strlcpy(sched_string, kSchedTraditionalWithPsetRunqueueString, sizeof(sched_string));
 552 #endif
 553 #if defined(CONFIG_SCHED_PROTO)
 554                 } else if (0 == strcmp(sched_arg, kSchedProtoString)) {
 555                         sched_current_dispatch = &sched_proto_dispatch;
 556                         _sched_enum = sched_enum_proto;
 557                         strlcpy(sched_string, kSchedProtoString, sizeof(sched_string));
 558 #endif
 559 #if defined(CONFIG_SCHED_GRRR)
 560                 } else if (0 == strcmp(sched_arg, kSchedGRRRString)) {
 561                         sched_current_dispatch = &sched_grrr_dispatch;
 562                         _sched_enum = sched_enum_grrr;
 563                         strlcpy(sched_string, kSchedGRRRString, sizeof(sched_string));
 564 #endif
 565 #if defined(CONFIG_SCHED_MULTIQ)
 566                 } else if (0 == strcmp(sched_arg, kSchedMultiQString)) {
 567                         sched_current_dispatch = &sched_multiq_dispatch;
 568                         _sched_enum = sched_enum_multiq;
 569                         strlcpy(sched_string, kSchedMultiQString, sizeof(sched_string));
 570                 } else if (0 == strcmp(sched_arg, kSchedDualQString)) {
 571                         sched_current_dispatch = &sched_dualq_dispatch;
 572                         _sched_enum = sched_enum_dualq;
 573                         strlcpy(sched_string, kSchedDualQString, sizeof(sched_string));
 574 #endif
 575                 } else {
 576 #if defined(CONFIG_SCHED_TRADITIONAL)
 577                         printf("Unrecognized scheduler algorithm: %s\n", sched_arg);
 578                         printf("Scheduler: Using instead: %s\n", kSchedTraditionalWithPsetRunqueueString);
 579
 580                         sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
 581                         _sched_enum = sched_enum_traditional_with_pset_runqueue;
 582                         strlcpy(sched_string, kSchedTraditionalWithPsetRunqueueString, sizeof(sched_string));
 583 #else
 584                         panic("Unrecognized scheduler algorithm: %s", sched_arg);
 585 #endif
 586                 }
 587                 kprintf("Scheduler: Runtime selection of %s\n", sched_string);
 588         } else {
 589 #if   defined(CONFIG_SCHED_MULTIQ)
 590                 sched_current_dispatch = &sched_multiq_dispatch;
 591                 _sched_enum = sched_enum_multiq;
 592                 strlcpy(sched_string, kSchedMultiQString, sizeof(sched_string));
 593 #elif defined(CONFIG_SCHED_TRADITIONAL)
 594                 sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
 595                 _sched_enum = sched_enum_traditional_with_pset_runqueue;
 596                 strlcpy(sched_string, kSchedTraditionalWithPsetRunqueueString, sizeof(sched_string));
 597 #elif defined(CONFIG_SCHED_PROTO)
 598                 sched_current_dispatch = &sched_proto_dispatch;
 599                 _sched_enum = sched_enum_proto;
 600                 strlcpy(sched_string, kSchedProtoString, sizeof(sched_string));
 601 #elif defined(CONFIG_SCHED_GRRR)
 602                 sched_current_dispatch = &sched_grrr_dispatch;
 603                 _sched_enum = sched_enum_grrr;
 604                 strlcpy(sched_string, kSchedGRRRString, sizeof(sched_string));
 605 #else
 606 #error No default scheduler implementation
 607 #endif
 608                 kprintf("Scheduler: Default of %s\n", sched_string);
 609         }
 610
 611         SCHED(init)();
 612         SCHED(fairshare_init)();
 613         sched_realtime_init();
 614         ast_init();
 615         sched_timer_deadline_tracking_init();
 616
 617         SCHED(pset_init)(&pset0);
 618         SCHED(processor_init)(master_processor);
 619 }
 620
 621 void
 622 sched_timebase_init(void)
 623 {
 624         uint64_t        abstime;
 625
 626         clock_interval_to_absolutetime_interval(1, NSEC_PER_SEC, &abstime);
 627         sched_one_second_interval = abstime;
 628
 629         SCHED(timebase_init)();
 630         sched_realtime_timebase_init();
 631 }
 632
 633 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
 634
 635 void
 636 sched_traditional_init(void)
 637 {
 638         /*
 639          * Calculate the timeslicing quantum
 640          * in us.
 641          */
 642         if (default_preemption_rate < 1)
 643                 default_preemption_rate = DEFAULT_PREEMPTION_RATE;
 644         std_quantum_us = (1000 * 1000) / default_preemption_rate;
 645
 646         printf("standard timeslicing quantum is %d us\n", std_quantum_us);
 647
 648         if (default_bg_preemption_rate < 1)
 649                 default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
 650         bg_quantum_us = (1000 * 1000) / default_bg_preemption_rate;
 651
 652         printf("standard background quantum is %d us\n", bg_quantum_us);
 653
 654         load_shift_init();
 655         preempt_pri_init();
 656         sched_tick = 0;
 657 }
 658
 659 void
 660 sched_traditional_timebase_init(void)
 661 {
 662         uint64_t        abstime;
 663         uint32_t        shift;
 664
 665         /* standard timeslicing quantum */
 666         clock_interval_to_absolutetime_interval(
 667                                                         std_quantum_us, NSEC_PER_USEC, &abstime);
 668         assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
 669         std_quantum = (uint32_t)abstime;
 670
 671         /* smallest remaining quantum (250 us) */
 672         clock_interval_to_absolutetime_interval(250, NSEC_PER_USEC, &abstime);
 673         assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
 674         min_std_quantum = (uint32_t)abstime;
 675
 676         /* quantum for background tasks */
 677         clock_interval_to_absolutetime_interval(
 678                                                         bg_quantum_us, NSEC_PER_USEC, &abstime);
 679         assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
 680         bg_quantum = (uint32_t)abstime;
 681
 682         /* scheduler tick interval */
 683         clock_interval_to_absolutetime_interval(USEC_PER_SEC >> SCHED_TICK_SHIFT,
 684                                                                                                         NSEC_PER_USEC, &abstime);
 685         assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
 686         sched_tick_interval = (uint32_t)abstime;
 687
 688         /*
 689          * Compute conversion factor from usage to
 690          * timesharing priorities with 5/8 ** n aging.
 691          */
 692         abstime = (abstime * 5) / 3;
 693         for (shift = 0; abstime > BASEPRI_DEFAULT; ++shift)
 694                 abstime >>= 1;
 695         sched_fixed_shift = shift;
 696
 697         max_unsafe_computation = ((uint64_t)max_unsafe_quanta) * std_quantum;
 698         sched_safe_duration = 2 * ((uint64_t)max_unsafe_quanta) * std_quantum;
 699
 700         max_poll_computation = ((uint64_t)max_poll_quanta) * std_quantum;
 701         thread_depress_time = 1 * std_quantum;
 702         default_timeshare_computation = std_quantum / 2;
 703         default_timeshare_constraint = std_quantum;
 704
 705 #if defined(CONFIG_TELEMETRY)
 706         /* interval for high frequency telemetry */
 707         clock_interval_to_absolutetime_interval(10, NSEC_PER_MSEC, &abstime);
 708         assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
 709         sched_telemetry_interval = (uint32_t)abstime;
 710 #endif
 711 }
 712
 713 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
 714
 715
 716 #if defined(CONFIG_SCHED_TRADITIONAL)
 717
 718 static void
 719 sched_traditional_processor_init(processor_t processor)
 720 {
 721         if (!sched_traditional_use_pset_runqueue) {
 722                 run_queue_init(&processor->runq);
 723         }
 724         processor->runq_bound_count = 0;
 725 }
 726
 727 static void
 728 sched_traditional_pset_init(processor_set_t pset)
 729 {
 730         if (sched_traditional_use_pset_runqueue) {
 731                 run_queue_init(&pset->pset_runq);
 732         }
 733         pset->pset_runq_bound_count = 0;
 734 }
 735
 736 static void
 737 sched_traditional_with_pset_runqueue_init(void)
 738 {
 739         sched_traditional_init();
 740         sched_traditional_use_pset_runqueue = TRUE;
 741 }
 742
 743 #endif /* CONFIG_SCHED_TRADITIONAL */
 744
 745 #if defined(CONFIG_SCHED_FAIRSHARE_CORE)
 746 void
 747 sched_traditional_fairshare_init(void)
 748 {
 749         simple_lock_init(&fs_lock, 0);
 750
 751         fs_runq.count = 0;
 752         queue_init(&fs_runq.queue);
 753 }
 754 #endif /* CONFIG_SCHED_FAIRSHARE_CORE */
 755
 756 static void
 757 sched_realtime_init(void)
 758 {
 759         simple_lock_init(&rt_lock, 0);
 760
 761         rt_runq.count = 0;
 762         queue_init(&rt_runq.queue);
 763 }
 764
 765 static void
 766 sched_realtime_timebase_init(void)
 767 {
 768         uint64_t abstime;
 769
 770         /* smallest rt computaton (50 us) */
 771         clock_interval_to_absolutetime_interval(50, NSEC_PER_USEC, &abstime);
 772         assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
 773         min_rt_quantum = (uint32_t)abstime;
 774
 775         /* maximum rt computation (50 ms) */
 776         clock_interval_to_absolutetime_interval(
 777                 50, 1000*NSEC_PER_USEC, &abstime);
 778         assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
 779         max_rt_quantum = (uint32_t)abstime;
 780
 781 }
 782
 783 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
 784
 785 /*
 786  * Set up values for timeshare
 787  * loading factors.
 788  */
 789 static void
 790 load_shift_init(void)
 791 {
 792         int8_t          k, *p = sched_load_shifts;
 793         uint32_t        i, j;
 794
 795         uint32_t        sched_decay_penalty = 1;
 796
 797         if (PE_parse_boot_argn("sched_decay_penalty", &sched_decay_penalty, sizeof (sched_decay_penalty))) {
 798                 kprintf("Overriding scheduler decay penalty %u\n", sched_decay_penalty);
 799         }
 800
 801         if (PE_parse_boot_argn("sched_decay_usage_age_factor", &sched_decay_usage_age_factor, sizeof (sched_decay_usage_age_factor))) {
 802                 kprintf("Overriding scheduler decay usage age factor %u\n", sched_decay_usage_age_factor);
 803         }
 804
 805         if (PE_parse_boot_argn("sched_use_combined_fgbg_decay", &sched_use_combined_fgbg_decay, sizeof (sched_use_combined_fgbg_decay))) {
 806                 kprintf("Overriding schedule fg/bg decay calculation: %u\n", sched_use_combined_fgbg_decay);
 807         }
 808
 809         if (sched_decay_penalty == 0) {
 810                 /*
 811                  * There is no penalty for timeshare threads for using too much
 812                  * CPU, so set all load shifts to INT8_MIN. Even under high load,
 813                  * sched_pri_shift will be >INT8_MAX, and there will be no
 814                  * penalty applied to threads (nor will sched_usage be updated per
 815                  * thread).
 816                  */
 817                 for (i = 0; i < NRQS; i++) {
 818                         sched_load_shifts[i] = INT8_MIN;
 819                 }
 820
 821                 return;
 822         }
 823
 824         *p++ = INT8_MIN; *p++ = 0;
 825
 826         /*
 827          * For a given system load "i", the per-thread priority
 828          * penalty per quantum of CPU usage is ~2^k priority
 829          * levels. "sched_decay_penalty" can cause more
 830          * array entries to be filled with smaller "k" values
 831          */
 832         for (i = 2, j = 1 << sched_decay_penalty, k = 1; i < NRQS; ++k) {
 833                 for (j <<= 1; (i < j) && (i < NRQS); ++i)
 834                         *p++ = k;
 835         }
 836 }
 837
 838 static void
 839 preempt_pri_init(void)
 840 {
 841         int             i, *p = sched_preempt_pri;
 842
 843         for (i = BASEPRI_FOREGROUND; i < MINPRI_KERNEL; ++i)
 844                 setbit(i, p);
 845
 846         for (i = BASEPRI_PREEMPT; i <= MAXPRI; ++i)
 847                 setbit(i, p);
 848 }
 849
 850 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
 851
 852 /*
 853  *      Thread wait timer expiration.
 854  */
 855 void
 856 thread_timer_expire(
 857         void                    *p0,
 858         __unused void   *p1)
 859 {
 860         thread_t                thread = p0;
 861         spl_t                   s;
 862
 863         s = splsched();
 864         thread_lock(thread);
 865         if (--thread->wait_timer_active == 0) {
 866                 if (thread->wait_timer_is_set) {
 867                         thread->wait_timer_is_set = FALSE;
 868                         clear_wait_internal(thread, THREAD_TIMED_OUT);
 869                 }
 870         }
 871         thread_unlock(thread);
 872         splx(s);
 873 }
 874
 875 /*
 876  *      thread_unblock:
 877  *
 878  *      Unblock thread on wake up.
 879  *
 880  *      Returns TRUE if the thread is still running.
 881  *
 882  *      Thread must be locked.
 883  */
 884 boolean_t
 885 thread_unblock(
 886         thread_t                thread,
 887         wait_result_t   wresult)
 888 {
 889         boolean_t               result = FALSE;
 890         thread_t                cthread = current_thread();
 891         uint32_t                new_run_count;
 892
 893         /*
 894          *      Set wait_result.
 895          */
 896         thread->wait_result = wresult;
 897
 898         /*
 899          *      Cancel pending wait timer.
 900          */
 901         if (thread->wait_timer_is_set) {
 902                 if (timer_call_cancel(&thread->wait_timer))
 903                         thread->wait_timer_active--;
 904                 thread->wait_timer_is_set = FALSE;
 905         }
 906
 907         /*
 908          *      Update scheduling state: not waiting,
 909          *      set running.
 910          */
 911         thread->state &= ~(TH_WAIT|TH_UNINT);
 912
 913         if (!(thread->state & TH_RUN)) {
 914                 thread->state |= TH_RUN;
 915
 916                 (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
 917
 918                 /*
 919                  *      Update run counts.
 920                  */
 921                 new_run_count = sched_run_incr(thread);
 922                 if (thread->sched_mode == TH_MODE_TIMESHARE) {
 923                         sched_share_incr(thread);
 924
 925                         if (thread->sched_flags & TH_SFLAG_THROTTLED)
 926                                 sched_background_incr(thread);
 927                 }
 928         }
 929         else {
 930                 /*
 931                  *      Signal if idling on another processor.
 932                  */
 933 #if CONFIG_SCHED_IDLE_IN_PLACE
 934                 if (thread->state & TH_IDLE) {
 935                         processor_t             processor = thread->last_processor;
 936
 937                         if (processor != current_processor())
 938                                 machine_signal_idle(processor);
 939                 }
 940 #else
 941                 assert((thread->state & TH_IDLE) == 0);
 942 #endif
 943
 944                 new_run_count = sched_run_count; /* updated in thread_select_idle() */
 945                 result = TRUE;
 946         }
 947
 948         /*
 949          * Calculate deadline for real-time threads.
 950          */
 951         if (thread->sched_mode == TH_MODE_REALTIME) {
 952                 uint64_t                ctime;
 953
 954                 ctime = mach_absolute_time();
 955                 thread->realtime.deadline = thread->realtime.constraint + ctime;
 956         }
 957
 958         /*
 959          * Clear old quantum, fail-safe computation, etc.
 960          */
 961         thread->quantum_remaining = 0;
 962         thread->computation_metered = 0;
 963         thread->reason = AST_NONE;
 964
 965         /* Obtain power-relevant interrupt and "platform-idle exit" statistics.
 966          * We also account for "double hop" thread signaling via
 967          * the thread callout infrastructure.
 968          * DRK: consider removing the callout wakeup counters in the future
 969          * they're present for verification at the moment.
 970          */
 971         boolean_t aticontext, pidle;
 972         ml_get_power_state(&aticontext, &pidle);
 973
 974         if (__improbable(aticontext && !(thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT))) {
 975                 ledger_credit(thread->t_ledger, task_ledgers.interrupt_wakeups, 1);
 976                 DTRACE_SCHED2(iwakeup, struct thread *, thread, struct proc *, thread->task->bsd_info);
 977
 978                 uint64_t ttd = PROCESSOR_DATA(current_processor(), timer_call_ttd);
 979
 980                 if (ttd) {
 981                         if (ttd <= timer_deadline_tracking_bin_1)
 982                                 thread->thread_timer_wakeups_bin_1++;
 983                         else
 984                                 if (ttd <= timer_deadline_tracking_bin_2)
 985                                         thread->thread_timer_wakeups_bin_2++;
 986                 }
 987
 988                 if (pidle) {
 989                         ledger_credit(thread->t_ledger, task_ledgers.platform_idle_wakeups, 1);
 990                 }
 991
 992         } else if (thread_get_tag_internal(cthread) & THREAD_TAG_CALLOUT) {
 993                 if (cthread->callout_woken_from_icontext) {
 994                         ledger_credit(thread->t_ledger, task_ledgers.interrupt_wakeups, 1);
 995                         thread->thread_callout_interrupt_wakeups++;
 996                         if (cthread->callout_woken_from_platform_idle) {
 997                                 ledger_credit(thread->t_ledger, task_ledgers.platform_idle_wakeups, 1);
 998                                 thread->thread_callout_platform_idle_wakeups++;
 999                         }
1000
1001                         cthread->callout_woke_thread = TRUE;
1002                 }
1003         }
1004
1005         if (thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT) {
1006                 thread->callout_woken_from_icontext = aticontext;
1007                 thread->callout_woken_from_platform_idle = pidle;
1008                 thread->callout_woke_thread = FALSE;
1009         }
1010
1011         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1012                 MACHDBG_CODE(DBG_MACH_SCHED,MACH_MAKE_RUNNABLE) | DBG_FUNC_NONE,
1013                 (uintptr_t)thread_tid(thread), thread->sched_pri, thread->wait_result, new_run_count, 0);
1014
1015         DTRACE_SCHED2(wakeup, struct thread *, thread, struct proc *, thread->task->bsd_info);
1016
1017         return (result);
1018 }
1019
1020 /*
1021  *      Routine:        thread_go
1022  *      Purpose:
1023  *              Unblock and dispatch thread.
1024  *      Conditions:
1025  *              thread lock held, IPC locks may be held.
1026  *              thread must have been pulled from wait queue under same lock hold.
1027  *  Returns:
1028  *              KERN_SUCCESS - Thread was set running
1029  *              KERN_NOT_WAITING - Thread was not waiting
1030  */
1031 kern_return_t
1032 thread_go(
1033         thread_t                thread,
1034         wait_result_t   wresult)
1035 {
1036         assert(thread->at_safe_point == FALSE);
1037         assert(thread->wait_event == NO_EVENT64);
1038         assert(thread->wait_queue == WAIT_QUEUE_NULL);
1039
1040         if ((thread->state & (TH_WAIT|TH_TERMINATE)) == TH_WAIT) {
1041                 if (!thread_unblock(thread, wresult))
1042                         thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
1043
1044                 return (KERN_SUCCESS);
1045         }
1046
1047         return (KERN_NOT_WAITING);
1048 }
1049
1050 /*
1051  *      Routine:        thread_mark_wait_locked
1052  *      Purpose:
1053  *              Mark a thread as waiting.  If, given the circumstances,
1054  *              it doesn't want to wait (i.e. already aborted), then
1055  *              indicate that in the return value.
1056  *      Conditions:
1057  *              at splsched() and thread is locked.
1058  */
1059 __private_extern__
1060 wait_result_t
1061 thread_mark_wait_locked(
1062         thread_t                        thread,
1063         wait_interrupt_t        interruptible)
1064 {
1065         boolean_t               at_safe_point;
1066
1067         assert(thread == current_thread());
1068
1069         /*
1070          *      The thread may have certain types of interrupts/aborts masked
1071          *      off.  Even if the wait location says these types of interrupts
1072          *      are OK, we have to honor mask settings (outer-scoped code may
1073          *      not be able to handle aborts at the moment).
1074          */
1075         if (interruptible > (thread->options & TH_OPT_INTMASK))
1076                 interruptible = thread->options & TH_OPT_INTMASK;
1077
1078         at_safe_point = (interruptible == THREAD_ABORTSAFE);
1079
1080         if (    interruptible == THREAD_UNINT                   ||
1081                         !(thread->sched_flags & TH_SFLAG_ABORT) ||
1082                         (!at_safe_point &&
1083                                 (thread->sched_flags & TH_SFLAG_ABORTSAFELY))) {
1084
1085                 if ( !(thread->state & TH_TERMINATE))
1086                         DTRACE_SCHED(sleep);
1087
1088                 thread->state |= (interruptible) ? TH_WAIT : (TH_WAIT | TH_UNINT);
1089                 thread->at_safe_point = at_safe_point;
1090                 return (thread->wait_result = THREAD_WAITING);
1091         }
1092         else
1093         if (thread->sched_flags & TH_SFLAG_ABORTSAFELY)
1094                 thread->sched_flags &= ~TH_SFLAG_ABORTED_MASK;
1095
1096         return (thread->wait_result = THREAD_INTERRUPTED);
1097 }
1098
1099 /*
1100  *      Routine:        thread_interrupt_level
1101  *      Purpose:
1102  *              Set the maximum interruptible state for the
1103  *              current thread.  The effective value of any
1104  *              interruptible flag passed into assert_wait
1105  *              will never exceed this.
1106  *
1107  *              Useful for code that must not be interrupted,
1108  *              but which calls code that doesn't know that.
1109  *      Returns:
1110  *              The old interrupt level for the thread.
1111  */
1112 __private_extern__
1113 wait_interrupt_t
1114 thread_interrupt_level(
1115         wait_interrupt_t new_level)
1116 {
1117         thread_t thread = current_thread();
1118         wait_interrupt_t result = thread->options & TH_OPT_INTMASK;
1119
1120         thread->options = (thread->options & ~TH_OPT_INTMASK) | (new_level & TH_OPT_INTMASK);
1121
1122         return result;
1123 }
1124
1125 /*
1126  * Check to see if an assert wait is possible, without actually doing one.
1127  * This is used by debug code in locks and elsewhere to verify that it is
1128  * always OK to block when trying to take a blocking lock (since waiting
1129  * for the actual assert_wait to catch the case may make it hard to detect
1130  * this case.
1131  */
1132 boolean_t
1133 assert_wait_possible(void)
1134 {
1135
1136         thread_t thread;
1137
1138 #if     DEBUG
1139         if(debug_mode) return TRUE;             /* Always succeed in debug mode */
1140 #endif
1141
1142         thread = current_thread();
1143
1144         return (thread == NULL || wait_queue_assert_possible(thread));
1145 }
1146
1147 /*
1148  *      assert_wait:
1149  *
1150  *      Assert that the current thread is about to go to
1151  *      sleep until the specified event occurs.
1152  */
1153 wait_result_t
1154 assert_wait(
1155         event_t                         event,
1156         wait_interrupt_t        interruptible)
1157 {
1158         register wait_queue_t   wq;
1159         register int            index;
1160
1161         if(event == NO_EVENT)
1162                 panic("assert_wait() called with NO_EVENT");
1163
1164         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1165                 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
1166                 VM_KERNEL_UNSLIDE(event), 0, 0, 0, 0);
1167
1168         index = wait_hash(event);
1169         wq = &wait_queues[index];
1170         return wait_queue_assert_wait(wq, event, interruptible, 0);
1171 }
1172
1173 wait_result_t
1174 assert_wait_timeout(
1175         event_t                         event,
1176         wait_interrupt_t        interruptible,
1177         uint32_t                        interval,
1178         uint32_t                        scale_factor)
1179 {
1180         thread_t                        thread = current_thread();
1181         wait_result_t           wresult;
1182         wait_queue_t            wqueue;
1183         uint64_t                        deadline;
1184         spl_t                           s;
1185
1186         if(event == NO_EVENT)
1187                 panic("assert_wait_timeout() called with NO_EVENT");
1188
1189         wqueue = &wait_queues[wait_hash(event)];
1190
1191         s = splsched();
1192         wait_queue_lock(wqueue);
1193         thread_lock(thread);
1194
1195         clock_interval_to_deadline(interval, scale_factor, &deadline);
1196
1197         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1198                 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
1199                 VM_KERNEL_UNSLIDE(event), interruptible, deadline, 0, 0);
1200
1201         wresult = wait_queue_assert_wait64_locked(wqueue, CAST_DOWN(event64_t, event),
1202                                                   interruptible,
1203                                                   TIMEOUT_URGENCY_SYS_NORMAL,
1204                                                   deadline, 0,
1205                                                   thread);
1206
1207         thread_unlock(thread);
1208         wait_queue_unlock(wqueue);
1209         splx(s);
1210
1211         return (wresult);
1212 }
1213
1214 wait_result_t
1215 assert_wait_timeout_with_leeway(
1216         event_t                         event,
1217         wait_interrupt_t        interruptible,
1218         wait_timeout_urgency_t  urgency,
1219         uint32_t                        interval,
1220         uint32_t                        leeway,
1221         uint32_t                        scale_factor)
1222 {
1223         thread_t                        thread = current_thread();
1224         wait_result_t           wresult;
1225         wait_queue_t            wqueue;
1226         uint64_t                        deadline;
1227         uint64_t                        abstime;
1228         uint64_t                        slop;
1229         uint64_t                        now;
1230         spl_t                           s;
1231
1232         now = mach_absolute_time();
1233         clock_interval_to_absolutetime_interval(interval, scale_factor, &abstime);
1234         deadline = now + abstime;
1235
1236         clock_interval_to_absolutetime_interval(leeway, scale_factor, &slop);
1237
1238         if(event == NO_EVENT)
1239                 panic("assert_wait_timeout_with_leeway() called with NO_EVENT");
1240
1241         wqueue = &wait_queues[wait_hash(event)];
1242
1243         s = splsched();
1244         wait_queue_lock(wqueue);
1245         thread_lock(thread);
1246
1247         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1248                 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
1249                 VM_KERNEL_UNSLIDE(event), interruptible, deadline, 0, 0);
1250
1251         wresult = wait_queue_assert_wait64_locked(wqueue, CAST_DOWN(event64_t, event),
1252                                                   interruptible,
1253                                                   urgency, deadline, slop,
1254                                                   thread);
1255
1256         thread_unlock(thread);
1257         wait_queue_unlock(wqueue);
1258         splx(s);
1259
1260         return (wresult);
1261 }
1262
1263 wait_result_t
1264 assert_wait_deadline(
1265         event_t                         event,
1266         wait_interrupt_t        interruptible,
1267         uint64_t                        deadline)
1268 {
1269         thread_t                        thread = current_thread();
1270         wait_result_t           wresult;
1271         wait_queue_t            wqueue;
1272         spl_t                           s;
1273
1274         assert(event != NO_EVENT);
1275         wqueue = &wait_queues[wait_hash(event)];
1276
1277         s = splsched();
1278         wait_queue_lock(wqueue);
1279         thread_lock(thread);
1280
1281         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1282                 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
1283                 VM_KERNEL_UNSLIDE(event), interruptible, deadline, 0, 0);
1284
1285         wresult = wait_queue_assert_wait64_locked(wqueue, CAST_DOWN(event64_t,event),
1286                                                   interruptible,
1287                                                   TIMEOUT_URGENCY_SYS_NORMAL, deadline, 0,
1288                                                   thread);
1289
1290         thread_unlock(thread);
1291         wait_queue_unlock(wqueue);
1292         splx(s);
1293
1294         return (wresult);
1295 }
1296
1297 wait_result_t
1298 assert_wait_deadline_with_leeway(
1299         event_t                         event,
1300         wait_interrupt_t        interruptible,
1301         wait_timeout_urgency_t  urgency,
1302         uint64_t                        deadline,
1303         uint64_t                        leeway)
1304 {
1305         thread_t                        thread = current_thread();
1306         wait_result_t           wresult;
1307         wait_queue_t            wqueue;
1308         spl_t                           s;
1309
1310         if(event == NO_EVENT)
1311                 panic("assert_wait_deadline_with_leeway() called with NO_EVENT");
1312
1313         wqueue = &wait_queues[wait_hash(event)];
1314
1315         s = splsched();
1316         wait_queue_lock(wqueue);
1317         thread_lock(thread);
1318
1319         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1320                 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
1321                 VM_KERNEL_UNSLIDE(event), interruptible, deadline, 0, 0);
1322
1323         wresult = wait_queue_assert_wait64_locked(wqueue, CAST_DOWN(event64_t,event),
1324                                                   interruptible,
1325                                                   urgency, deadline, leeway,
1326                                                   thread);
1327
1328         thread_unlock(thread);
1329         wait_queue_unlock(wqueue);
1330         splx(s);
1331
1332         return (wresult);
1333 }
1334
1335 /*
1336  * thread_isoncpu:
1337  *
1338  * Return TRUE if a thread is running on a processor such that an AST
1339  * is needed to pull it out of userspace execution, or if executing in
1340  * the kernel, bring to a context switch boundary that would cause
1341  * thread state to be serialized in the thread PCB.
1342  *
1343  * Thread locked, returns the same way. While locked, fields
1344  * like "state" cannot change. "runq" can change only from set to unset.
1345  */
1346 static inline boolean_t
1347 thread_isoncpu(thread_t thread)
1348 {
1349         /* Not running or runnable */
1350         if (!(thread->state & TH_RUN))
1351                 return (FALSE);
1352
1353         /* Waiting on a runqueue, not currently running */
1354         /* TODO: This is invalid - it can get dequeued without thread lock, but not context switched. */
1355         if (thread->runq != PROCESSOR_NULL)
1356                 return (FALSE);
1357
1358         /*
1359          * Thread must be running on a processor, or
1360          * about to run, or just did run. In all these
1361          * cases, an AST to the processor is needed
1362          * to guarantee that the thread is kicked out
1363          * of userspace and the processor has
1364          * context switched (and saved register state).
1365          */
1366         return (TRUE);
1367 }
1368
1369 /*
1370  * thread_stop:
1371  *
1372  * Force a preemption point for a thread and wait
1373  * for it to stop running on a CPU. If a stronger
1374  * guarantee is requested, wait until no longer
1375  * runnable. Arbitrates access among
1376  * multiple stop requests. (released by unstop)
1377  *
1378  * The thread must enter a wait state and stop via a
1379  * separate means.
1380  *
1381  * Returns FALSE if interrupted.
1382  */
1383 boolean_t
1384 thread_stop(
1385         thread_t                thread,
1386         boolean_t       until_not_runnable)
1387 {
1388         wait_result_t   wresult;
1389         spl_t                   s = splsched();
1390         boolean_t               oncpu;
1391
1392         wake_lock(thread);
1393         thread_lock(thread);
1394
1395         while (thread->state & TH_SUSP) {
1396                 thread->wake_active = TRUE;
1397                 thread_unlock(thread);
1398
1399                 wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
1400                 wake_unlock(thread);
1401                 splx(s);
1402
1403                 if (wresult == THREAD_WAITING)
1404                         wresult = thread_block(THREAD_CONTINUE_NULL);
1405
1406                 if (wresult != THREAD_AWAKENED)
1407                         return (FALSE);
1408
1409                 s = splsched();
1410                 wake_lock(thread);
1411                 thread_lock(thread);
1412         }
1413
1414         thread->state |= TH_SUSP;
1415
1416         while ((oncpu = thread_isoncpu(thread)) ||
1417                    (until_not_runnable && (thread->state & TH_RUN))) {
1418                 processor_t             processor;
1419
1420                 if (oncpu) {
1421                         assert(thread->state & TH_RUN);
1422                         processor = thread->chosen_processor;
1423                         cause_ast_check(processor);
1424                 }
1425
1426                 thread->wake_active = TRUE;
1427                 thread_unlock(thread);
1428
1429                 wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
1430                 wake_unlock(thread);
1431                 splx(s);
1432
1433                 if (wresult == THREAD_WAITING)
1434                         wresult = thread_block(THREAD_CONTINUE_NULL);
1435
1436                 if (wresult != THREAD_AWAKENED) {
1437                         thread_unstop(thread);
1438                         return (FALSE);
1439                 }
1440
1441                 s = splsched();
1442                 wake_lock(thread);
1443                 thread_lock(thread);
1444         }
1445
1446         thread_unlock(thread);
1447         wake_unlock(thread);
1448         splx(s);
1449
1450         /*
1451          * We return with the thread unlocked. To prevent it from
1452          * transitioning to a runnable state (or from TH_RUN to
1453          * being on the CPU), the caller must ensure the thread
1454          * is stopped via an external means (such as an AST)
1455          */
1456
1457         return (TRUE);
1458 }
1459
1460 /*
1461  * thread_unstop:
1462  *
1463  * Release a previous stop request and set
1464  * the thread running if appropriate.
1465  *
1466  * Use only after a successful stop operation.
1467  */
1468 void
1469 thread_unstop(
1470         thread_t        thread)
1471 {
1472         spl_t           s = splsched();
1473
1474         wake_lock(thread);
1475         thread_lock(thread);
1476
1477         if ((thread->state & (TH_RUN|TH_WAIT|TH_SUSP)) == TH_SUSP) {
1478                 thread->state &= ~TH_SUSP;
1479                 thread_unblock(thread, THREAD_AWAKENED);
1480
1481                 thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
1482         }
1483         else
1484         if (thread->state & TH_SUSP) {
1485                 thread->state &= ~TH_SUSP;
1486
1487                 if (thread->wake_active) {
1488                         thread->wake_active = FALSE;
1489                         thread_unlock(thread);
1490
1491                         thread_wakeup(&thread->wake_active);
1492                         wake_unlock(thread);
1493                         splx(s);
1494
1495                         return;
1496                 }
1497         }
1498
1499         thread_unlock(thread);
1500         wake_unlock(thread);
1501         splx(s);
1502 }
1503
1504 /*
1505  * thread_wait:
1506  *
1507  * Wait for a thread to stop running. (non-interruptible)
1508  *
1509  */
1510 void
1511 thread_wait(
1512         thread_t        thread,
1513         boolean_t       until_not_runnable)
1514 {
1515         wait_result_t   wresult;
1516         boolean_t       oncpu;
1517         processor_t     processor;
1518         spl_t           s = splsched();
1519
1520         wake_lock(thread);
1521         thread_lock(thread);
1522
1523         /*
1524          * Wait until not running on a CPU.  If stronger requirement
1525          * desired, wait until not runnable.  Assumption: if thread is
1526          * on CPU, then TH_RUN is set, so we're not waiting in any case
1527          * where the original, pure "TH_RUN" check would have let us
1528          * finish.
1529          */
1530         while ((oncpu = thread_isoncpu(thread)) ||
1531                         (until_not_runnable && (thread->state & TH_RUN))) {
1532
1533                 if (oncpu) {
1534                         assert(thread->state & TH_RUN);
1535                         processor = thread->chosen_processor;
1536                         cause_ast_check(processor);
1537                 }
1538
1539                 thread->wake_active = TRUE;
1540                 thread_unlock(thread);
1541
1542                 wresult = assert_wait(&thread->wake_active, THREAD_UNINT);
1543                 wake_unlock(thread);
1544                 splx(s);
1545
1546                 if (wresult == THREAD_WAITING)
1547                         thread_block(THREAD_CONTINUE_NULL);
1548
1549                 s = splsched();
1550                 wake_lock(thread);
1551                 thread_lock(thread);
1552         }
1553
1554         thread_unlock(thread);
1555         wake_unlock(thread);
1556         splx(s);
1557 }
1558
1559 /*
1560  *      Routine: clear_wait_internal
1561  *
1562  *              Clear the wait condition for the specified thread.
1563  *              Start the thread executing if that is appropriate.
1564  *      Arguments:
1565  *              thread          thread to awaken
1566  *              result          Wakeup result the thread should see
1567  *      Conditions:
1568  *              At splsched
1569  *              the thread is locked.
1570  *      Returns:
1571  *              KERN_SUCCESS            thread was rousted out a wait
1572  *              KERN_FAILURE            thread was waiting but could not be rousted
1573  *              KERN_NOT_WAITING        thread was not waiting
1574  */
1575 __private_extern__ kern_return_t
1576 clear_wait_internal(
1577         thread_t                thread,
1578         wait_result_t   wresult)
1579 {
1580         wait_queue_t    wq = thread->wait_queue;
1581         uint32_t        i = LockTimeOut;
1582
1583         do {
1584                 if (wresult == THREAD_INTERRUPTED && (thread->state & TH_UNINT))
1585                         return (KERN_FAILURE);
1586
1587                 if (wq != WAIT_QUEUE_NULL) {
1588                         if (wait_queue_lock_try(wq)) {
1589                                 wait_queue_pull_thread_locked(wq, thread, TRUE);
1590                                 /* wait queue unlocked, thread still locked */
1591                         }
1592                         else {
1593                                 thread_unlock(thread);
1594                                 delay(1);
1595
1596                                 thread_lock(thread);
1597                                 if (wq != thread->wait_queue)
1598                                         return (KERN_NOT_WAITING);
1599
1600                                 continue;
1601                         }
1602                 }
1603
1604                 return (thread_go(thread, wresult));
1605         } while ((--i > 0) || machine_timeout_suspended());
1606
1607         panic("clear_wait_internal: deadlock: thread=%p, wq=%p, cpu=%d\n",
1608                   thread, wq, cpu_number());
1609
1610         return (KERN_FAILURE);
1611 }
1612
1613
1614 /*
1615  *      clear_wait:
1616  *
1617  *      Clear the wait condition for the specified thread.  Start the thread
1618  *      executing if that is appropriate.
1619  *
1620  *      parameters:
1621  *        thread                thread to awaken
1622  *        result                Wakeup result the thread should see
1623  */
1624 kern_return_t
1625 clear_wait(
1626         thread_t                thread,
1627         wait_result_t   result)
1628 {
1629         kern_return_t ret;
1630         spl_t           s;
1631
1632         s = splsched();
1633         thread_lock(thread);
1634         ret = clear_wait_internal(thread, result);
1635         thread_unlock(thread);
1636         splx(s);
1637         return ret;
1638 }
1639
1640
1641 /*
1642  *      thread_wakeup_prim:
1643  *
1644  *      Common routine for thread_wakeup, thread_wakeup_with_result,
1645  *      and thread_wakeup_one.
1646  *
1647  */
1648 kern_return_t
1649 thread_wakeup_prim(
1650         event_t                 event,
1651         boolean_t               one_thread,
1652         wait_result_t           result)
1653 {
1654         return (thread_wakeup_prim_internal(event, one_thread, result, -1));
1655 }
1656
1657
1658 kern_return_t
1659 thread_wakeup_prim_internal(
1660         event_t                 event,
1661         boolean_t               one_thread,
1662         wait_result_t           result,
1663         int                     priority)
1664 {
1665         register wait_queue_t   wq;
1666         register int                    index;
1667
1668         if(event == NO_EVENT)
1669                 panic("thread_wakeup_prim() called with NO_EVENT");
1670
1671         index = wait_hash(event);
1672         wq = &wait_queues[index];
1673         if (one_thread)
1674                 return (wait_queue_wakeup_one(wq, event, result, priority));
1675         else
1676             return (wait_queue_wakeup_all(wq, event, result));
1677 }
1678
1679 /*
1680  *      thread_bind:
1681  *
1682  *      Force the current thread to execute on the specified processor.
1683  *      Takes effect after the next thread_block().
1684  *
1685  *      Returns the previous binding.  PROCESSOR_NULL means
1686  *      not bound.
1687  *
1688  *      XXX - DO NOT export this to users - XXX
1689  */
1690 processor_t
1691 thread_bind(
1692         processor_t             processor)
1693 {
1694         thread_t                self = current_thread();
1695         processor_t             prev;
1696         spl_t                   s;
1697
1698         s = splsched();
1699         thread_lock(self);
1700
1701         /* <rdar://problem/15102234> */
1702         assert(self->sched_pri < BASEPRI_RTQUEUES);
1703
1704         prev = self->bound_processor;
1705         self->bound_processor = processor;
1706
1707         thread_unlock(self);
1708         splx(s);
1709
1710         return (prev);
1711 }
1712
1713 /* Invoked prior to idle entry to determine if, on SMT capable processors, an SMT
1714  * rebalancing opportunity exists when a core is (instantaneously) idle, but
1715  * other SMT-capable cores may be over-committed. TODO: some possible negatives:
1716  * IPI thrash if this core does not remain idle following the load balancing ASTs
1717  * Idle "thrash", when IPI issue is followed by idle entry/core power down
1718  * followed by a wakeup shortly thereafter.
1719  */
1720
1721 /* Invoked with pset locked, returns with pset unlocked */
1722 #if (DEVELOPMENT || DEBUG)
1723 int sched_smt_balance = 1;
1724 #endif
1725
1726 static void
1727 sched_SMT_balance(processor_t cprocessor, processor_set_t cpset) {
1728         processor_t ast_processor = NULL;
1729
1730 #if (DEVELOPMENT || DEBUG)
1731         if (__improbable(sched_smt_balance == 0))
1732                 goto smt_balance_exit;
1733 #endif
1734
1735         assert(cprocessor == current_processor());
1736         if (cprocessor->is_SMT == FALSE)
1737                 goto smt_balance_exit;
1738
1739         processor_t sib_processor = cprocessor->processor_secondary ? cprocessor->processor_secondary : cprocessor->processor_primary;
1740
1741         /* Determine if both this processor and its sibling are idle,
1742          * indicating an SMT rebalancing opportunity.
1743          */
1744         if (sib_processor->state != PROCESSOR_IDLE)
1745                 goto smt_balance_exit;
1746
1747         processor_t sprocessor;
1748
1749         sprocessor = (processor_t)queue_first(&cpset->active_queue);
1750
1751         while (!queue_end(&cpset->active_queue, (queue_entry_t)sprocessor)) {
1752                 if ((sprocessor->state == PROCESSOR_RUNNING) &&
1753                     (sprocessor->processor_primary != sprocessor) &&
1754                     (sprocessor->processor_primary->state == PROCESSOR_RUNNING) &&
1755                     (sprocessor->current_pri < BASEPRI_RTQUEUES) &&
1756                     ((cpset->pending_AST_cpu_mask & (1U << sprocessor->cpu_id)) == 0)) {
1757                         assert(sprocessor != cprocessor);
1758                         ast_processor = sprocessor;
1759                         break;
1760                 }
1761                 sprocessor = (processor_t)queue_next((queue_entry_t)sprocessor);
1762         }
1763
1764 smt_balance_exit:
1765         pset_unlock(cpset);
1766
1767         if (ast_processor) {
1768                 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_SMT_BALANCE), ast_processor->cpu_id, ast_processor->state, ast_processor->processor_primary->state, 0, 0);
1769                 cause_ast_check(ast_processor);
1770         }
1771 }
1772
1773 /*
1774  *      thread_select:
1775  *
1776  *      Select a new thread for the current processor to execute.
1777  *
1778  *      May select the current thread, which must be locked.
1779  */
1780 static thread_t
1781 thread_select(
1782         thread_t                        thread,
1783         processor_t                     processor,
1784         ast_t                           reason)
1785 {
1786         processor_set_t         pset = processor->processor_set;
1787         thread_t                        new_thread = THREAD_NULL;
1788
1789         assert(processor == current_processor());
1790
1791         do {
1792                 /*
1793                  *      Update the priority.
1794                  */
1795                 if (SCHED(can_update_priority)(thread))
1796                         SCHED(update_priority)(thread);
1797
1798                 processor->current_pri = thread->sched_pri;
1799                 processor->current_thmode = thread->sched_mode;
1800                 processor->current_sfi_class = thread->sfi_class;
1801
1802                 pset_lock(pset);
1803
1804                 assert(processor->state != PROCESSOR_OFF_LINE);
1805
1806                 if (processor->processor_primary != processor) {
1807                         /*
1808                          * Should this secondary SMT processor attempt to find work? For pset runqueue systems,
1809                          * we should look for work only under the same conditions that choose_processor()
1810                          * would have assigned work, which is when all primary processors have been assigned work.
1811                          *
1812                          * An exception is that bound threads are dispatched to a processor without going through
1813                          * choose_processor(), so in those cases we should continue trying to dequeue work.
1814                          */
1815                         if (!SCHED(processor_bound_count)(processor) && !queue_empty(&pset->idle_queue) && !rt_runq.count) {
1816                                 goto idle;
1817                         }
1818                 }
1819
1820                 simple_lock(&rt_lock);
1821
1822                 /*
1823                  *      Test to see if the current thread should continue
1824                  *      to run on this processor.  Must be runnable, and not
1825                  *      bound to a different processor, nor be in the wrong
1826                  *      processor set.
1827                  */
1828                 if (((thread->state & ~TH_SUSP) == TH_RUN) &&
1829                     (thread->sched_pri >= BASEPRI_RTQUEUES     || processor->processor_primary == processor) &&
1830                     (thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor)      &&
1831                     (thread->affinity_set == AFFINITY_SET_NULL || thread->affinity_set->aset_pset == pset)) {
1832                         if (thread->sched_pri >= BASEPRI_RTQUEUES && first_timeslice(processor)) {
1833                                 if (rt_runq.count > 0) {
1834                                         thread_t next_rt;
1835
1836                                         next_rt = (thread_t)queue_first(&rt_runq.queue);
1837                                         if (next_rt->realtime.deadline < processor->deadline &&
1838                                            (next_rt->bound_processor == PROCESSOR_NULL || next_rt->bound_processor == processor)) {
1839                                                 thread = (thread_t)dequeue_head(&rt_runq.queue);
1840                                                 thread->runq = PROCESSOR_NULL;
1841                                                 SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count);
1842                                                 rt_runq.count--;
1843                                         }
1844                                 }
1845
1846                                 simple_unlock(&rt_lock);
1847
1848                                 processor->deadline = thread->realtime.deadline;
1849
1850                                 pset_unlock(pset);
1851
1852                                 return (thread);
1853                         }
1854
1855                         if ((thread->sched_mode != TH_MODE_FAIRSHARE || SCHED(fairshare_runq_count)() == 0) && (rt_runq.count == 0 || BASEPRI_RTQUEUES < thread->sched_pri) && (new_thread = SCHED(choose_thread)(processor, thread->sched_mode == TH_MODE_FAIRSHARE ? MINPRI : thread->sched_pri, reason)) == THREAD_NULL) {
1856
1857                                 simple_unlock(&rt_lock);
1858
1859                                 /* This thread is still the highest priority runnable (non-idle) thread */
1860
1861                                 processor->deadline = UINT64_MAX;
1862
1863                                 pset_unlock(pset);
1864
1865                                 return (thread);
1866                         }
1867                 }
1868
1869                 if (new_thread != THREAD_NULL ||
1870                                 (SCHED(processor_queue_has_priority)(processor, rt_runq.count == 0 ? IDLEPRI : BASEPRI_RTQUEUES, TRUE) &&
1871                                          (new_thread = SCHED(choose_thread)(processor, MINPRI, reason)) != THREAD_NULL)) {
1872                                 simple_unlock(&rt_lock);
1873
1874                                 processor->deadline = UINT64_MAX;
1875                                 pset_unlock(pset);
1876
1877                                 return (new_thread);
1878                 }
1879
1880                 if (rt_runq.count > 0) {
1881                         thread_t next_rt = (thread_t)queue_first(&rt_runq.queue);
1882
1883                         if (__probable((next_rt->bound_processor == NULL || (next_rt->bound_processor == processor)))) {
1884                                 thread = (thread_t)dequeue_head(&rt_runq.queue);
1885
1886                                 thread->runq = PROCESSOR_NULL;
1887                                 SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count);
1888                                 rt_runq.count--;
1889
1890                                 simple_unlock(&rt_lock);
1891
1892                                 processor->deadline = thread->realtime.deadline;
1893                                 pset_unlock(pset);
1894
1895                                 return (thread);
1896                         }
1897                 }
1898
1899                 simple_unlock(&rt_lock);
1900
1901                 /* No realtime threads and no normal threads on the per-processor
1902                  * runqueue. Finally check for global fairshare threads.
1903                  */
1904                 if ((new_thread = SCHED(fairshare_dequeue)()) != THREAD_NULL) {
1905
1906                         processor->deadline = UINT64_MAX;
1907                         pset_unlock(pset);
1908
1909                         return (new_thread);
1910                 }
1911
1912                 processor->deadline = UINT64_MAX;
1913
1914                 /*
1915                  *      No runnable threads, attempt to steal
1916                  *      from other processors.
1917                  */
1918                 new_thread = SCHED(steal_thread)(pset);
1919                 if (new_thread != THREAD_NULL) {
1920                         return (new_thread);
1921                 }
1922
1923                 /*
1924                  *      If other threads have appeared, shortcut
1925                  *      around again.
1926                  */
1927                 if (!SCHED(processor_queue_empty)(processor) || rt_runq.count > 0 || SCHED(fairshare_runq_count)() > 0)
1928                         continue;
1929
1930                 pset_lock(pset);
1931
1932         idle:
1933                 /*
1934                  *      Nothing is runnable, so set this processor idle if it
1935                  *      was running.
1936                  */
1937                 if (processor->state == PROCESSOR_RUNNING) {
1938                         remqueue((queue_entry_t)processor);
1939                         processor->state = PROCESSOR_IDLE;
1940
1941                         if (processor->processor_primary == processor) {
1942                                 enqueue_head(&pset->idle_queue, (queue_entry_t)processor);
1943                         }
1944                         else {
1945                                 enqueue_head(&pset->idle_secondary_queue, (queue_entry_t)processor);
1946                         }
1947                 }
1948
1949                 /* Invoked with pset locked, returns with pset unlocked */
1950                 sched_SMT_balance(processor, pset);
1951
1952 #if CONFIG_SCHED_IDLE_IN_PLACE
1953                 /*
1954                  *      Choose idle thread if fast idle is not possible.
1955                  */
1956                 if (processor->processor_primary != processor)
1957                         return (processor->idle_thread);
1958
1959                 if ((thread->state & (TH_IDLE|TH_TERMINATE|TH_SUSP)) || !(thread->state & TH_WAIT) || thread->wake_active || thread->sched_pri >= BASEPRI_RTQUEUES)
1960                         return (processor->idle_thread);
1961
1962                 /*
1963                  *      Perform idling activities directly without a
1964                  *      context switch.  Return dispatched thread,
1965                  *      else check again for a runnable thread.
1966                  */
1967                 new_thread = thread_select_idle(thread, processor);
1968
1969 #else /* !CONFIG_SCHED_IDLE_IN_PLACE */
1970
1971                 /*
1972                  * Do a full context switch to idle so that the current
1973                  * thread can start running on another processor without
1974                  * waiting for the fast-idled processor to wake up.
1975                  */
1976                 return (processor->idle_thread);
1977
1978 #endif /* !CONFIG_SCHED_IDLE_IN_PLACE */
1979
1980         } while (new_thread == THREAD_NULL);
1981
1982         return (new_thread);
1983 }
1984
1985 #if CONFIG_SCHED_IDLE_IN_PLACE
1986 /*
1987  *      thread_select_idle:
1988  *
1989  *      Idle the processor using the current thread context.
1990  *
1991  *      Called with thread locked, then dropped and relocked.
1992  */
1993 static thread_t
1994 thread_select_idle(
1995         thread_t                thread,
1996         processor_t             processor)
1997 {
1998         thread_t                new_thread;
1999         uint64_t                arg1, arg2;
2000         int                     urgency;
2001
2002         if (thread->sched_mode == TH_MODE_TIMESHARE) {
2003                 if (thread->sched_flags & TH_SFLAG_THROTTLED)
2004                         sched_background_decr(thread);
2005
2006                 sched_share_decr(thread);
2007         }
2008         sched_run_decr(thread);
2009
2010         thread->state |= TH_IDLE;
2011         processor->current_pri = IDLEPRI;
2012         processor->current_thmode = TH_MODE_NONE;
2013         processor->current_sfi_class = SFI_CLASS_KERNEL;
2014
2015         /* Reload precise timing global policy to thread-local policy */
2016         thread->precise_user_kernel_time = use_precise_user_kernel_time(thread);
2017
2018         thread_unlock(thread);
2019
2020         /*
2021          *      Switch execution timing to processor idle thread.
2022          */
2023         processor->last_dispatch = mach_absolute_time();
2024
2025 #ifdef CONFIG_MACH_APPROXIMATE_TIME
2026         commpage_update_mach_approximate_time(processor->last_dispatch);
2027 #endif
2028
2029         thread->last_run_time = processor->last_dispatch;
2030         thread_timer_event(processor->last_dispatch, &processor->idle_thread->system_timer);
2031         PROCESSOR_DATA(processor, kernel_timer) = &processor->idle_thread->system_timer;
2032
2033         /*
2034          *      Cancel the quantum timer while idling.
2035          */
2036         timer_call_cancel(&processor->quantum_timer);
2037         processor->timeslice = 0;
2038
2039         (*thread->sched_call)(SCHED_CALL_BLOCK, thread);
2040
2041         thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, NULL);
2042
2043         /*
2044          *      Enable interrupts and perform idling activities.  No
2045          *      preemption due to TH_IDLE being set.
2046          */
2047         spllo(); new_thread = processor_idle(thread, processor);
2048
2049         /*
2050          *      Return at splsched.
2051          */
2052         (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
2053
2054         thread_lock(thread);
2055
2056         /*
2057          *      If awakened, switch to thread timer and start a new quantum.
2058          *      Otherwise skip; we will context switch to another thread or return here.
2059          */
2060         if (!(thread->state & TH_WAIT)) {
2061                 processor->last_dispatch = mach_absolute_time();
2062                 thread_timer_event(processor->last_dispatch, &thread->system_timer);
2063                 PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
2064
2065                 thread_quantum_init(thread);
2066                 processor->quantum_end = processor->last_dispatch + thread->quantum_remaining;
2067                 timer_call_enter1(&processor->quantum_timer, thread, processor->quantum_end, TIMER_CALL_SYS_CRITICAL | TIMER_CALL_LOCAL);
2068                 processor->timeslice = 1;
2069
2070                 thread->computation_epoch = processor->last_dispatch;
2071         }
2072
2073         thread->state &= ~TH_IDLE;
2074
2075         /*
2076          * If we idled in place, simulate a context switch back
2077          * to the original priority of the thread so that the
2078          * platform layer cannot distinguish this from a true
2079          * switch to the idle thread.
2080          */
2081
2082         urgency = thread_get_urgency(thread, &arg1, &arg2);
2083
2084         thread_tell_urgency(urgency, arg1, arg2, new_thread);
2085
2086         sched_run_incr(thread);
2087         if (thread->sched_mode == TH_MODE_TIMESHARE) {
2088                 sched_share_incr(thread);
2089
2090                 if (thread->sched_flags & TH_SFLAG_THROTTLED)
2091                         sched_background_incr(thread);
2092         }
2093
2094         return (new_thread);
2095 }
2096 #endif /* CONFIG_SCHED_IDLE_IN_PLACE */
2097
2098 #if defined(CONFIG_SCHED_TRADITIONAL)
2099 static thread_t
2100 sched_traditional_choose_thread(
2101                                 processor_t     processor,
2102                                 int             priority,
2103                        __unused ast_t           reason)
2104 {
2105         thread_t thread;
2106
2107         thread = choose_thread_from_runq(processor, runq_for_processor(processor), priority);
2108         if (thread != THREAD_NULL) {
2109                 runq_consider_decr_bound_count(processor, thread);
2110         }
2111
2112         return thread;
2113 }
2114
2115 #endif /* defined(CONFIG_SCHED_TRADITIONAL)  */
2116
2117 #if defined(CONFIG_SCHED_TRADITIONAL)
2118
2119 /*
2120  *      choose_thread_from_runq:
2121  *
2122  *      Locate a thread to execute from the processor run queue
2123  *      and return it.  Only choose a thread with greater or equal
2124  *      priority.
2125  *
2126  *      Associated pset must be locked.  Returns THREAD_NULL
2127  *      on failure.
2128  */
2129 thread_t
2130 choose_thread_from_runq(
2131         processor_t             processor,
2132         run_queue_t             rq,
2133         int                             priority)
2134 {
2135         queue_t                 queue = rq->queues + rq->highq;
2136         int                             pri = rq->highq, count = rq->count;
2137         thread_t                thread;
2138
2139         while (count > 0 && pri >= priority) {
2140                 thread = (thread_t)queue_first(queue);
2141                 while (!queue_end(queue, (queue_entry_t)thread)) {
2142                         if (thread->bound_processor == PROCESSOR_NULL ||
2143                                                         thread->bound_processor == processor) {
2144                                 remqueue((queue_entry_t)thread);
2145
2146                                 thread->runq = PROCESSOR_NULL;
2147                                 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
2148                                 rq->count--;
2149                                 if (SCHED(priority_is_urgent)(pri)) {
2150                                         rq->urgency--; assert(rq->urgency >= 0);
2151                                 }
2152                                 if (queue_empty(queue)) {
2153                                         if (pri != IDLEPRI)
2154                                                 clrbit(MAXPRI - pri, rq->bitmap);
2155                                         rq->highq = MAXPRI - ffsbit(rq->bitmap);
2156                                 }
2157
2158                                 return (thread);
2159                         }
2160                         count--;
2161
2162                         thread = (thread_t)queue_next((queue_entry_t)thread);
2163                 }
2164
2165                 queue--; pri--;
2166         }
2167
2168         return (THREAD_NULL);
2169 }
2170
2171 #endif /* defined(CONFIG_SCHED_TRADITIONAL) */
2172
2173 /*
2174  *      Perform a context switch and start executing the new thread.
2175  *
2176  *      Returns FALSE on failure, and the thread is re-dispatched.
2177  *
2178  *      Called at splsched.
2179  */
2180
2181 /*
2182  * thread_invoke
2183  *
2184  * "self" is what is currently running on the processor,
2185  * "thread" is the new thread to context switch to
2186  * (which may be the same thread in some cases)
2187  */
2188 static boolean_t
2189 thread_invoke(
2190         thread_t                        self,
2191         thread_t                        thread,
2192         ast_t                           reason)
2193 {
2194         thread_continue_t       continuation = self->continuation;
2195         void                    *parameter = self->parameter;
2196         processor_t             processor;
2197         uint64_t                ctime = mach_absolute_time();
2198
2199 #ifdef CONFIG_MACH_APPROXIMATE_TIME
2200         commpage_update_mach_approximate_time(ctime);
2201 #endif
2202
2203         if (__improbable(get_preemption_level() != 0)) {
2204                 int pl = get_preemption_level();
2205                 panic("thread_invoke: preemption_level %d, possible cause: %s",
2206                     pl, (pl < 0 ? "unlocking an unlocked mutex or spinlock" :
2207                         "blocking while holding a spinlock, or within interrupt context"));
2208         }
2209
2210         assert(self == current_thread());
2211         assert(self->runq == PROCESSOR_NULL);
2212
2213 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
2214         sched_traditional_consider_maintenance(ctime);
2215 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
2216
2217         /*
2218          * Mark thread interruptible.
2219          */
2220         thread_lock(thread);
2221         thread->state &= ~TH_UNINT;
2222
2223         assert(thread_runnable(thread));
2224         assert(thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == current_processor());
2225         assert(thread->runq == PROCESSOR_NULL);
2226
2227         /* Reload precise timing global policy to thread-local policy */
2228         thread->precise_user_kernel_time = use_precise_user_kernel_time(thread);
2229
2230         /* Update SFI class based on other factors */
2231         thread->sfi_class = sfi_thread_classify(thread);
2232
2233         /*
2234          * Allow time constraint threads to hang onto
2235          * a stack.
2236          */
2237         if ((self->sched_mode == TH_MODE_REALTIME) && !self->reserved_stack)
2238                 self->reserved_stack = self->kernel_stack;
2239
2240         if (continuation != NULL) {
2241                 if (!thread->kernel_stack) {
2242                         /*
2243                          * If we are using a privileged stack,
2244                          * check to see whether we can exchange it with
2245                          * that of the other thread.
2246                          */
2247                         if (self->kernel_stack == self->reserved_stack && !thread->reserved_stack)
2248                                 goto need_stack;
2249
2250                         /*
2251                          * Context switch by performing a stack handoff.
2252                          */
2253                         continuation = thread->continuation;
2254                         parameter = thread->parameter;
2255
2256                         processor = current_processor();
2257                         processor->active_thread = thread;
2258                         processor->current_pri = thread->sched_pri;
2259                         processor->current_thmode = thread->sched_mode;
2260                         processor->current_sfi_class = thread->sfi_class;
2261                         if (thread->last_processor != processor && thread->last_processor != NULL) {
2262                                 if (thread->last_processor->processor_set != processor->processor_set)
2263                                         thread->ps_switch++;
2264                                 thread->p_switch++;
2265                         }
2266                         thread->last_processor = processor;
2267                         thread->c_switch++;
2268                         ast_context(thread);
2269                         thread_unlock(thread);
2270
2271                         self->reason = reason;
2272
2273                         processor->last_dispatch = ctime;
2274                         self->last_run_time = ctime;
2275                         thread_timer_event(ctime, &thread->system_timer);
2276                         PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
2277
2278                         /*
2279                          * Since non-precise user/kernel time doesn't update the state timer
2280                          * during privilege transitions, synthesize an event now.
2281                          */
2282                         if (!thread->precise_user_kernel_time) {
2283                                 timer_switch(PROCESSOR_DATA(processor, current_state),
2284                                                         ctime,
2285                                                          PROCESSOR_DATA(processor, current_state));
2286                         }
2287
2288                         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2289                                 MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_HANDOFF)|DBG_FUNC_NONE,
2290                                 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
2291
2292                         if ((thread->chosen_processor != processor) && (thread->chosen_processor != PROCESSOR_NULL)) {
2293                                 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED)|DBG_FUNC_NONE,
2294                                                 (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
2295                         }
2296
2297                         DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, thread->task->bsd_info);
2298
2299                         SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
2300
2301                         TLOG(1, "thread_invoke: calling stack_handoff\n");
2302                         stack_handoff(self, thread);
2303
2304                         DTRACE_SCHED(on__cpu);
2305
2306                         thread_dispatch(self, thread);
2307
2308                         thread->continuation = thread->parameter = NULL;
2309
2310                         counter(c_thread_invoke_hits++);
2311
2312                         (void) spllo();
2313
2314                         assert(continuation);
2315                         call_continuation(continuation, parameter, thread->wait_result);
2316                         /*NOTREACHED*/
2317                 }
2318                 else if (thread == self) {
2319                         /* same thread but with continuation */
2320                         ast_context(self);
2321                         counter(++c_thread_invoke_same);
2322                         thread_unlock(self);
2323
2324                         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2325                                 MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE,
2326                                 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
2327
2328                         self->continuation = self->parameter = NULL;
2329
2330                         (void) spllo();
2331
2332                         call_continuation(continuation, parameter, self->wait_result);
2333                         /*NOTREACHED*/
2334                 }
2335         }
2336         else {
2337                 /*
2338                  * Check that the other thread has a stack
2339                  */
2340                 if (!thread->kernel_stack) {
2341 need_stack:
2342                         if (!stack_alloc_try(thread)) {
2343                                 counter(c_thread_invoke_misses++);
2344                                 thread_unlock(thread);
2345                                 thread_stack_enqueue(thread);
2346                                 return (FALSE);
2347                         }
2348                 }
2349                 else if (thread == self) {
2350                         ast_context(self);
2351                         counter(++c_thread_invoke_same);
2352                         thread_unlock(self);
2353
2354                         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2355                                 MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE,
2356                                 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
2357
2358                         return (TRUE);
2359                 }
2360         }
2361
2362         /*
2363          * Context switch by full context save.
2364          */
2365         processor = current_processor();
2366         processor->active_thread = thread;
2367         processor->current_pri = thread->sched_pri;
2368         processor->current_thmode = thread->sched_mode;
2369         processor->current_sfi_class = thread->sfi_class;
2370         if (thread->last_processor != processor && thread->last_processor != NULL) {
2371                 if (thread->last_processor->processor_set != processor->processor_set)
2372                         thread->ps_switch++;
2373                 thread->p_switch++;
2374         }
2375         thread->last_processor = processor;
2376         thread->c_switch++;
2377         ast_context(thread);
2378         thread_unlock(thread);
2379
2380         counter(c_thread_invoke_csw++);
2381
2382         assert(self->runq == PROCESSOR_NULL);
2383         self->reason = reason;
2384
2385         processor->last_dispatch = ctime;
2386         self->last_run_time = ctime;
2387         thread_timer_event(ctime, &thread->system_timer);
2388         PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
2389
2390         /*
2391          * Since non-precise user/kernel time doesn't update the state timer
2392          * during privilege transitions, synthesize an event now.
2393          */
2394         if (!thread->precise_user_kernel_time) {
2395                 timer_switch(PROCESSOR_DATA(processor, current_state),
2396                                         ctime,
2397                                          PROCESSOR_DATA(processor, current_state));
2398         }
2399
2400         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2401                 MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE,
2402                 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
2403
2404         if ((thread->chosen_processor != processor) && (thread->chosen_processor != NULL)) {
2405                 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED)|DBG_FUNC_NONE,
2406                                 (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
2407         }
2408
2409         DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, thread->task->bsd_info);
2410
2411         SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
2412
2413         /*
2414          * This is where we actually switch register context,
2415          * and address space if required.  We will next run
2416          * as a result of a subsequent context switch.
2417          */
2418         assert(continuation == self->continuation);
2419         thread = machine_switch_context(self, continuation, thread);
2420         assert(self == current_thread());
2421         TLOG(1,"thread_invoke: returning machine_switch_context: self %p continuation %p thread %p\n", self, continuation, thread);
2422
2423         DTRACE_SCHED(on__cpu);
2424
2425         /*
2426          * We have been resumed and are set to run.
2427          */
2428         thread_dispatch(thread, self);
2429
2430         if (continuation) {
2431                 self->continuation = self->parameter = NULL;
2432
2433                 (void) spllo();
2434
2435                 call_continuation(continuation, parameter, self->wait_result);
2436                 /*NOTREACHED*/
2437         }
2438
2439         return (TRUE);
2440 }
2441
2442 /*
2443  *      thread_dispatch:
2444  *
2445  *      Handle threads at context switch.  Re-dispatch other thread
2446  *      if still running, otherwise update run state and perform
2447  *      special actions.  Update quantum for other thread and begin
2448  *      the quantum for ourselves.
2449  *
2450  *     "self" is our new current thread that we have context switched
2451  *     to, "thread" is the old thread that we have switched away from.
2452  *
2453  *      Called at splsched.
2454  */
2455 void
2456 thread_dispatch(
2457         thread_t                thread,
2458         thread_t                self)
2459 {
2460         processor_t             processor = self->last_processor;
2461
2462         if (thread != THREAD_NULL) {
2463                 /*
2464                  *      If blocked at a continuation, discard
2465                  *      the stack.
2466                  */
2467                 if (thread->continuation != NULL && thread->kernel_stack != 0)
2468                         stack_free(thread);
2469
2470                 if (!(thread->state & TH_IDLE)) {
2471                         int64_t consumed;
2472                         int64_t remainder = 0;
2473
2474                         if (processor->quantum_end > processor->last_dispatch)
2475                                 remainder = processor->quantum_end -
2476                                     processor->last_dispatch;
2477
2478                         consumed = thread->quantum_remaining - remainder;
2479
2480                         if ((thread->reason & AST_LEDGER) == 0) {
2481                                 /*
2482                                  * Bill CPU time to both the task and
2483                                  * the individual thread.
2484                                  */
2485                                 ledger_credit(thread->t_ledger,
2486                                     task_ledgers.cpu_time, consumed);
2487                                 ledger_credit(thread->t_threadledger,
2488                                     thread_ledgers.cpu_time, consumed);
2489 #ifdef CONFIG_BANK
2490                                 if (thread->t_bankledger) {
2491                                         ledger_credit(thread->t_bankledger,
2492                                                 bank_ledgers.cpu_time,
2493                                                 (consumed - thread->t_deduct_bank_ledger_time));
2494
2495                                 }
2496                                 thread->t_deduct_bank_ledger_time =0;
2497 #endif
2498                         }
2499
2500                         wake_lock(thread);
2501                         thread_lock(thread);
2502
2503                         /*
2504                          *      Compute remainder of current quantum.
2505                          */
2506                         if (first_timeslice(processor) &&
2507                             processor->quantum_end > processor->last_dispatch)
2508                                 thread->quantum_remaining = (uint32_t)remainder;
2509                         else
2510                                 thread->quantum_remaining = 0;
2511
2512                         if (thread->sched_mode == TH_MODE_REALTIME) {
2513                                 /*
2514                                  *      Cancel the deadline if the thread has
2515                                  *      consumed the entire quantum.
2516                                  */
2517                                 if (thread->quantum_remaining == 0) {
2518                                         thread->realtime.deadline = UINT64_MAX;
2519                                 }
2520                         } else {
2521 #if defined(CONFIG_SCHED_TRADITIONAL)
2522                                 /*
2523                                  *      For non-realtime threads treat a tiny
2524                                  *      remaining quantum as an expired quantum
2525                                  *      but include what's left next time.
2526                                  */
2527                                 if (thread->quantum_remaining < min_std_quantum) {
2528                                         thread->reason |= AST_QUANTUM;
2529                                         thread->quantum_remaining += SCHED(initial_quantum_size)(thread);
2530                                 }
2531 #endif
2532                         }
2533
2534                         /*
2535                          *      If we are doing a direct handoff then
2536                          *      take the remainder of the quantum.
2537                          */
2538                         if ((thread->reason & (AST_HANDOFF|AST_QUANTUM)) == AST_HANDOFF) {
2539                                 self->quantum_remaining = thread->quantum_remaining;
2540                                 thread->reason |= AST_QUANTUM;
2541                                 thread->quantum_remaining = 0;
2542                         } else {
2543 #if defined(CONFIG_SCHED_MULTIQ)
2544                                 if (sched_groups_enabled && thread->sched_group == self->sched_group) {
2545                                         /* TODO: Remove tracepoint */
2546                                         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2547                                             MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUANTUM_HANDOFF) | DBG_FUNC_NONE,
2548                                             self->reason, (uintptr_t)thread_tid(thread),
2549                                             self->quantum_remaining, thread->quantum_remaining, 0);
2550
2551                                         self->quantum_remaining = thread->quantum_remaining;
2552                                         thread->quantum_remaining = 0;
2553                                         /*  TODO: Should we set AST_QUANTUM here? */
2554                                 }
2555 #endif /* defined(CONFIG_SCHED_MULTIQ) */
2556                         }
2557
2558                         thread->computation_metered += (processor->last_dispatch - thread->computation_epoch);
2559
2560                         if ((thread->rwlock_count != 0) && !(LcksOpts & disLkRWPrio)) {
2561                                 integer_t priority;
2562
2563                                 priority = thread->sched_pri;
2564
2565                                 if (priority < thread->priority)
2566                                         priority = thread->priority;
2567                                 if (priority < BASEPRI_BACKGROUND)
2568                                         priority = BASEPRI_BACKGROUND;
2569
2570                                 if ((thread->sched_pri < priority) || !(thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
2571                                         KERNEL_DEBUG_CONSTANT(
2572                                                 MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_PROMOTE) | DBG_FUNC_NONE,
2573                                                 (uintptr_t)thread_tid(thread), thread->sched_pri, thread->priority, priority, 0);
2574
2575                                         thread->sched_flags |= TH_SFLAG_RW_PROMOTED;
2576
2577                                         if (thread->sched_pri < priority)
2578                                                 set_sched_pri(thread, priority);
2579                                 }
2580                         }
2581
2582                         if (!(thread->state & TH_WAIT)) {
2583                                 /*
2584                                  *      Still running.
2585                                  */
2586                                 if (thread->reason & AST_QUANTUM)
2587                                         thread_setrun(thread, SCHED_TAILQ);
2588                                 else
2589                                 if (thread->reason & AST_PREEMPT)
2590                                         thread_setrun(thread, SCHED_HEADQ);
2591                                 else
2592                                         thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
2593
2594                                 thread->reason = AST_NONE;
2595
2596                                 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2597                                         MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE,
2598                                         (uintptr_t)thread_tid(thread), thread->reason, thread->state, sched_run_count, 0);
2599
2600                                 if (thread->wake_active) {
2601                                         thread->wake_active = FALSE;
2602                                         thread_unlock(thread);
2603
2604                                         thread_wakeup(&thread->wake_active);
2605                                 }
2606                                 else
2607                                         thread_unlock(thread);
2608
2609                                 wake_unlock(thread);
2610                         }
2611                         else {
2612                                 /*
2613                                  *      Waiting.
2614                                  */
2615                                 boolean_t should_terminate = FALSE;
2616                                 uint32_t new_run_count;
2617
2618                                 /* Only the first call to thread_dispatch
2619                                  * after explicit termination should add
2620                                  * the thread to the termination queue
2621                                  */
2622                                 if ((thread->state & (TH_TERMINATE|TH_TERMINATE2)) == TH_TERMINATE) {
2623                                         should_terminate = TRUE;
2624                                         thread->state |= TH_TERMINATE2;
2625                                 }
2626
2627                                 thread->state &= ~TH_RUN;
2628                                 thread->chosen_processor = PROCESSOR_NULL;
2629
2630                                 if (thread->sched_mode == TH_MODE_TIMESHARE) {
2631                                         if (thread->sched_flags & TH_SFLAG_THROTTLED)
2632                                                 sched_background_decr(thread);
2633
2634                                         sched_share_decr(thread);
2635                                 }
2636                                 new_run_count = sched_run_decr(thread);
2637
2638                                 if ((thread->state & (TH_WAIT | TH_TERMINATE)) == TH_WAIT) {
2639                                         if (thread->reason & AST_SFI) {
2640                                                 thread->wait_sfi_begin_time = processor->last_dispatch;
2641                                         }
2642                                 }
2643
2644                                 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2645                                         MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE,
2646                                         (uintptr_t)thread_tid(thread), thread->reason, thread->state, new_run_count, 0);
2647
2648                                 (*thread->sched_call)(SCHED_CALL_BLOCK, thread);
2649
2650                                 if (thread->wake_active) {
2651                                         thread->wake_active = FALSE;
2652                                         thread_unlock(thread);
2653
2654                                         thread_wakeup(&thread->wake_active);
2655                                 }
2656                                 else
2657                                         thread_unlock(thread);
2658
2659                                 wake_unlock(thread);
2660
2661                                 if (should_terminate)
2662                                         thread_terminate_enqueue(thread);
2663                         }
2664                 }
2665         }
2666
2667         if (!(self->state & TH_IDLE)) {
2668                 uint64_t        arg1, arg2;
2669                 int             urgency;
2670                 ast_t                   new_ast;
2671
2672                 thread_lock(self);
2673                 new_ast = sfi_thread_needs_ast(self, NULL);
2674                 thread_unlock(self);
2675
2676                 if (new_ast != AST_NONE) {
2677                         ast_on(new_ast);
2678                 }
2679
2680                 urgency = thread_get_urgency(self, &arg1, &arg2);
2681
2682                 thread_tell_urgency(urgency, arg1, arg2, self);
2683
2684                 /*
2685                  *      Get a new quantum if none remaining.
2686                  */
2687                 if (self->quantum_remaining == 0) {
2688                         thread_quantum_init(self);
2689                 }
2690
2691                 /*
2692                  *      Set up quantum timer and timeslice.
2693                  */
2694                 processor->quantum_end = processor->last_dispatch + self->quantum_remaining;
2695                 timer_call_enter1(&processor->quantum_timer, self, processor->quantum_end, TIMER_CALL_SYS_CRITICAL | TIMER_CALL_LOCAL);
2696
2697                 processor->timeslice = 1;
2698
2699                 self->computation_epoch = processor->last_dispatch;
2700         }
2701         else {
2702                 timer_call_cancel(&processor->quantum_timer);
2703                 processor->timeslice = 0;
2704
2705                 thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, NULL);
2706         }
2707 }
2708
2709 /*
2710  *      thread_block_reason:
2711  *
2712  *      Forces a reschedule, blocking the caller if a wait
2713  *      has been asserted.
2714  *
2715  *      If a continuation is specified, then thread_invoke will
2716  *      attempt to discard the thread's kernel stack.  When the
2717  *      thread resumes, it will execute the continuation function
2718  *      on a new kernel stack.
2719  */
2720 counter(mach_counter_t  c_thread_block_calls = 0;)
2721
2722 wait_result_t
2723 thread_block_reason(
2724         thread_continue_t       continuation,
2725         void                            *parameter,
2726         ast_t                           reason)
2727 {
2728         register thread_t               self = current_thread();
2729         register processor_t    processor;
2730         register thread_t               new_thread;
2731         spl_t                                   s;
2732
2733         counter(++c_thread_block_calls);
2734
2735         s = splsched();
2736
2737         processor = current_processor();
2738
2739         /* If we're explicitly yielding, force a subsequent quantum */
2740         if (reason & AST_YIELD)
2741                 processor->timeslice = 0;
2742
2743         /* We're handling all scheduling AST's */
2744         ast_off(AST_SCHEDULING);
2745
2746         self->continuation = continuation;
2747         self->parameter = parameter;
2748
2749         if (self->state & ~(TH_RUN | TH_IDLE)) {
2750                 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2751                         MACHDBG_CODE(DBG_MACH_SCHED,MACH_BLOCK),
2752                         reason, VM_KERNEL_UNSLIDE(continuation), 0, 0, 0);
2753         }
2754
2755         do {
2756                 thread_lock(self);
2757                 new_thread = thread_select(self, processor, reason);
2758                 thread_unlock(self);
2759         } while (!thread_invoke(self, new_thread, reason));
2760
2761         splx(s);
2762
2763         return (self->wait_result);
2764 }
2765
2766 /*
2767  *      thread_block:
2768  *
2769  *      Block the current thread if a wait has been asserted.
2770  */
2771 wait_result_t
2772 thread_block(
2773         thread_continue_t       continuation)
2774 {
2775         return thread_block_reason(continuation, NULL, AST_NONE);
2776 }
2777
2778 wait_result_t
2779 thread_block_parameter(
2780         thread_continue_t       continuation,
2781         void                            *parameter)
2782 {
2783         return thread_block_reason(continuation, parameter, AST_NONE);
2784 }
2785
2786 /*
2787  *      thread_run:
2788  *
2789  *      Switch directly from the current thread to the
2790  *      new thread, handing off our quantum if appropriate.
2791  *
2792  *      New thread must be runnable, and not on a run queue.
2793  *
2794  *      Called at splsched.
2795  */
2796 int
2797 thread_run(
2798         thread_t                        self,
2799         thread_continue_t       continuation,
2800         void                            *parameter,
2801         thread_t                        new_thread)
2802 {
2803         ast_t           handoff = AST_HANDOFF;
2804
2805         self->continuation = continuation;
2806         self->parameter = parameter;
2807
2808         while (!thread_invoke(self, new_thread, handoff)) {
2809                 processor_t             processor = current_processor();
2810
2811                 thread_lock(self);
2812                 new_thread = thread_select(self, processor, AST_NONE);
2813                 thread_unlock(self);
2814                 handoff = AST_NONE;
2815         }
2816
2817         return (self->wait_result);
2818 }
2819
2820 /*
2821  *      thread_continue:
2822  *
2823  *      Called at splsched when a thread first receives
2824  *      a new stack after a continuation.
2825  */
2826 void
2827 thread_continue(
2828         register thread_t       thread)
2829 {
2830         register thread_t               self = current_thread();
2831         register thread_continue_t      continuation;
2832         register void                   *parameter;
2833
2834         DTRACE_SCHED(on__cpu);
2835
2836         continuation = self->continuation;
2837         parameter = self->parameter;
2838
2839         thread_dispatch(thread, self);
2840
2841         self->continuation = self->parameter = NULL;
2842
2843         if (thread != THREAD_NULL)
2844                 (void)spllo();
2845
2846  TLOG(1, "thread_continue: calling call_continuation \n");
2847         call_continuation(continuation, parameter, self->wait_result);
2848         /*NOTREACHED*/
2849 }
2850
2851 void
2852 thread_quantum_init(thread_t thread)
2853 {
2854         if (thread->sched_mode == TH_MODE_REALTIME) {
2855                 thread->quantum_remaining = thread->realtime.computation;
2856         } else {
2857                 thread->quantum_remaining = SCHED(initial_quantum_size)(thread);
2858         }
2859 }
2860
2861 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
2862
2863 uint32_t
2864 sched_traditional_initial_quantum_size(thread_t thread)
2865 {
2866         if ((thread == THREAD_NULL) || !(thread->sched_flags & TH_SFLAG_THROTTLED))
2867                 return std_quantum;
2868         else
2869                 return bg_quantum;
2870 }
2871
2872 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
2873
2874 #if defined(CONFIG_SCHED_TRADITIONAL)
2875
2876 static sched_mode_t
2877 sched_traditional_initial_thread_sched_mode(task_t parent_task)
2878 {
2879         if (parent_task == kernel_task)
2880                 return TH_MODE_FIXED;
2881         else
2882                 return TH_MODE_TIMESHARE;
2883 }
2884
2885 #endif /* CONFIG_SCHED_TRADITIONAL */
2886
2887 /*
2888  *      run_queue_init:
2889  *
2890  *      Initialize a run queue before first use.
2891  */
2892 void
2893 run_queue_init(
2894         run_queue_t             rq)
2895 {
2896         int                             i;
2897
2898         rq->highq = IDLEPRI;
2899         for (i = 0; i < NRQBM; i++)
2900                 rq->bitmap[i] = 0;
2901         setbit(MAXPRI - IDLEPRI, rq->bitmap);
2902         rq->urgency = rq->count = 0;
2903         for (i = 0; i < NRQS; i++)
2904                 queue_init(&rq->queues[i]);
2905 }
2906
2907 #if defined(CONFIG_SCHED_FAIRSHARE_CORE)
2908 int
2909 sched_traditional_fairshare_runq_count(void)
2910 {
2911         return fs_runq.count;
2912 }
2913
2914 uint64_t
2915 sched_traditional_fairshare_runq_stats_count_sum(void)
2916 {
2917         return fs_runq.runq_stats.count_sum;
2918 }
2919
2920 void
2921 sched_traditional_fairshare_enqueue(thread_t thread)
2922 {
2923         queue_t                         queue = &fs_runq.queue;
2924
2925         simple_lock(&fs_lock);
2926
2927         enqueue_tail(queue, (queue_entry_t)thread);
2928
2929         thread->runq = FS_RUNQ;
2930         SCHED_STATS_RUNQ_CHANGE(&fs_runq.runq_stats, fs_runq.count);
2931         fs_runq.count++;
2932
2933         simple_unlock(&fs_lock);
2934 }
2935
2936 thread_t
2937 sched_traditional_fairshare_dequeue(void)
2938 {
2939         thread_t thread;
2940
2941         simple_lock(&fs_lock);
2942         if (fs_runq.count > 0) {
2943                 thread = (thread_t)dequeue_head(&fs_runq.queue);
2944
2945                 thread->runq = PROCESSOR_NULL;
2946                 SCHED_STATS_RUNQ_CHANGE(&fs_runq.runq_stats, fs_runq.count);
2947                 fs_runq.count--;
2948
2949                 simple_unlock(&fs_lock);
2950
2951                 return (thread);
2952         }
2953         simple_unlock(&fs_lock);
2954
2955         return THREAD_NULL;
2956 }
2957
2958 boolean_t
2959 sched_traditional_fairshare_queue_remove(thread_t thread)
2960 {
2961         queue_t                 q;
2962
2963         simple_lock(&fs_lock);
2964         q = &fs_runq.queue;
2965
2966         if (FS_RUNQ == thread->runq) {
2967                 remqueue((queue_entry_t)thread);
2968                 SCHED_STATS_RUNQ_CHANGE(&fs_runq.runq_stats, fs_runq.count);
2969                 fs_runq.count--;
2970
2971                 thread->runq = PROCESSOR_NULL;
2972                 simple_unlock(&fs_lock);
2973                 return (TRUE);
2974         }
2975         else {
2976                 /*
2977                  *      The thread left the run queue before we could
2978                  *      lock the run queue.
2979                  */
2980                 assert(thread->runq == PROCESSOR_NULL);
2981                 simple_unlock(&fs_lock);
2982                 return (FALSE);
2983         }
2984 }
2985
2986 #endif /* CONFIG_SCHED_FAIRSHARE_CORE */
2987
2988 /*
2989  *      run_queue_dequeue:
2990  *
2991  *      Perform a dequeue operation on a run queue,
2992  *      and return the resulting thread.
2993  *
2994  *      The run queue must be locked (see thread_run_queue_remove()
2995  *      for more info), and not empty.
2996  */
2997 thread_t
2998 run_queue_dequeue(
2999         run_queue_t             rq,
3000         integer_t               options)
3001 {
3002         thread_t                thread;
3003         queue_t                 queue = rq->queues + rq->highq;
3004
3005         if (options & SCHED_HEADQ) {
3006                 thread = (thread_t)dequeue_head(queue);
3007         }
3008         else {
3009                 thread = (thread_t)dequeue_tail(queue);
3010         }
3011
3012         thread->runq = PROCESSOR_NULL;
3013         SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
3014         rq->count--;
3015         if (SCHED(priority_is_urgent)(rq->highq)) {
3016                 rq->urgency--; assert(rq->urgency >= 0);
3017         }
3018         if (queue_empty(queue)) {
3019                 if (rq->highq != IDLEPRI)
3020                         clrbit(MAXPRI - rq->highq, rq->bitmap);
3021                 rq->highq = MAXPRI - ffsbit(rq->bitmap);
3022         }
3023
3024         return (thread);
3025 }
3026
3027 /*
3028  *      run_queue_enqueue:
3029  *
3030  *      Perform a enqueue operation on a run queue.
3031  *
3032  *      The run queue must be locked (see thread_run_queue_remove()
3033  *      for more info).
3034  */
3035 boolean_t
3036 run_queue_enqueue(
3037                                                           run_queue_t           rq,
3038                                                           thread_t                      thread,
3039                                                           integer_t             options)
3040 {
3041         queue_t                 queue = rq->queues + thread->sched_pri;
3042         boolean_t               result = FALSE;
3043
3044         if (queue_empty(queue)) {
3045                 enqueue_tail(queue, (queue_entry_t)thread);
3046
3047                 setbit(MAXPRI - thread->sched_pri, rq->bitmap);
3048                 if (thread->sched_pri > rq->highq) {
3049                         rq->highq = thread->sched_pri;
3050                         result = TRUE;
3051                 }
3052         } else {
3053                 if (options & SCHED_TAILQ)
3054                         enqueue_tail(queue, (queue_entry_t)thread);
3055                 else
3056                         enqueue_head(queue, (queue_entry_t)thread);
3057         }
3058         if (SCHED(priority_is_urgent)(thread->sched_pri))
3059                 rq->urgency++;
3060         SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
3061         rq->count++;
3062
3063         return (result);
3064
3065 }
3066
3067 /*
3068  *      run_queue_remove:
3069  *
3070  *      Remove a specific thread from a runqueue.
3071  *
3072  *      The run queue must be locked.
3073  */
3074 void
3075 run_queue_remove(
3076                                   run_queue_t           rq,
3077                                   thread_t                      thread)
3078 {
3079
3080         remqueue((queue_entry_t)thread);
3081         SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
3082         rq->count--;
3083         if (SCHED(priority_is_urgent)(thread->sched_pri)) {
3084                 rq->urgency--; assert(rq->urgency >= 0);
3085         }
3086
3087         if (queue_empty(rq->queues + thread->sched_pri)) {
3088                 /* update run queue status */
3089                 if (thread->sched_pri != IDLEPRI)
3090                         clrbit(MAXPRI - thread->sched_pri, rq->bitmap);
3091                 rq->highq = MAXPRI - ffsbit(rq->bitmap);
3092         }
3093
3094         thread->runq = PROCESSOR_NULL;
3095 }
3096
3097 /*
3098  *      fairshare_setrun:
3099  *
3100  *      Dispatch a thread for round-robin execution.
3101  *
3102  *      Thread must be locked.  Associated pset must
3103  *      be locked, and is returned unlocked.
3104  */
3105 static void
3106 fairshare_setrun(
3107                                   processor_t                   processor,
3108                                   thread_t                      thread)
3109 {
3110         processor_set_t         pset = processor->processor_set;
3111
3112         thread->chosen_processor = processor;
3113
3114         SCHED(fairshare_enqueue)(thread);
3115
3116         pset_unlock(pset);
3117
3118         if (processor != current_processor())
3119                 machine_signal_idle(processor);
3120
3121
3122 }
3123
3124 /*
3125  *      realtime_queue_insert:
3126  *
3127  *      Enqueue a thread for realtime execution.
3128  */
3129 static boolean_t
3130 realtime_queue_insert(
3131         thread_t                        thread)
3132 {
3133         queue_t                         queue = &rt_runq.queue;
3134         uint64_t                        deadline = thread->realtime.deadline;
3135         boolean_t                       preempt = FALSE;
3136
3137         simple_lock(&rt_lock);
3138
3139         if (queue_empty(queue)) {
3140                 enqueue_tail(queue, (queue_entry_t)thread);
3141                 preempt = TRUE;
3142         }
3143         else {
3144                 register thread_t       entry = (thread_t)queue_first(queue);
3145
3146                 while (TRUE) {
3147                         if (    queue_end(queue, (queue_entry_t)entry)  ||
3148                                                 deadline < entry->realtime.deadline             ) {
3149                                 entry = (thread_t)queue_prev((queue_entry_t)entry);
3150                                 break;
3151                         }
3152
3153                         entry = (thread_t)queue_next((queue_entry_t)entry);
3154                 }
3155
3156                 if ((queue_entry_t)entry == queue)
3157                         preempt = TRUE;
3158
3159                 insque((queue_entry_t)thread, (queue_entry_t)entry);
3160         }
3161
3162         thread->runq = RT_RUNQ;
3163         SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count);
3164         rt_runq.count++;
3165
3166         simple_unlock(&rt_lock);
3167
3168         return (preempt);
3169 }
3170
3171 /*
3172  *      realtime_setrun:
3173  *
3174  *      Dispatch a thread for realtime execution.
3175  *
3176  *      Thread must be locked.  Associated pset must
3177  *      be locked, and is returned unlocked.
3178  */
3179 static void
3180 realtime_setrun(
3181         processor_t                     processor,
3182         thread_t                        thread)
3183 {
3184         processor_set_t         pset = processor->processor_set;
3185         ast_t                           preempt;
3186
3187         boolean_t do_signal_idle = FALSE, do_cause_ast = FALSE;
3188
3189         thread->chosen_processor = processor;
3190
3191         /* <rdar://problem/15102234> */
3192         assert(thread->bound_processor == PROCESSOR_NULL);
3193
3194         /*
3195          *      Dispatch directly onto idle processor.
3196          */
3197         if ( (thread->bound_processor == processor)
3198                 && processor->state == PROCESSOR_IDLE) {
3199                 remqueue((queue_entry_t)processor);
3200                 enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
3201
3202                 processor->next_thread = thread;
3203                 processor->current_pri = thread->sched_pri;
3204                 processor->current_thmode = thread->sched_mode;
3205                 processor->current_sfi_class = thread->sfi_class;
3206                 processor->deadline = thread->realtime.deadline;
3207                 processor->state = PROCESSOR_DISPATCHING;
3208
3209                 if (processor != current_processor()) {
3210                         if (!(pset->pending_AST_cpu_mask & (1U << processor->cpu_id))) {
3211                                 /* cleared on exit from main processor_idle() loop */
3212                                 pset->pending_AST_cpu_mask |= (1U << processor->cpu_id);
3213                                 do_signal_idle = TRUE;
3214                         }
3215                 }
3216                 pset_unlock(pset);
3217
3218                 if (do_signal_idle) {
3219                         machine_signal_idle(processor);
3220                 }
3221                 return;
3222         }
3223
3224         if (processor->current_pri < BASEPRI_RTQUEUES)
3225                 preempt = (AST_PREEMPT | AST_URGENT);
3226         else if (thread->realtime.deadline < processor->deadline)
3227                 preempt = (AST_PREEMPT | AST_URGENT);
3228         else
3229                 preempt = AST_NONE;
3230
3231         realtime_queue_insert(thread);
3232
3233         if (preempt != AST_NONE) {
3234                 if (processor->state == PROCESSOR_IDLE) {
3235                         remqueue((queue_entry_t)processor);
3236                         enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
3237                         processor->next_thread = THREAD_NULL;
3238                         processor->current_pri = thread->sched_pri;
3239                         processor->current_thmode = thread->sched_mode;
3240                         processor->current_sfi_class = thread->sfi_class;
3241                         processor->deadline = thread->realtime.deadline;
3242                         processor->state = PROCESSOR_DISPATCHING;
3243                         if (processor == current_processor()) {
3244                                 ast_on(preempt);
3245                         } else {
3246                                 if (!(pset->pending_AST_cpu_mask & (1U << processor->cpu_id))) {
3247                                         /* cleared on exit from main processor_idle() loop */
3248                                         pset->pending_AST_cpu_mask |= (1U << processor->cpu_id);
3249                                         do_signal_idle = TRUE;
3250                                 }
3251                         }
3252                 } else if (processor->state == PROCESSOR_DISPATCHING) {
3253                         if ((processor->next_thread == THREAD_NULL) && ((processor->current_pri < thread->sched_pri) || (processor->deadline > thread->realtime.deadline))) {
3254                                 processor->current_pri = thread->sched_pri;
3255                                 processor->current_thmode = thread->sched_mode;
3256                                 processor->current_sfi_class = thread->sfi_class;
3257                                 processor->deadline = thread->realtime.deadline;
3258                         }
3259                 } else {
3260                         if (processor == current_processor()) {
3261                                 ast_on(preempt);
3262                         } else {
3263                                 if (!(pset->pending_AST_cpu_mask & (1U << processor->cpu_id))) {
3264                                         /* cleared after IPI causes csw_check() to be called */
3265                                         pset->pending_AST_cpu_mask |= (1U << processor->cpu_id);
3266                                         do_cause_ast = TRUE;
3267                                 }
3268                         }
3269                 }
3270         } else {
3271                 /* Selected processor was too busy, just keep thread enqueued and let other processors drain it naturally. */
3272         }
3273
3274         pset_unlock(pset);
3275
3276         if (do_signal_idle) {
3277                 machine_signal_idle(processor);
3278         } else if (do_cause_ast) {
3279                 cause_ast_check(processor);
3280         }
3281 }
3282
3283
3284 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
3285
3286 boolean_t
3287 priority_is_urgent(int priority)
3288 {
3289         return testbit(priority, sched_preempt_pri) ? TRUE : FALSE;
3290 }
3291
3292 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
3293
3294 #if defined(CONFIG_SCHED_TRADITIONAL)
3295 /*
3296  *      processor_enqueue:
3297  *
3298  *      Enqueue thread on a processor run queue.  Thread must be locked,
3299  *      and not already be on a run queue.
3300  *
3301  *      Returns TRUE if a preemption is indicated based on the state
3302  *      of the run queue.
3303  *
3304  *      The run queue must be locked (see thread_run_queue_remove()
3305  *      for more info).
3306  */
3307 static boolean_t
3308 processor_enqueue(
3309         processor_t             processor,
3310         thread_t                thread,
3311         integer_t               options)
3312 {
3313         run_queue_t             rq = runq_for_processor(processor);
3314         boolean_t               result;
3315
3316         result = run_queue_enqueue(rq, thread, options);
3317         thread->runq = processor;
3318         runq_consider_incr_bound_count(processor, thread);
3319
3320         return (result);
3321 }
3322
3323 #endif /* CONFIG_SCHED_TRADITIONAL */
3324
3325 /*
3326  *      processor_setrun:
3327  *
3328  *      Dispatch a thread for execution on a
3329  *      processor.
3330  *
3331  *      Thread must be locked.  Associated pset must
3332  *      be locked, and is returned unlocked.
3333  */
3334 static void
3335 processor_setrun(
3336         processor_t                     processor,
3337         thread_t                        thread,
3338         integer_t                       options)
3339 {
3340         processor_set_t         pset = processor->processor_set;
3341         ast_t                           preempt;
3342         enum { eExitIdle, eInterruptRunning, eDoNothing } ipi_action = eDoNothing;
3343
3344         boolean_t do_signal_idle = FALSE, do_cause_ast = FALSE;
3345
3346         thread->chosen_processor = processor;
3347
3348         /*
3349          *      Dispatch directly onto idle processor.
3350          */
3351         if ( (SCHED(direct_dispatch_to_idle_processors) ||
3352                   thread->bound_processor == processor)
3353                 && processor->state == PROCESSOR_IDLE) {
3354                 remqueue((queue_entry_t)processor);
3355                 enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
3356
3357                 processor->next_thread = thread;
3358                 processor->current_pri = thread->sched_pri;
3359                 processor->current_thmode = thread->sched_mode;
3360                 processor->current_sfi_class = thread->sfi_class;
3361                 processor->deadline = UINT64_MAX;
3362                 processor->state = PROCESSOR_DISPATCHING;
3363
3364                 if (!(pset->pending_AST_cpu_mask & (1U << processor->cpu_id))) {
3365                         /* cleared on exit from main processor_idle() loop */
3366                         pset->pending_AST_cpu_mask |= (1U << processor->cpu_id);
3367                         do_signal_idle = TRUE;
3368                 }
3369
3370                 pset_unlock(pset);
3371                 if (do_signal_idle) {
3372                         machine_signal_idle(processor);
3373                 }
3374
3375                 return;
3376         }
3377
3378         /*
3379          *      Set preemption mode.
3380          */
3381         if (SCHED(priority_is_urgent)(thread->sched_pri) && thread->sched_pri > processor->current_pri)
3382                 preempt = (AST_PREEMPT | AST_URGENT);
3383         else if(processor->active_thread && thread_eager_preemption(processor->active_thread))
3384                 preempt = (AST_PREEMPT | AST_URGENT);
3385         else if ((thread->sched_mode == TH_MODE_TIMESHARE) && (thread->sched_pri < thread->priority)) {
3386                 if(SCHED(priority_is_urgent)(thread->priority) && thread->sched_pri > processor->current_pri) {
3387                         preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
3388                 } else {
3389                         preempt = AST_NONE;
3390                 }
3391         } else
3392                 preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
3393
3394         SCHED(processor_enqueue)(processor, thread, options);
3395
3396         if (preempt != AST_NONE) {
3397                 if (processor->state == PROCESSOR_IDLE) {
3398                         remqueue((queue_entry_t)processor);
3399                         enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
3400                         processor->next_thread = THREAD_NULL;
3401                         processor->current_pri = thread->sched_pri;
3402                         processor->current_thmode = thread->sched_mode;
3403                         processor->current_sfi_class = thread->sfi_class;
3404                         processor->deadline = UINT64_MAX;
3405                         processor->state = PROCESSOR_DISPATCHING;
3406
3407                         ipi_action = eExitIdle;
3408                 } else if ( processor->state == PROCESSOR_DISPATCHING) {
3409                         if ((processor->next_thread == THREAD_NULL) && (processor->current_pri < thread->sched_pri)) {
3410                                 processor->current_pri = thread->sched_pri;
3411                                 processor->current_thmode = thread->sched_mode;
3412                                 processor->current_sfi_class = thread->sfi_class;
3413                                 processor->deadline = UINT64_MAX;
3414                         }
3415                 } else if (     (processor->state == PROCESSOR_RUNNING          ||
3416                                  processor->state == PROCESSOR_SHUTDOWN)                &&
3417                                 (thread->sched_pri >= processor->current_pri    ||
3418                                 processor->current_thmode == TH_MODE_FAIRSHARE)) {
3419                         ipi_action = eInterruptRunning;
3420                 }
3421         } else {
3422                 /*
3423                  * New thread is not important enough to preempt what is running, but
3424                  * special processor states may need special handling
3425                  */
3426                 if (processor->state == PROCESSOR_SHUTDOWN              &&
3427                         thread->sched_pri >= processor->current_pri     ) {
3428                         ipi_action = eInterruptRunning;
3429                 } else if (     processor->state == PROCESSOR_IDLE      &&
3430                                         processor != current_processor()        ) {
3431                         remqueue((queue_entry_t)processor);
3432                         enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
3433                         processor->next_thread = THREAD_NULL;
3434                         processor->current_pri = thread->sched_pri;
3435                         processor->current_thmode = thread->sched_mode;
3436                         processor->current_sfi_class = thread->sfi_class;
3437                         processor->deadline = UINT64_MAX;
3438                         processor->state = PROCESSOR_DISPATCHING;
3439
3440                         ipi_action = eExitIdle;
3441                 }
3442         }
3443
3444         switch (ipi_action) {
3445                 case eDoNothing:
3446                         break;
3447                 case eExitIdle:
3448                         if (processor == current_processor()) {
3449                                 if (csw_check_locked(processor, pset, AST_NONE) != AST_NONE)
3450                                         ast_on(preempt);
3451                         } else {
3452                                 if (!(pset->pending_AST_cpu_mask & (1U << processor->cpu_id))) {
3453                                         /* cleared on exit from main processor_idle() loop */
3454                                         pset->pending_AST_cpu_mask |= (1U << processor->cpu_id);
3455                                         do_signal_idle = TRUE;
3456                                 }
3457                         }
3458                         break;
3459                 case eInterruptRunning:
3460                         if (processor == current_processor()) {
3461                                 if (csw_check_locked(processor, pset, AST_NONE) != AST_NONE)
3462                                         ast_on(preempt);
3463                         } else {
3464                                 if (!(pset->pending_AST_cpu_mask & (1U << processor->cpu_id))) {
3465                                         /* cleared after IPI causes csw_check() to be called */
3466                                         pset->pending_AST_cpu_mask |= (1U << processor->cpu_id);
3467                                         do_cause_ast = TRUE;
3468                                 }
3469                         }
3470                         break;
3471         }
3472
3473         pset_unlock(pset);
3474
3475         if (do_signal_idle) {
3476                 machine_signal_idle(processor);
3477         } else if (do_cause_ast) {
3478                 cause_ast_check(processor);
3479         }
3480 }
3481
3482 #if defined(CONFIG_SCHED_TRADITIONAL)
3483
3484 static boolean_t
3485 processor_queue_empty(processor_t               processor)
3486 {
3487         return runq_for_processor(processor)->count == 0;
3488
3489 }
3490
3491 static boolean_t
3492 sched_traditional_with_pset_runqueue_processor_queue_empty(processor_t          processor)
3493 {
3494         processor_set_t pset = processor->processor_set;
3495         int count = runq_for_processor(processor)->count;
3496
3497         /*
3498          * The pset runq contains the count of all runnable threads
3499          * for all processors in the pset. However, for threads that
3500          * are bound to another processor, the current "processor"
3501          * is not eligible to execute the thread. So we only
3502          * include bound threads that our bound to the current
3503          * "processor". This allows the processor to idle when the
3504          * count of eligible threads drops to 0, even if there's
3505          * a runnable thread bound to a different processor in the
3506          * shared runq.
3507          */
3508
3509         count -= pset->pset_runq_bound_count;
3510         count += processor->runq_bound_count;
3511
3512         return count == 0;
3513 }
3514
3515 static ast_t
3516 processor_csw_check(processor_t processor)
3517 {
3518         run_queue_t             runq;
3519         boolean_t               has_higher;
3520
3521         assert(processor->active_thread != NULL);
3522
3523         runq = runq_for_processor(processor);
3524         if (first_timeslice(processor)) {
3525                 has_higher = (runq->highq > processor->current_pri);
3526         } else {
3527                 has_higher = (runq->highq >= processor->current_pri);
3528         }
3529         if (has_higher) {
3530                 if (runq->urgency > 0)
3531                         return (AST_PREEMPT | AST_URGENT);
3532
3533                 if (processor->active_thread && thread_eager_preemption(processor->active_thread))
3534                         return (AST_PREEMPT | AST_URGENT);
3535
3536                 return AST_PREEMPT;
3537         }
3538
3539         return AST_NONE;
3540 }
3541
3542 static boolean_t
3543 processor_queue_has_priority(processor_t                processor,
3544                                                          int                            priority,
3545                                                          boolean_t                      gte)
3546 {
3547         if (gte)
3548                 return runq_for_processor(processor)->highq >= priority;
3549         else
3550                 return runq_for_processor(processor)->highq > priority;
3551 }
3552
3553 static boolean_t
3554 should_current_thread_rechoose_processor(processor_t                    processor)
3555 {
3556         return (processor->current_pri < BASEPRI_RTQUEUES
3557                         && processor->processor_primary != processor);
3558 }
3559
3560 static int
3561 sched_traditional_processor_runq_count(processor_t   processor)
3562 {
3563         return runq_for_processor(processor)->count;
3564 }
3565
3566 static uint64_t
3567 sched_traditional_processor_runq_stats_count_sum(processor_t   processor)
3568 {
3569         return runq_for_processor(processor)->runq_stats.count_sum;
3570 }
3571
3572 static uint64_t
3573 sched_traditional_with_pset_runqueue_processor_runq_stats_count_sum(processor_t   processor)
3574 {
3575         if (processor->cpu_id == processor->processor_set->cpu_set_low)
3576                 return runq_for_processor(processor)->runq_stats.count_sum;
3577         else
3578                 return 0ULL;
3579 }
3580
3581 static int
3582 sched_traditional_processor_bound_count(processor_t   processor)
3583 {
3584         return processor->runq_bound_count;
3585 }
3586
3587 #endif /* CONFIG_SCHED_TRADITIONAL */
3588
3589 /*
3590  *      choose_next_pset:
3591  *
3592  *      Return the next sibling pset containing
3593  *      available processors.
3594  *
3595  *      Returns the original pset if none other is
3596  *      suitable.
3597  */
3598 static processor_set_t
3599 choose_next_pset(
3600         processor_set_t         pset)
3601 {
3602         processor_set_t         nset = pset;
3603
3604         do {
3605                 nset = next_pset(nset);
3606         } while (nset->online_processor_count < 1 && nset != pset);
3607
3608         return (nset);
3609 }
3610
3611 /*
3612  *      choose_processor:
3613  *
3614  *      Choose a processor for the thread, beginning at
3615  *      the pset.  Accepts an optional processor hint in
3616  *      the pset.
3617  *
3618  *      Returns a processor, possibly from a different pset.
3619  *
3620  *      The thread must be locked.  The pset must be locked,
3621  *      and the resulting pset is locked on return.
3622  */
3623 processor_t
3624 choose_processor(
3625         processor_set_t         pset,
3626         processor_t                     processor,
3627         thread_t                        thread)
3628 {
3629         processor_set_t         nset, cset = pset;
3630
3631         /*
3632          * Prefer the hinted processor, when appropriate.
3633          */
3634
3635         /* Fold last processor hint from secondary processor to its primary */
3636         if (processor != PROCESSOR_NULL) {
3637                 processor = processor->processor_primary;
3638         }
3639
3640         /*
3641          * Only consult platform layer if pset is active, which
3642          * it may not be in some cases when a multi-set system
3643          * is going to sleep.
3644          */
3645         if (pset->online_processor_count) {
3646                 if ((processor == PROCESSOR_NULL) || (processor->processor_set == pset && processor->state == PROCESSOR_IDLE)) {
3647                         processor_t mc_processor = machine_choose_processor(pset, processor);
3648                         if (mc_processor != PROCESSOR_NULL)
3649                                 processor = mc_processor->processor_primary;
3650                 }
3651         }
3652
3653         /*
3654          * At this point, we may have a processor hint, and we may have
3655          * an initial starting pset. If the hint is not in the pset, or
3656          * if the hint is for a processor in an invalid state, discard
3657          * the hint.
3658          */
3659         if (processor != PROCESSOR_NULL) {
3660                 if (processor->processor_set != pset) {
3661                         processor = PROCESSOR_NULL;
3662                 } else {
3663                         switch (processor->state) {
3664                                 case PROCESSOR_START:
3665                                 case PROCESSOR_SHUTDOWN:
3666                                 case PROCESSOR_OFF_LINE:
3667                                         /*
3668                                          * Hint is for a processor that cannot support running new threads.
3669                                          */
3670                                         processor = PROCESSOR_NULL;
3671                                         break;
3672                                 case PROCESSOR_IDLE:
3673                                         /*
3674                                          * Hint is for an idle processor. Assume it is no worse than any other
3675                                          * idle processor. The platform layer had an opportunity to provide
3676                                          * the "least cost idle" processor above.
3677                                          */
3678                                         return (processor);
3679                                         break;
3680                                 case PROCESSOR_RUNNING:
3681                                 case PROCESSOR_DISPATCHING:
3682                                         /*
3683                                          * Hint is for an active CPU. This fast-path allows
3684                                          * realtime threads to preempt non-realtime threads
3685                                          * to regain their previous executing processor.
3686                                          */
3687                                         if ((thread->sched_pri >= BASEPRI_RTQUEUES) &&
3688                                                 (processor->current_pri < BASEPRI_RTQUEUES))
3689                                                 return (processor);
3690
3691                                         /* Otherwise, use hint as part of search below */
3692                                         break;
3693                                 default:
3694                                         processor = PROCESSOR_NULL;
3695                                         break;
3696                         }
3697                 }
3698         }
3699
3700         /*
3701          * Iterate through the processor sets to locate
3702          * an appropriate processor. Seed results with
3703          * a last-processor hint, if available, so that
3704          * a search must find something strictly better
3705          * to replace it.
3706          *
3707          * A primary/secondary pair of SMT processors are
3708          * "unpaired" if the primary is busy but its
3709          * corresponding secondary is idle (so the physical
3710          * core has full use of its resources).
3711          */
3712
3713         integer_t lowest_priority = MAXPRI + 1;
3714         integer_t lowest_unpaired_primary_priority = MAXPRI + 1;
3715         integer_t lowest_count = INT_MAX;
3716         uint64_t  furthest_deadline = 1;
3717         processor_t lp_processor = PROCESSOR_NULL;
3718         processor_t lp_unpaired_primary_processor = PROCESSOR_NULL;
3719         processor_t lp_unpaired_secondary_processor = PROCESSOR_NULL;
3720         processor_t lc_processor = PROCESSOR_NULL;
3721         processor_t fd_processor = PROCESSOR_NULL;
3722
3723         if (processor != PROCESSOR_NULL) {
3724                 /* All other states should be enumerated above. */
3725                 assert(processor->state == PROCESSOR_RUNNING || processor->state == PROCESSOR_DISPATCHING);
3726
3727                 lowest_priority = processor->current_pri;
3728                 lp_processor = processor;
3729
3730                 if (processor->current_pri >= BASEPRI_RTQUEUES) {
3731                         furthest_deadline = processor->deadline;
3732                         fd_processor = processor;
3733                 }
3734
3735                 lowest_count = SCHED(processor_runq_count)(processor);
3736                 lc_processor = processor;
3737         }
3738
3739         do {
3740
3741                 /*
3742                  * Choose an idle processor, in pset traversal order
3743                  */
3744                 if (!queue_empty(&cset->idle_queue))
3745                         return ((processor_t)queue_first(&cset->idle_queue));
3746
3747                 /*
3748                  * Otherwise, enumerate active and idle processors to find candidates
3749                  * with lower priority/etc.
3750                  */
3751
3752                 processor = (processor_t)queue_first(&cset->active_queue);
3753                 while (!queue_end(&cset->active_queue, (queue_entry_t)processor)) {
3754
3755                         integer_t cpri = processor->current_pri;
3756                         if (cpri < lowest_priority) {
3757                                 lowest_priority = cpri;
3758                                 lp_processor = processor;
3759                         }
3760
3761                         if ((cpri >= BASEPRI_RTQUEUES) && (processor->deadline > furthest_deadline)) {
3762                                 furthest_deadline = processor->deadline;
3763                                 fd_processor = processor;
3764                         }
3765
3766                         integer_t ccount = SCHED(processor_runq_count)(processor);
3767                         if (ccount < lowest_count) {
3768                                 lowest_count = ccount;
3769                                 lc_processor = processor;
3770                         }
3771
3772                         processor = (processor_t)queue_next((queue_entry_t)processor);
3773                 }
3774
3775                 /*
3776                  * For SMT configs, these idle secondary processors must have active primary. Otherwise
3777                  * the idle primary would have short-circuited the loop above
3778                  */
3779                 processor = (processor_t)queue_first(&cset->idle_secondary_queue);
3780                 while (!queue_end(&cset->idle_secondary_queue, (queue_entry_t)processor)) {
3781                         processor_t cprimary = processor->processor_primary;
3782
3783                         /* If the primary processor is offline or starting up, it's not a candidate for this path */
3784                         if (cprimary->state == PROCESSOR_RUNNING || cprimary->state == PROCESSOR_DISPATCHING) {
3785                                 integer_t primary_pri = cprimary->current_pri;
3786
3787                                 if (primary_pri < lowest_unpaired_primary_priority) {
3788                                         lowest_unpaired_primary_priority = primary_pri;
3789                                         lp_unpaired_primary_processor = cprimary;
3790                                         lp_unpaired_secondary_processor = processor;
3791                                 }
3792                         }
3793
3794                         processor = (processor_t)queue_next((queue_entry_t)processor);
3795                 }
3796
3797
3798                 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
3799
3800                         /*
3801                          * For realtime threads, the most important aspect is
3802                          * scheduling latency, so we attempt to assign threads
3803                          * to good preemption candidates (assuming an idle primary
3804                          * processor was not available above).
3805                          */
3806
3807                         if (thread->sched_pri > lowest_unpaired_primary_priority) {
3808                                 /* Move to end of active queue so that the next thread doesn't also pick it */
3809                                 remqueue((queue_entry_t)lp_unpaired_primary_processor);
3810                                 enqueue_tail(&cset->active_queue, (queue_entry_t)lp_unpaired_primary_processor);
3811                                 return lp_unpaired_primary_processor;
3812                         }
3813                         if (thread->sched_pri > lowest_priority) {
3814                                 /* Move to end of active queue so that the next thread doesn't also pick it */
3815                                 remqueue((queue_entry_t)lp_processor);
3816                                 enqueue_tail(&cset->active_queue, (queue_entry_t)lp_processor);
3817                                 return lp_processor;
3818                         }
3819                         if (thread->realtime.deadline < furthest_deadline)
3820                                 return fd_processor;
3821
3822                         /*
3823                          * If all primary and secondary CPUs are busy with realtime
3824                          * threads with deadlines earlier than us, move on to next
3825                          * pset.
3826                          */
3827                 }
3828                 else {
3829
3830                         if (thread->sched_pri > lowest_unpaired_primary_priority) {
3831                                 /* Move to end of active queue so that the next thread doesn't also pick it */
3832                                 remqueue((queue_entry_t)lp_unpaired_primary_processor);
3833                                 enqueue_tail(&cset->active_queue, (queue_entry_t)lp_unpaired_primary_processor);
3834                                 return lp_unpaired_primary_processor;
3835                         }
3836                         if (thread->sched_pri > lowest_priority) {
3837                                 /* Move to end of active queue so that the next thread doesn't also pick it */
3838                                 remqueue((queue_entry_t)lp_processor);
3839                                 enqueue_tail(&cset->active_queue, (queue_entry_t)lp_processor);
3840                                 return lp_processor;
3841                         }
3842
3843                         /*
3844                          * If all primary processor in this pset are running a higher
3845                          * priority thread, move on to next pset. Only when we have
3846                          * exhausted this search do we fall back to other heuristics.
3847                          */
3848                 }
3849
3850                 /*
3851                  * Move onto the next processor set.
3852                  */
3853                 nset = next_pset(cset);
3854
3855                 if (nset != pset) {
3856                         pset_unlock(cset);
3857
3858                         cset = nset;
3859                         pset_lock(cset);
3860                 }
3861         } while (nset != pset);
3862
3863         /*
3864          * Make sure that we pick a running processor,
3865          * and that the correct processor set is locked.
3866          * Since we may have unlock the candidate processor's
3867          * pset, it may have changed state.
3868          *
3869          * All primary processors are running a higher priority
3870          * thread, so the only options left are enqueuing on
3871          * the secondary processor that would perturb the least priority
3872          * primary, or the least busy primary.
3873          */
3874         do {
3875
3876                 /* lowest_priority is evaluated in the main loops above */
3877                 if (lp_unpaired_secondary_processor != PROCESSOR_NULL) {
3878                         processor = lp_unpaired_secondary_processor;
3879                         lp_unpaired_secondary_processor = PROCESSOR_NULL;
3880                 } else if (lc_processor != PROCESSOR_NULL) {
3881                         processor = lc_processor;
3882                         lc_processor = PROCESSOR_NULL;
3883                 } else {
3884                         /*
3885                          * All processors are executing higher
3886                          * priority threads, and the lowest_count
3887                          * candidate was not usable
3888                          */
3889                         processor = master_processor;
3890                 }
3891
3892                 /*
3893                  * Check that the correct processor set is
3894                  * returned locked.
3895                  */
3896                 if (cset != processor->processor_set) {
3897                         pset_unlock(cset);
3898                         cset = processor->processor_set;
3899                         pset_lock(cset);
3900                 }
3901
3902                 /*
3903                  * We must verify that the chosen processor is still available.
3904                  * master_processor is an exception, since we may need to preempt
3905                  * a running thread on it during processor shutdown (for sleep),
3906                  * and that thread needs to be enqueued on its runqueue to run
3907                  * when the processor is restarted.
3908                  */
3909                 if (processor != master_processor && (processor->state == PROCESSOR_SHUTDOWN || processor->state == PROCESSOR_OFF_LINE))
3910                         processor = PROCESSOR_NULL;
3911
3912         } while (processor == PROCESSOR_NULL);
3913
3914         return (processor);
3915 }
3916
3917 /*
3918  *      thread_setrun:
3919  *
3920  *      Dispatch thread for execution, onto an idle
3921  *      processor or run queue, and signal a preemption
3922  *      as appropriate.
3923  *
3924  *      Thread must be locked.
3925  */
3926 void
3927 thread_setrun(
3928         thread_t                        thread,
3929         integer_t                       options)
3930 {
3931         processor_t                     processor;
3932         processor_set_t         pset;
3933
3934         assert(thread_runnable(thread));
3935
3936         /*
3937          *      Update priority if needed.
3938          */
3939         if (SCHED(can_update_priority)(thread))
3940                 SCHED(update_priority)(thread);
3941
3942         thread->sfi_class = sfi_thread_classify(thread);
3943
3944         assert(thread->runq == PROCESSOR_NULL);
3945
3946         if (thread->bound_processor == PROCESSOR_NULL) {
3947                 /*
3948                  *      Unbound case.
3949                  */
3950                 if (thread->affinity_set != AFFINITY_SET_NULL) {
3951                         /*
3952                          * Use affinity set policy hint.
3953                          */
3954                         pset = thread->affinity_set->aset_pset;
3955                         pset_lock(pset);
3956
3957                         processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread);
3958
3959                         KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
3960                                                                           (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
3961                 }
3962                 else
3963                 if (thread->last_processor != PROCESSOR_NULL) {
3964                         /*
3965                          *      Simple (last processor) affinity case.
3966                          */
3967                         processor = thread->last_processor;
3968                         pset = processor->processor_set;
3969                         pset_lock(pset);
3970                         processor = SCHED(choose_processor)(pset, processor, thread);
3971
3972                         KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
3973                                                                   (uintptr_t)thread_tid(thread), thread->last_processor->cpu_id, processor->cpu_id, processor->state, 0);
3974                 }
3975                 else {
3976                         /*
3977                          *      No Affinity case:
3978                          *
3979                          *      Utilitize a per task hint to spread threads
3980                          *      among the available processor sets.
3981                          */
3982                         task_t          task = thread->task;
3983
3984                         pset = task->pset_hint;
3985                         if (pset == PROCESSOR_SET_NULL)
3986                                 pset = current_processor()->processor_set;
3987
3988                         pset = choose_next_pset(pset);
3989                         pset_lock(pset);
3990
3991                         processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread);
3992                         task->pset_hint = processor->processor_set;
3993
3994                         KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
3995                                                                           (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
3996                 }
3997         }
3998         else {
3999                 /*
4000                  *      Bound case:
4001                  *
4002                  *      Unconditionally dispatch on the processor.
4003                  */
4004                 processor = thread->bound_processor;
4005                 pset = processor->processor_set;
4006                 pset_lock(pset);
4007
4008                 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
4009                                                           (uintptr_t)thread_tid(thread), (uintptr_t)-2, processor->cpu_id, processor->state, 0);
4010         }
4011
4012         /*
4013          *      Dispatch the thread on the choosen processor.
4014          *      TODO: This should be based on sched_mode, not sched_pri
4015          */
4016         if (thread->sched_pri >= BASEPRI_RTQUEUES)
4017                 realtime_setrun(processor, thread);
4018         else if (thread->sched_mode == TH_MODE_FAIRSHARE)
4019                 fairshare_setrun(processor, thread);
4020         else
4021                 processor_setrun(processor, thread, options);
4022 }
4023
4024 processor_set_t
4025 task_choose_pset(
4026         task_t          task)
4027 {
4028         processor_set_t         pset = task->pset_hint;
4029
4030         if (pset != PROCESSOR_SET_NULL)
4031                 pset = choose_next_pset(pset);
4032
4033         return (pset);
4034 }
4035
4036 #if defined(CONFIG_SCHED_TRADITIONAL)
4037
4038 /*
4039  *      processor_queue_shutdown:
4040  *
4041  *      Shutdown a processor run queue by
4042  *      re-dispatching non-bound threads.
4043  *
4044  *      Associated pset must be locked, and is
4045  *      returned unlocked.
4046  */
4047 void
4048 processor_queue_shutdown(
4049         processor_t                     processor)
4050 {
4051         processor_set_t         pset = processor->processor_set;
4052         run_queue_t                     rq = runq_for_processor(processor);
4053         queue_t                         queue = rq->queues + rq->highq;
4054         int                                     pri = rq->highq, count = rq->count;
4055         thread_t                        next, thread;
4056         queue_head_t            tqueue;
4057
4058         queue_init(&tqueue);
4059
4060         while (count > 0) {
4061                 thread = (thread_t)queue_first(queue);
4062                 while (!queue_end(queue, (queue_entry_t)thread)) {
4063                         next = (thread_t)queue_next((queue_entry_t)thread);
4064
4065                         if (thread->bound_processor == PROCESSOR_NULL) {
4066                                 remqueue((queue_entry_t)thread);
4067
4068                                 thread->runq = PROCESSOR_NULL;
4069                                 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
4070                                 runq_consider_decr_bound_count(processor, thread);
4071                                 rq->count--;
4072                                 if (SCHED(priority_is_urgent)(pri)) {
4073                                         rq->urgency--; assert(rq->urgency >= 0);
4074                                 }
4075                                 if (queue_empty(queue)) {
4076                                         if (pri != IDLEPRI)
4077                                                 clrbit(MAXPRI - pri, rq->bitmap);
4078                                         rq->highq = MAXPRI - ffsbit(rq->bitmap);
4079                                 }
4080
4081                                 enqueue_tail(&tqueue, (queue_entry_t)thread);
4082                         }
4083                         count--;
4084
4085                         thread = next;
4086                 }
4087
4088                 queue--; pri--;
4089         }
4090
4091         pset_unlock(pset);
4092
4093         while ((thread = (thread_t)dequeue_head(&tqueue)) != THREAD_NULL) {
4094                 thread_lock(thread);
4095
4096                 thread_setrun(thread, SCHED_TAILQ);
4097
4098                 thread_unlock(thread);
4099         }
4100 }
4101
4102 #endif /* CONFIG_SCHED_TRADITIONAL */
4103
4104 /*
4105  *      Check for a preemption point in
4106  *      the current context.
4107  *
4108  *      Called at splsched with thread locked.
4109  */
4110 ast_t
4111 csw_check(
4112         processor_t             processor,
4113         ast_t                   check_reason)
4114 {
4115         processor_set_t pset = processor->processor_set;
4116         ast_t                   result;
4117
4118         pset_lock(pset);
4119
4120         /* If we were sent a remote AST and interrupted a running processor, acknowledge it here with pset lock held */
4121         pset->pending_AST_cpu_mask &= ~(1U << processor->cpu_id);
4122
4123         result = csw_check_locked(processor, pset, check_reason);
4124
4125         pset_unlock(pset);
4126
4127         return result;
4128 }
4129
4130 /*
4131  * Check for preemption at splsched with
4132  * pset and thread locked
4133  */
4134 ast_t
4135 csw_check_locked(
4136         processor_t             processor,
4137         processor_set_t pset __unused,
4138         ast_t                   check_reason)
4139 {
4140         ast_t                   result;
4141         thread_t                thread = processor->active_thread;
4142
4143         if (first_timeslice(processor)) {
4144                 if (rt_runq.count > 0)
4145                         return (check_reason | AST_PREEMPT | AST_URGENT);
4146         }
4147         else {
4148                 if (rt_runq.count > 0) {
4149                         if (BASEPRI_RTQUEUES > processor->current_pri)
4150                                 return (check_reason | AST_PREEMPT | AST_URGENT);
4151                         else
4152                                 return (check_reason | AST_PREEMPT);
4153                 }
4154         }
4155
4156         result = SCHED(processor_csw_check)(processor);
4157         if (result != AST_NONE)
4158                 return (check_reason | result);
4159
4160         if (SCHED(should_current_thread_rechoose_processor)(processor))
4161                 return (check_reason | AST_PREEMPT);
4162
4163         if (thread->state & TH_SUSP)
4164                 return (check_reason | AST_PREEMPT);
4165
4166         /*
4167          * Current thread may not need to be preempted, but maybe needs
4168          * an SFI wait?
4169          */
4170         result = sfi_thread_needs_ast(thread, NULL);
4171         if (result != AST_NONE)
4172                 return (check_reason | result);
4173
4174         return (AST_NONE);
4175 }
4176
4177 /*
4178  *      set_sched_pri:
4179  *
4180  *      Set the scheduled priority of the specified thread.
4181  *
4182  *      This may cause the thread to change queues.
4183  *
4184  *      Thread must be locked.
4185  */
4186 void
4187 set_sched_pri(
4188         thread_t                thread,
4189         int                     priority)
4190 {
4191         boolean_t               removed = thread_run_queue_remove(thread);
4192         int curgency, nurgency;
4193         uint64_t urgency_param1, urgency_param2;
4194         thread_t cthread = current_thread();
4195
4196         if (thread == cthread) {
4197                 curgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
4198         }
4199
4200         thread->sched_pri = priority;
4201
4202         if (thread == cthread) {
4203                 nurgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
4204 /* set_sched_pri doesn't alter RT params. We expect direct base priority/QoS
4205  * class alterations from user space to occur relatively infrequently, hence
4206  * those are lazily handled. QoS classes have distinct priority bands, and QoS
4207  * inheritance is expected to involve priority changes.
4208  */
4209                 if (nurgency != curgency) {
4210                         thread_tell_urgency(nurgency, urgency_param1, urgency_param2, thread);
4211                 }
4212         }
4213
4214         if (removed)
4215                 thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
4216         else
4217         if (thread->state & TH_RUN) {
4218                 processor_t             processor = thread->last_processor;
4219
4220                 if (thread == current_thread()) {
4221                         ast_t                   preempt;
4222
4223                         processor->current_pri = priority;
4224                         processor->current_thmode = thread->sched_mode;
4225                         processor->current_sfi_class = thread->sfi_class = sfi_thread_classify(thread);
4226                         if ((preempt = csw_check(processor, AST_NONE)) != AST_NONE)
4227                                 ast_on(preempt);
4228                 }
4229                 else
4230                 if (    processor != PROCESSOR_NULL                                             &&
4231                                 processor->active_thread == thread      )
4232                         cause_ast_check(processor);
4233         }
4234 }
4235
4236 #if             0
4237
4238 static void
4239 run_queue_check(
4240         run_queue_t             rq,
4241         thread_t                thread)
4242 {
4243         queue_t                 q;
4244         queue_entry_t   qe;
4245
4246         if (rq != thread->runq)
4247                 panic("run_queue_check: thread runq");
4248
4249         if (thread->sched_pri > MAXPRI || thread->sched_pri < MINPRI)
4250                 panic("run_queue_check: thread sched_pri");
4251
4252         q = &rq->queues[thread->sched_pri];
4253         qe = queue_first(q);
4254         while (!queue_end(q, qe)) {
4255                 if (qe == (queue_entry_t)thread)
4256                         return;
4257
4258                 qe = queue_next(qe);
4259         }
4260
4261         panic("run_queue_check: end");
4262 }
4263
4264 #endif  /* DEBUG */
4265
4266 #if defined(CONFIG_SCHED_TRADITIONAL)
4267
4268 /*
4269  * Locks the runqueue itself.
4270  *
4271  * Thread must be locked.
4272  */
4273 static boolean_t
4274 processor_queue_remove(
4275                                            processor_t                  processor,
4276                                            thread_t             thread)
4277 {
4278         void *                  rqlock;
4279         run_queue_t             rq;
4280
4281         rqlock = &processor->processor_set->sched_lock;
4282         rq = runq_for_processor(processor);
4283
4284         simple_lock(rqlock);
4285         if (processor == thread->runq) {
4286                 /*
4287                  *      Thread is on a run queue and we have a lock on
4288                  *      that run queue.
4289                  */
4290                 runq_consider_decr_bound_count(processor, thread);
4291                 run_queue_remove(rq, thread);
4292         }
4293         else {
4294                 /*
4295                  *      The thread left the run queue before we could
4296                  *      lock the run queue.
4297                  */
4298                 assert(thread->runq == PROCESSOR_NULL);
4299                 processor = PROCESSOR_NULL;
4300         }
4301
4302         simple_unlock(rqlock);
4303
4304         return (processor != PROCESSOR_NULL);
4305 }
4306
4307 #endif /* CONFIG_SCHED_TRADITIONAL */
4308
4309
4310 /*
4311  *      thread_run_queue_remove:
4312  *
4313  *      Remove a thread from its current run queue and
4314  *      return TRUE if successful.
4315  *
4316  *      Thread must be locked.
4317  *
4318  *      If thread->runq is PROCESSOR_NULL, the thread will not re-enter the
4319  *      run queues because the caller locked the thread.  Otherwise
4320  *      the thread is on a run queue, but could be chosen for dispatch
4321  *      and removed by another processor under a different lock, which
4322  *      will set thread->runq to PROCESSOR_NULL.
4323  *
4324  *      Hence the thread select path must not rely on anything that could
4325  *      be changed under the thread lock after calling this function,
4326  *      most importantly thread->sched_pri.
4327  */
4328 boolean_t
4329 thread_run_queue_remove(
4330                         thread_t        thread)
4331 {
4332         boolean_t removed = FALSE;
4333         processor_t processor = thread->runq;
4334
4335         if ((thread->state & (TH_RUN|TH_WAIT)) == TH_WAIT) {
4336                 /* Thread isn't runnable */
4337                 assert(thread->runq == PROCESSOR_NULL);
4338                 return FALSE;
4339         }
4340
4341         if (processor == PROCESSOR_NULL) {
4342                 /*
4343                  * The thread is either not on the runq,
4344                  * or is in the midst of being removed from the runq.
4345                  *
4346                  * runq is set to NULL under the pset lock, not the thread
4347                  * lock, so the thread may still be in the process of being dequeued
4348                  * from the runq. It will wait in invoke for the thread lock to be
4349                  * dropped.
4350                  */
4351
4352                 return FALSE;
4353         }
4354
4355         if (thread->sched_mode == TH_MODE_FAIRSHARE) {
4356                 return SCHED(fairshare_queue_remove)(thread);
4357         }
4358
4359         if (thread->sched_pri < BASEPRI_RTQUEUES) {
4360                 return SCHED(processor_queue_remove)(processor, thread);
4361         }
4362
4363         simple_lock(&rt_lock);
4364
4365         if (thread->runq != PROCESSOR_NULL) {
4366                 /*
4367                  *      Thread is on a run queue and we have a lock on
4368                  *      that run queue.
4369                  */
4370
4371                 assert(thread->runq == RT_RUNQ);
4372
4373                 remqueue((queue_entry_t)thread);
4374                 SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count);
4375                 rt_runq.count--;
4376
4377                 thread->runq = PROCESSOR_NULL;
4378
4379                 removed = TRUE;
4380         }
4381
4382         simple_unlock(&rt_lock);
4383
4384         return (removed);
4385 }
4386
4387 #if defined(CONFIG_SCHED_TRADITIONAL)
4388
4389 /*
4390  *      steal_processor_thread:
4391  *
4392  *      Locate a thread to steal from the processor and
4393  *      return it.
4394  *
4395  *      Associated pset must be locked.  Returns THREAD_NULL
4396  *      on failure.
4397  */
4398 static thread_t
4399 steal_processor_thread(
4400         processor_t             processor)
4401 {
4402         run_queue_t             rq = runq_for_processor(processor);
4403         queue_t                 queue = rq->queues + rq->highq;
4404         int                             pri = rq->highq, count = rq->count;
4405         thread_t                thread;
4406
4407         while (count > 0) {
4408                 thread = (thread_t)queue_first(queue);
4409                 while (!queue_end(queue, (queue_entry_t)thread)) {
4410                         if (thread->bound_processor == PROCESSOR_NULL) {
4411                                 remqueue((queue_entry_t)thread);
4412
4413                                 thread->runq = PROCESSOR_NULL;
4414                                 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
4415                                 runq_consider_decr_bound_count(processor, thread);
4416                                 rq->count--;
4417                                 if (SCHED(priority_is_urgent)(pri)) {
4418                                         rq->urgency--; assert(rq->urgency >= 0);
4419                                 }
4420                                 if (queue_empty(queue)) {
4421                                         if (pri != IDLEPRI)
4422                                                 clrbit(MAXPRI - pri, rq->bitmap);
4423                                         rq->highq = MAXPRI - ffsbit(rq->bitmap);
4424                                 }
4425
4426                                 return (thread);
4427                         }
4428                         count--;
4429
4430                         thread = (thread_t)queue_next((queue_entry_t)thread);
4431                 }
4432
4433                 queue--; pri--;
4434         }
4435
4436         return (THREAD_NULL);
4437 }
4438
4439 /*
4440  *      Locate and steal a thread, beginning
4441  *      at the pset.
4442  *
4443  *      The pset must be locked, and is returned
4444  *      unlocked.
4445  *
4446  *      Returns the stolen thread, or THREAD_NULL on
4447  *      failure.
4448  */
4449 static thread_t
4450 steal_thread(
4451         processor_set_t         pset)
4452 {
4453         processor_set_t         nset, cset = pset;
4454         processor_t                     processor;
4455         thread_t                        thread;
4456
4457         do {
4458                 processor = (processor_t)queue_first(&cset->active_queue);
4459                 while (!queue_end(&cset->active_queue, (queue_entry_t)processor)) {
4460                         if (runq_for_processor(processor)->count > 0) {
4461                                 thread = steal_processor_thread(processor);
4462                                 if (thread != THREAD_NULL) {
4463                                         remqueue((queue_entry_t)processor);
4464                                         enqueue_tail(&cset->active_queue, (queue_entry_t)processor);
4465
4466                                         pset_unlock(cset);
4467
4468                                         return (thread);
4469                                 }
4470                         }
4471
4472                         processor = (processor_t)queue_next((queue_entry_t)processor);
4473                 }
4474
4475                 nset = next_pset(cset);
4476
4477                 if (nset != pset) {
4478                         pset_unlock(cset);
4479
4480                         cset = nset;
4481                         pset_lock(cset);
4482                 }
4483         } while (nset != pset);
4484
4485         pset_unlock(cset);
4486
4487         return (THREAD_NULL);
4488 }
4489
4490 static thread_t steal_thread_disabled(
4491                                         processor_set_t         pset)
4492 {
4493         pset_unlock(pset);
4494
4495         return (THREAD_NULL);
4496 }
4497
4498 #endif /* CONFIG_SCHED_TRADITIONAL */
4499
4500
4501 void
4502 sys_override_cpu_throttle(int flag)
4503 {
4504         if (flag == CPU_THROTTLE_ENABLE)
4505                 cpu_throttle_enabled = 1;
4506         if (flag == CPU_THROTTLE_DISABLE)
4507                 cpu_throttle_enabled = 0;
4508 }
4509
4510 int
4511 thread_get_urgency(thread_t thread, uint64_t *arg1, uint64_t *arg2)
4512 {
4513         if (thread == NULL || (thread->state & TH_IDLE)) {
4514                 *arg1 = 0;
4515                 *arg2 = 0;
4516
4517                 return (THREAD_URGENCY_NONE);
4518         } else if (thread->sched_mode == TH_MODE_REALTIME) {
4519                 *arg1 = thread->realtime.period;
4520                 *arg2 = thread->realtime.deadline;
4521
4522                 return (THREAD_URGENCY_REAL_TIME);
4523         } else if (cpu_throttle_enabled &&
4524                    ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->priority <= MAXPRI_THROTTLE)))  {
4525                 /*
4526                  * Background urgency applied when thread priority is MAXPRI_THROTTLE or lower and thread is not promoted
4527                  * TODO: Use TH_SFLAG_THROTTLED instead?
4528                  */
4529                 *arg1 = thread->sched_pri;
4530                 *arg2 = thread->priority;
4531
4532                 return (THREAD_URGENCY_BACKGROUND);
4533         } else {
4534                 /* For otherwise unclassified threads, report throughput QoS
4535                  * parameters
4536                  */
4537                 *arg1 = thread->effective_policy.t_through_qos;
4538                 *arg2 = thread->task->effective_policy.t_through_qos;
4539
4540                 return (THREAD_URGENCY_NORMAL);
4541         }
4542 }
4543
4544
4545 /*
4546  *      This is the processor idle loop, which just looks for other threads
4547  *      to execute.  Processor idle threads invoke this without supplying a
4548  *      current thread to idle without an asserted wait state.
4549  *
4550  *      Returns a the next thread to execute if dispatched directly.
4551  */
4552
4553 #if 0
4554 #define IDLE_KERNEL_DEBUG_CONSTANT(...) KERNEL_DEBUG_CONSTANT(__VA_ARGS__)
4555 #else
4556 #define IDLE_KERNEL_DEBUG_CONSTANT(...) do { } while(0)
4557 #endif
4558
4559 thread_t
4560 processor_idle(
4561         thread_t                        thread,
4562         processor_t                     processor)
4563 {
4564         processor_set_t         pset = processor->processor_set;
4565         thread_t                        new_thread;
4566         int                                     state;
4567         (void)splsched();
4568
4569         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4570                 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_START,
4571                 (uintptr_t)thread_tid(thread), 0, 0, 0, 0);
4572
4573         SCHED_STATS_CPU_IDLE_START(processor);
4574
4575         timer_switch(&PROCESSOR_DATA(processor, system_state),
4576                                                                         mach_absolute_time(), &PROCESSOR_DATA(processor, idle_state));
4577         PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, idle_state);
4578
4579         while (1) {
4580                 if (processor->state != PROCESSOR_IDLE) /* unsafe, but worst case we loop around once */
4581                         break;
4582                 if (pset->pending_AST_cpu_mask & (1U << processor->cpu_id))
4583                         break;
4584                 if (rt_runq.count)
4585                         break;
4586 #if CONFIG_SCHED_IDLE_IN_PLACE
4587                 if (thread != THREAD_NULL) {
4588                         /* Did idle-in-place thread wake up */
4589                         if ((thread->state & (TH_WAIT|TH_SUSP)) != TH_WAIT || thread->wake_active)
4590                                 break;
4591                 }
4592 #endif
4593
4594                 IDLE_KERNEL_DEBUG_CONSTANT(
4595                         MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq.count, SCHED(processor_runq_count)(processor), -1, 0);
4596
4597                 machine_track_platform_idle(TRUE);
4598
4599                 machine_idle();
4600
4601                 machine_track_platform_idle(FALSE);
4602
4603                 (void)splsched();
4604
4605                 IDLE_KERNEL_DEBUG_CONSTANT(
4606                         MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq.count, SCHED(processor_runq_count)(processor), -2, 0);
4607
4608                 if (!SCHED(processor_queue_empty)(processor)) {
4609                         /* Secondary SMT processors respond to directed wakeups
4610                          * exclusively. Some platforms induce 'spurious' SMT wakeups.
4611                          */
4612                         if (processor->processor_primary == processor)
4613                                         break;
4614                 }
4615         }
4616
4617         timer_switch(&PROCESSOR_DATA(processor, idle_state),
4618                                                                         mach_absolute_time(), &PROCESSOR_DATA(processor, system_state));
4619         PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, system_state);
4620
4621         pset_lock(pset);
4622
4623         /* If we were sent a remote AST and came out of idle, acknowledge it here with pset lock held */
4624         pset->pending_AST_cpu_mask &= ~(1U << processor->cpu_id);
4625
4626         state = processor->state;
4627         if (state == PROCESSOR_DISPATCHING) {
4628                 /*
4629                  *      Commmon case -- cpu dispatched.
4630                  */
4631                 new_thread = processor->next_thread;
4632                 processor->next_thread = THREAD_NULL;
4633                 processor->state = PROCESSOR_RUNNING;
4634
4635                 if ((new_thread != THREAD_NULL) && (SCHED(processor_queue_has_priority)(processor, new_thread->sched_pri, FALSE)                                        ||
4636                                                                                         (rt_runq.count > 0 && BASEPRI_RTQUEUES >= new_thread->sched_pri))       ) {
4637                         /* Something higher priority has popped up on the runqueue - redispatch this thread elsewhere */
4638                         processor->current_pri = IDLEPRI;
4639                         processor->current_thmode = TH_MODE_FIXED;
4640                         processor->current_sfi_class = SFI_CLASS_KERNEL;
4641                         processor->deadline = UINT64_MAX;
4642
4643                         pset_unlock(pset);
4644
4645                         thread_lock(new_thread);
4646                         KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REDISPATCH), (uintptr_t)thread_tid(new_thread), new_thread->sched_pri, rt_runq.count, 0, 0);
4647                         thread_setrun(new_thread, SCHED_HEADQ);
4648                         thread_unlock(new_thread);
4649
4650                         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4651                                 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4652                                 (uintptr_t)thread_tid(thread), state, 0, 0, 0);
4653
4654                         return (THREAD_NULL);
4655                 }
4656
4657                 pset_unlock(pset);
4658
4659                 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4660                         MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4661                         (uintptr_t)thread_tid(thread), state, (uintptr_t)thread_tid(new_thread), 0, 0);
4662
4663                 return (new_thread);
4664         }
4665         else
4666         if (state == PROCESSOR_IDLE) {
4667                 remqueue((queue_entry_t)processor);
4668
4669                 processor->state = PROCESSOR_RUNNING;
4670                 processor->current_pri = IDLEPRI;
4671                 processor->current_thmode = TH_MODE_FIXED;
4672                 processor->current_sfi_class = SFI_CLASS_KERNEL;
4673                 processor->deadline = UINT64_MAX;
4674                 enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
4675         }
4676         else
4677         if (state == PROCESSOR_SHUTDOWN) {
4678                 /*
4679                  *      Going off-line.  Force a
4680                  *      reschedule.
4681                  */
4682                 if ((new_thread = processor->next_thread) != THREAD_NULL) {
4683                         processor->next_thread = THREAD_NULL;
4684                         processor->current_pri = IDLEPRI;
4685                         processor->current_thmode = TH_MODE_FIXED;
4686                         processor->current_sfi_class = SFI_CLASS_KERNEL;
4687                         processor->deadline = UINT64_MAX;
4688
4689                         pset_unlock(pset);
4690
4691                         thread_lock(new_thread);
4692                         thread_setrun(new_thread, SCHED_HEADQ);
4693                         thread_unlock(new_thread);
4694
4695                         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4696                                 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4697                                 (uintptr_t)thread_tid(thread), state, 0, 0, 0);
4698
4699                         return (THREAD_NULL);
4700                 }
4701         }
4702
4703         pset_unlock(pset);
4704
4705         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4706                 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4707                 (uintptr_t)thread_tid(thread), state, 0, 0, 0);
4708
4709         return (THREAD_NULL);
4710 }
4711
4712 /*
4713  *      Each processor has a dedicated thread which
4714  *      executes the idle loop when there is no suitable
4715  *      previous context.
4716  */
4717 void
4718 idle_thread(void)
4719 {
4720         processor_t             processor = current_processor();
4721         thread_t                new_thread;
4722
4723         new_thread = processor_idle(THREAD_NULL, processor);
4724         if (new_thread != THREAD_NULL) {
4725                 thread_run(processor->idle_thread, (thread_continue_t)idle_thread, NULL, new_thread);
4726                 /*NOTREACHED*/
4727         }
4728
4729         thread_block((thread_continue_t)idle_thread);
4730         /*NOTREACHED*/
4731 }
4732
4733 kern_return_t
4734 idle_thread_create(
4735         processor_t             processor)
4736 {
4737         kern_return_t   result;
4738         thread_t                thread;
4739         spl_t                   s;
4740
4741         result = kernel_thread_create((thread_continue_t)idle_thread, NULL, MAXPRI_KERNEL, &thread);
4742         if (result != KERN_SUCCESS)
4743                 return (result);
4744
4745         s = splsched();
4746         thread_lock(thread);
4747         thread->bound_processor = processor;
4748         processor->idle_thread = thread;
4749         thread->sched_pri = thread->priority = IDLEPRI;
4750         thread->state = (TH_RUN | TH_IDLE);
4751         thread->options |= TH_OPT_IDLE_THREAD;
4752         thread_unlock(thread);
4753         splx(s);
4754
4755         thread_deallocate(thread);
4756
4757         return (KERN_SUCCESS);
4758 }
4759
4760 /*
4761  * sched_startup:
4762  *
4763  * Kicks off scheduler services.
4764  *
4765  * Called at splsched.
4766  */
4767 void
4768 sched_startup(void)
4769 {
4770         kern_return_t   result;
4771         thread_t                thread;
4772
4773         result = kernel_thread_start_priority((thread_continue_t)sched_init_thread,
4774             (void *)SCHED(maintenance_continuation), MAXPRI_KERNEL, &thread);
4775         if (result != KERN_SUCCESS)
4776                 panic("sched_startup");
4777
4778         thread_deallocate(thread);
4779
4780         /*
4781          * Yield to the sched_init_thread once, to
4782          * initialize our own thread after being switched
4783          * back to.
4784          *
4785          * The current thread is the only other thread
4786          * active at this point.
4787          */
4788         thread_block(THREAD_CONTINUE_NULL);
4789 }
4790
4791 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
4792
4793 static volatile uint64_t                sched_maintenance_deadline;
4794 #if defined(CONFIG_TELEMETRY)
4795 static volatile uint64_t                sched_telemetry_deadline = 0;
4796 #endif
4797 static uint64_t                         sched_tick_last_abstime;
4798 static uint64_t                         sched_tick_delta;
4799 uint64_t                                sched_tick_max_delta;
4800 /*
4801  *      sched_init_thread:
4802  *
4803  *      Perform periodic bookkeeping functions about ten
4804  *      times per second.
4805  */
4806 void
4807 sched_traditional_maintenance_continue(void)
4808 {
4809         uint64_t        sched_tick_ctime, late_time;
4810
4811         sched_tick_ctime = mach_absolute_time();
4812
4813         if (__improbable(sched_tick_last_abstime == 0)) {
4814                 sched_tick_last_abstime = sched_tick_ctime;
4815                 late_time = 0;
4816                 sched_tick_delta = 1;
4817         } else {
4818                 late_time = sched_tick_ctime - sched_tick_last_abstime;
4819                 sched_tick_delta = late_time / sched_tick_interval;
4820                 /* Ensure a delta of 1, since the interval could be slightly
4821                  * smaller than the sched_tick_interval due to dispatch
4822                  * latencies.
4823                  */
4824                 sched_tick_delta = MAX(sched_tick_delta, 1);
4825
4826                 /* In the event interrupt latencies or platform
4827                  * idle events that advanced the timebase resulted
4828                  * in periods where no threads were dispatched,
4829                  * cap the maximum "tick delta" at SCHED_TICK_MAX_DELTA
4830                  * iterations.
4831                  */
4832                 sched_tick_delta = MIN(sched_tick_delta, SCHED_TICK_MAX_DELTA);
4833
4834                 sched_tick_last_abstime = sched_tick_ctime;
4835                 sched_tick_max_delta = MAX(sched_tick_delta, sched_tick_max_delta);
4836         }
4837
4838         KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE)|DBG_FUNC_START,
4839                                                   sched_tick_delta,
4840                                                   late_time,
4841                                                   0,
4842                                                   0,
4843                                                   0);
4844
4845         /* Add a number of pseudo-ticks corresponding to the elapsed interval
4846          * This could be greater than 1 if substantial intervals where
4847          * all processors are idle occur, which rarely occurs in practice.
4848          */
4849
4850         sched_tick += sched_tick_delta;
4851
4852         /*
4853          *  Compute various averages.
4854          */
4855         compute_averages(sched_tick_delta);
4856
4857         /*
4858          *  Scan the run queues for threads which
4859          *  may need to be updated.
4860          */
4861         SCHED(thread_update_scan)();
4862
4863         KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE)|DBG_FUNC_END,
4864                                                   sched_pri_shift,
4865                                                   sched_background_pri_shift,
4866                                                   0,
4867                                                   0,
4868                                                   0);
4869
4870         assert_wait((event_t)sched_traditional_maintenance_continue, THREAD_UNINT);
4871         thread_block((thread_continue_t)sched_traditional_maintenance_continue);
4872         /*NOTREACHED*/
4873 }
4874
4875 static uint64_t sched_maintenance_wakeups;
4876
4877 /*
4878  * Determine if the set of routines formerly driven by a maintenance timer
4879  * must be invoked, based on a deadline comparison. Signals the scheduler
4880  * maintenance thread on deadline expiration. Must be invoked at an interval
4881  * lower than the "sched_tick_interval", currently accomplished by
4882  * invocation via the quantum expiration timer and at context switch time.
4883  * Performance matters: this routine reuses a timestamp approximating the
4884  * current absolute time received from the caller, and should perform
4885  * no more than a comparison against the deadline in the common case.
4886  */
4887 void
4888 sched_traditional_consider_maintenance(uint64_t ctime) {
4889         uint64_t ndeadline, deadline = sched_maintenance_deadline;
4890
4891         if (__improbable(ctime >= deadline)) {
4892                 if (__improbable(current_thread() == sched_maintenance_thread))
4893                         return;
4894                 OSMemoryBarrier();
4895
4896                 ndeadline = ctime + sched_tick_interval;
4897
4898                 if (__probable(__sync_bool_compare_and_swap(&sched_maintenance_deadline, deadline, ndeadline))) {
4899                         thread_wakeup((event_t)sched_traditional_maintenance_continue);
4900                         sched_maintenance_wakeups++;
4901                 }
4902         }
4903
4904 #if defined(CONFIG_TELEMETRY)
4905         /*
4906          * Windowed telemetry is driven by the scheduler.  It should be safe
4907          * to call compute_telemetry_windowed() even when windowed telemetry
4908          * is disabled, but we should try to avoid doing extra work for no
4909          * reason.
4910          */
4911         if (telemetry_window_enabled) {
4912                 deadline = sched_telemetry_deadline;
4913
4914                 if (__improbable(ctime >= deadline)) {
4915                         ndeadline = ctime + sched_telemetry_interval;
4916
4917                         if (__probable(__sync_bool_compare_and_swap(&sched_telemetry_deadline, deadline, ndeadline))) {
4918                                 compute_telemetry_windowed();
4919                         }
4920                 }
4921         }
4922 #endif /* CONFIG_TELEMETRY */
4923 }
4924
4925 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
4926
4927 void
4928 sched_init_thread(void (*continuation)(void))
4929 {
4930         thread_block(THREAD_CONTINUE_NULL);
4931
4932         sched_maintenance_thread = current_thread();
4933         continuation();
4934
4935         /*NOTREACHED*/
4936 }
4937
4938 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
4939
4940 /*
4941  *      thread_update_scan / runq_scan:
4942  *
4943  *      Scan the run queues to account for timesharing threads
4944  *      which need to be updated.
4945  *
4946  *      Scanner runs in two passes.  Pass one squirrels likely
4947  *      threads away in an array, pass two does the update.
4948  *
4949  *      This is necessary because the run queue is locked for
4950  *      the candidate scan, but the thread is locked for the update.
4951  *
4952  *      Array should be sized to make forward progress, without
4953  *      disabling preemption for long periods.
4954  */
4955
4956 #define THREAD_UPDATE_SIZE              128
4957
4958 static thread_t         thread_update_array[THREAD_UPDATE_SIZE];
4959 static int                      thread_update_count = 0;
4960
4961 /* Returns TRUE if thread was added, FALSE if thread_update_array is full */
4962 boolean_t
4963 thread_update_add_thread(thread_t thread)
4964 {
4965         if (thread_update_count == THREAD_UPDATE_SIZE)
4966                 return (FALSE);
4967
4968         thread_update_array[thread_update_count++] = thread;
4969         thread_reference_internal(thread);
4970         return (TRUE);
4971 }
4972
4973 void
4974 thread_update_process_threads(void)
4975 {
4976         while (thread_update_count > 0) {
4977                 spl_t   s;
4978                 thread_t thread = thread_update_array[--thread_update_count];
4979                 thread_update_array[thread_update_count] = THREAD_NULL;
4980
4981                 s = splsched();
4982                 thread_lock(thread);
4983                 if (!(thread->state & (TH_WAIT)) && (SCHED(can_update_priority)(thread))) {
4984                         SCHED(update_priority)(thread);
4985                 }
4986                 thread_unlock(thread);
4987                 splx(s);
4988
4989                 thread_deallocate(thread);
4990         }
4991 }
4992
4993 /*
4994  *      Scan a runq for candidate threads.
4995  *
4996  *      Returns TRUE if retry is needed.
4997  */
4998 boolean_t
4999 runq_scan(
5000         run_queue_t                             runq)
5001 {
5002         register int                    count;
5003         register queue_t                q;
5004         register thread_t               thread;
5005
5006         if ((count = runq->count) > 0) {
5007             q = runq->queues + runq->highq;
5008                 while (count > 0) {
5009                         queue_iterate(q, thread, thread_t, links) {
5010                                 if (            thread->sched_stamp != sched_tick               &&
5011                                                 (thread->sched_mode == TH_MODE_TIMESHARE)       ) {
5012                                         if (thread_update_add_thread(thread) == FALSE)
5013                                                 return (TRUE);
5014                                 }
5015
5016                                 count--;
5017                         }
5018
5019                         q--;
5020                 }
5021         }
5022
5023         return (FALSE);
5024 }
5025
5026 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
5027
5028 #if defined(CONFIG_SCHED_TRADITIONAL)
5029
5030 static void
5031 thread_update_scan(void)
5032 {
5033         boolean_t                       restart_needed = FALSE;
5034         processor_t                     processor = processor_list;
5035         processor_set_t         pset;
5036         thread_t                        thread;
5037         spl_t                           s;
5038
5039         do {
5040                 do {
5041                         /*
5042                          * TODO: in sched_traditional_use_pset_runqueue case,
5043                          *  avoid scanning the same runq multiple times
5044                          */
5045                         pset = processor->processor_set;
5046
5047                         s = splsched();
5048                         pset_lock(pset);
5049
5050                         restart_needed = runq_scan(runq_for_processor(processor));
5051
5052                         pset_unlock(pset);
5053                         splx(s);
5054
5055                         if (restart_needed)
5056                                 break;
5057
5058                         thread = processor->idle_thread;
5059                         if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) {
5060                                 if (thread_update_add_thread(thread) == FALSE) {
5061                                         restart_needed = TRUE;
5062                                         break;
5063                                 }
5064                         }
5065                 } while ((processor = processor->processor_list) != NULL);
5066
5067                 /* Ok, we now have a collection of candidates -- fix them. */
5068                 thread_update_process_threads();
5069         } while (restart_needed);
5070 }
5071
5072 #endif /* CONFIG_SCHED_TRADITIONAL */
5073
5074 boolean_t
5075 thread_eager_preemption(thread_t thread)
5076 {
5077         return ((thread->sched_flags & TH_SFLAG_EAGERPREEMPT) != 0);
5078 }
5079
5080 void
5081 thread_set_eager_preempt(thread_t thread)
5082 {
5083         spl_t x;
5084         processor_t p;
5085         ast_t ast = AST_NONE;
5086
5087         x = splsched();
5088         p = current_processor();
5089
5090         thread_lock(thread);
5091         thread->sched_flags |= TH_SFLAG_EAGERPREEMPT;
5092
5093         if (thread == current_thread()) {
5094
5095                 ast = csw_check(p, AST_NONE);
5096                 thread_unlock(thread);
5097                 if (ast != AST_NONE) {
5098                         (void) thread_block_reason(THREAD_CONTINUE_NULL, NULL, ast);
5099                 }
5100         } else {
5101                 p = thread->last_processor;
5102
5103                 if (p != PROCESSOR_NULL && p->state == PROCESSOR_RUNNING &&
5104                         p->active_thread == thread) {
5105                         cause_ast_check(p);
5106                 }
5107
5108                 thread_unlock(thread);
5109         }
5110
5111         splx(x);
5112 }
5113
5114 void
5115 thread_clear_eager_preempt(thread_t thread)
5116 {
5117         spl_t x;
5118
5119         x = splsched();
5120         thread_lock(thread);
5121
5122         thread->sched_flags &= ~TH_SFLAG_EAGERPREEMPT;
5123
5124         thread_unlock(thread);
5125         splx(x);
5126 }
5127 /*
5128  * Scheduling statistics
5129  */
5130 void
5131 sched_stats_handle_csw(processor_t processor, int reasons, int selfpri, int otherpri)
5132 {
5133         struct processor_sched_statistics *stats;
5134         boolean_t to_realtime = FALSE;
5135
5136         stats = &processor->processor_data.sched_stats;
5137         stats->csw_count++;
5138
5139         if (otherpri >= BASEPRI_REALTIME) {
5140                 stats->rt_sched_count++;
5141                 to_realtime = TRUE;
5142         }
5143
5144         if ((reasons & AST_PREEMPT) != 0) {
5145                 stats->preempt_count++;
5146
5147                 if (selfpri >= BASEPRI_REALTIME) {
5148                         stats->preempted_rt_count++;
5149                 }
5150
5151                 if (to_realtime) {
5152                         stats->preempted_by_rt_count++;
5153                 }
5154
5155         }
5156 }
5157
5158 void
5159 sched_stats_handle_runq_change(struct runq_stats *stats, int old_count)
5160 {
5161         uint64_t timestamp = mach_absolute_time();
5162
5163         stats->count_sum += (timestamp - stats->last_change_timestamp) * old_count;
5164         stats->last_change_timestamp = timestamp;
5165 }
5166
5167 /*
5168  *     For calls from assembly code
5169  */
5170 #undef thread_wakeup
5171 void
5172 thread_wakeup(
5173        event_t         x);
5174
5175 void
5176 thread_wakeup(
5177        event_t         x)
5178 {
5179        thread_wakeup_with_result(x, THREAD_AWAKENED);
5180 }
5181
5182 boolean_t
5183 preemption_enabled(void)
5184 {
5185         return (get_preemption_level() == 0 && ml_get_interrupts_enabled());
5186 }
5187
5188 __assert_only static boolean_t
5189 thread_runnable(
5190         thread_t        thread)
5191 {
5192         return ((thread->state & (TH_RUN|TH_WAIT)) == TH_RUN);
5193 }
5194
5195 static void
5196 sched_timer_deadline_tracking_init(void) {
5197         nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT, &timer_deadline_tracking_bin_1);
5198         nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT, &timer_deadline_tracking_bin_2);
5199 }