[apple/xnu.git] / osfmk / kern / sched_prim.c (xnu-2422.90.20)
1 /*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_FREE_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: sched_prim.c
60 * Author: Avadis Tevanian, Jr.
61 * Date: 1986
62 *
63 * Scheduling primitives
64 *
65 */
66
67 #include <debug.h>
68
69 #include <mach/mach_types.h>
70 #include <mach/machine.h>
71 #include <mach/policy.h>
72 #include <mach/sync_policy.h>
73 #include <mach/thread_act.h>
74
75 #include <machine/machine_routines.h>
76 #include <machine/sched_param.h>
77 #include <machine/machine_cpu.h>
78 #include <machine/machlimits.h>
79
80 #include <kern/kern_types.h>
81 #include <kern/clock.h>
82 #include <kern/counters.h>
83 #include <kern/cpu_number.h>
84 #include <kern/cpu_data.h>
85 #include <kern/debug.h>
86 #include <kern/lock.h>
87 #include <kern/macro_help.h>
88 #include <kern/machine.h>
89 #include <kern/misc_protos.h>
90 #include <kern/processor.h>
91 #include <kern/queue.h>
92 #include <kern/sched.h>
93 #include <kern/sched_prim.h>
94 #include <kern/syscall_subr.h>
95 #include <kern/task.h>
96 #include <kern/thread.h>
97 #include <kern/wait_queue.h>
98 #include <kern/ledger.h>
99 #include <kern/timer_queue.h>
100
101 #include <vm/pmap.h>
102 #include <vm/vm_kern.h>
103 #include <vm/vm_map.h>
104
105 #include <mach/sdt.h>
106
107 #include <sys/kdebug.h>
108
109 #include <kern/pms.h>
110
111 struct rt_queue rt_runq;
112 #define RT_RUNQ ((processor_t)-1)
113 decl_simple_lock_data(static,rt_lock);
114
115 #if defined(CONFIG_SCHED_TRADITIONAL) || defined(CONFIG_SCHED_PROTO) || defined(CONFIG_SCHED_GRRR) || defined(CONFIG_SCHED_FIXEDPRIORITY)
116 static struct fairshare_queue fs_runq;
117 #define FS_RUNQ ((processor_t)-2)
118 decl_simple_lock_data(static,fs_lock);
119 #endif
120
121 #define DEFAULT_PREEMPTION_RATE 100 /* (1/s) */
122 int default_preemption_rate = DEFAULT_PREEMPTION_RATE;
123
124 #define DEFAULT_BG_PREEMPTION_RATE 400 /* (1/s) */
125 int default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
126
127 #define MAX_UNSAFE_QUANTA 800
128 int max_unsafe_quanta = MAX_UNSAFE_QUANTA;
129
130 #define MAX_POLL_QUANTA 2
131 int max_poll_quanta = MAX_POLL_QUANTA;
132
133 #define SCHED_POLL_YIELD_SHIFT 4 /* 1/16 */
134 int sched_poll_yield_shift = SCHED_POLL_YIELD_SHIFT;
135
136 uint64_t max_poll_computation;
137
138 uint64_t max_unsafe_computation;
139 uint64_t sched_safe_duration;
140
141 #if defined(CONFIG_SCHED_TRADITIONAL)
142
143 uint32_t std_quantum;
144 uint32_t min_std_quantum;
145 uint32_t bg_quantum;
146
147 uint32_t std_quantum_us;
148 uint32_t bg_quantum_us;
149
150 #endif /* CONFIG_SCHED_TRADITIONAL */
151
152 uint32_t thread_depress_time;
153 uint32_t default_timeshare_computation;
154 uint32_t default_timeshare_constraint;
155
156 uint32_t max_rt_quantum;
157 uint32_t min_rt_quantum;
158
159 #if defined(CONFIG_SCHED_TRADITIONAL)
160
161 unsigned sched_tick;
162 uint32_t sched_tick_interval;
163
164 uint32_t sched_pri_shift = INT8_MAX;
165 uint32_t sched_background_pri_shift = INT8_MAX;
166 uint32_t sched_combined_fgbg_pri_shift = INT8_MAX;
167 uint32_t sched_fixed_shift;
168 uint32_t sched_use_combined_fgbg_decay = 0;
169
170 uint32_t sched_decay_usage_age_factor = 1; /* accelerate 5/8^n usage aging */
171
172 static boolean_t sched_traditional_use_pset_runqueue = FALSE;
173
174 /* Defaults for timer deadline profiling */
175 #define TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT 2000000 /* Timers with deadlines <=
176 * 2ms */
177 #define TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT 5000000 /* Timers with deadlines
178 <= 5ms */
179
180 uint64_t timer_deadline_tracking_bin_1;
181 uint64_t timer_deadline_tracking_bin_2;
182
183 thread_t sched_maintenance_thread;
184
185 __attribute__((always_inline))
186 static inline run_queue_t runq_for_processor(processor_t processor)
187 {
188 if (sched_traditional_use_pset_runqueue)
189 return &processor->processor_set->pset_runq;
190 else
191 return &processor->runq;
192 }
193
194 __attribute__((always_inline))
195 static inline void runq_consider_incr_bound_count(processor_t processor, thread_t thread)
196 {
197 if (thread->bound_processor == PROCESSOR_NULL)
198 return;
199
200 assert(thread->bound_processor == processor);
201
202 if (sched_traditional_use_pset_runqueue)
203 processor->processor_set->pset_runq_bound_count++;
204
205 processor->runq_bound_count++;
206 }
207
208 __attribute__((always_inline))
209 static inline void runq_consider_decr_bound_count(processor_t processor, thread_t thread)
210 {
211 if (thread->bound_processor == PROCESSOR_NULL)
212 return;
213
214 assert(thread->bound_processor == processor);
215
216 if (sched_traditional_use_pset_runqueue)
217 processor->processor_set->pset_runq_bound_count--;
218
219 processor->runq_bound_count--;
220 }
221
222 #endif /* CONFIG_SCHED_TRADITIONAL */
223
224 uint64_t sched_one_second_interval;
225
226 uint32_t sched_run_count, sched_share_count, sched_background_count;
227 uint32_t sched_load_average, sched_mach_factor;
228
229 /* Forwards */
230
231 #if defined(CONFIG_SCHED_TRADITIONAL)
232
233 static void load_shift_init(void);
234 static void preempt_pri_init(void);
235
236 #endif /* CONFIG_SCHED_TRADITIONAL */
237
238 static thread_t thread_select(
239 thread_t thread,
240 processor_t processor);
241
242 #if CONFIG_SCHED_IDLE_IN_PLACE
243 static thread_t thread_select_idle(
244 thread_t thread,
245 processor_t processor);
246 #endif
247
248 thread_t processor_idle(
249 thread_t thread,
250 processor_t processor);
251
252 ast_t
253 csw_check_locked( processor_t processor,
254 processor_set_t pset);
255
256 #if defined(CONFIG_SCHED_TRADITIONAL)
257
258 static thread_t steal_thread(
259 processor_set_t pset);
260
261 static thread_t steal_thread_disabled(
262 processor_set_t pset) __attribute__((unused));
263
264
265 static thread_t steal_processor_thread(
266 processor_t processor);
267
268 static void thread_update_scan(void);
269
270 static void processor_setrun(
271 processor_t processor,
272 thread_t thread,
273 integer_t options);
274
275 static boolean_t
276 processor_enqueue(
277 processor_t processor,
278 thread_t thread,
279 integer_t options);
280
281 static boolean_t
282 processor_queue_remove(
283 processor_t processor,
284 thread_t thread);
285
286 static boolean_t processor_queue_empty(processor_t processor);
287
288 static boolean_t priority_is_urgent(int priority);
289
290 static ast_t processor_csw_check(processor_t processor);
291
292 static boolean_t processor_queue_has_priority(processor_t processor,
293 int priority,
294 boolean_t gte);
295
296 static boolean_t should_current_thread_rechoose_processor(processor_t processor);
297
298 static int sched_traditional_processor_runq_count(processor_t processor);
299
300 static boolean_t sched_traditional_with_pset_runqueue_processor_queue_empty(processor_t processor);
301
302 static uint64_t sched_traditional_processor_runq_stats_count_sum(processor_t processor);
303
304 static uint64_t sched_traditional_with_pset_runqueue_processor_runq_stats_count_sum(processor_t processor);
305 #endif
306
307
308 #if defined(CONFIG_SCHED_TRADITIONAL)
309
310 static void
311 sched_traditional_init(void);
312
313 static void
314 sched_traditional_timebase_init(void);
315
316 static void
317 sched_traditional_processor_init(processor_t processor);
318
319 static void
320 sched_traditional_pset_init(processor_set_t pset);
321
322 static void
323 sched_traditional_with_pset_runqueue_init(void);
324
325 #endif
326
327 static void
328 sched_realtime_init(void);
329
330 static void
331 sched_realtime_timebase_init(void);
332
333 static void
334 sched_timer_deadline_tracking_init(void);
335
336 #if defined(CONFIG_SCHED_TRADITIONAL)
337 static void
338 sched_traditional_maintenance_continue(void);
339
340 static uint32_t
341 sched_traditional_initial_quantum_size(thread_t thread);
342
343 static sched_mode_t
344 sched_traditional_initial_thread_sched_mode(task_t parent_task);
345
346 static boolean_t
347 sched_traditional_supports_timeshare_mode(void);
348
349 static thread_t
350 sched_traditional_choose_thread(
351 processor_t processor,
352 int priority);
353
354 #endif
355
356 #if DEBUG
357 extern int debug_task;
358 #define TLOG(a, fmt, args...) if(debug_task & a) kprintf(fmt, ## args)
359 #else
360 #define TLOG(a, fmt, args...) do {} while (0)
361 #endif
362
363 #if DEBUG
364 static
365 boolean_t thread_runnable(
366 thread_t thread);
367
368 #endif /*DEBUG*/
369
370 /*
371 * State machine
372 *
373 * states are combinations of:
374 * R running
375 * W waiting (or on wait queue)
376 * N non-interruptible
377 * O swapped out
378 * I being swapped in
379 *
380 * init          action
381 *               assert_wait   thread_block   clear_wait    swapout   swapin
382 *
383 * R             RW, RWN       R; setrun      -             -
384 * RN            RWN           RN; setrun     -             -
385 *
386 * RW                          W              R             -
387 * RWN                         WN             RN            -
388 *
389 * W                                          R; setrun     WO
390 * WN                                         RN; setrun    -
391 *
392 * RO                                         -             -         R
393 *
394 */
395
396 #if defined(CONFIG_SCHED_TRADITIONAL)
397 int8_t sched_load_shifts[NRQS];
398 int sched_preempt_pri[NRQBM];
399 #endif
400
401
402 #if defined(CONFIG_SCHED_TRADITIONAL)
403
404 const struct sched_dispatch_table sched_traditional_dispatch = {
405 sched_traditional_init,
406 sched_traditional_timebase_init,
407 sched_traditional_processor_init,
408 sched_traditional_pset_init,
409 sched_traditional_maintenance_continue,
410 sched_traditional_choose_thread,
411 steal_thread,
412 compute_priority,
413 choose_processor,
414 processor_enqueue,
415 processor_queue_shutdown,
416 processor_queue_remove,
417 processor_queue_empty,
418 priority_is_urgent,
419 processor_csw_check,
420 processor_queue_has_priority,
421 sched_traditional_initial_quantum_size,
422 sched_traditional_initial_thread_sched_mode,
423 sched_traditional_supports_timeshare_mode,
424 can_update_priority,
425 update_priority,
426 lightweight_update_priority,
427 sched_traditional_quantum_expire,
428 should_current_thread_rechoose_processor,
429 sched_traditional_processor_runq_count,
430 sched_traditional_processor_runq_stats_count_sum,
431 sched_traditional_fairshare_init,
432 sched_traditional_fairshare_runq_count,
433 sched_traditional_fairshare_runq_stats_count_sum,
434 sched_traditional_fairshare_enqueue,
435 sched_traditional_fairshare_dequeue,
436 sched_traditional_fairshare_queue_remove,
437 TRUE /* direct_dispatch_to_idle_processors */
438 };
439
440 const struct sched_dispatch_table sched_traditional_with_pset_runqueue_dispatch = {
441 sched_traditional_with_pset_runqueue_init,
442 sched_traditional_timebase_init,
443 sched_traditional_processor_init,
444 sched_traditional_pset_init,
445 sched_traditional_maintenance_continue,
446 sched_traditional_choose_thread,
447 steal_thread,
448 compute_priority,
449 choose_processor,
450 processor_enqueue,
451 processor_queue_shutdown,
452 processor_queue_remove,
453 sched_traditional_with_pset_runqueue_processor_queue_empty,
454 priority_is_urgent,
455 processor_csw_check,
456 processor_queue_has_priority,
457 sched_traditional_initial_quantum_size,
458 sched_traditional_initial_thread_sched_mode,
459 sched_traditional_supports_timeshare_mode,
460 can_update_priority,
461 update_priority,
462 lightweight_update_priority,
463 sched_traditional_quantum_expire,
464 should_current_thread_rechoose_processor,
465 sched_traditional_processor_runq_count,
466 sched_traditional_with_pset_runqueue_processor_runq_stats_count_sum,
467 sched_traditional_fairshare_init,
468 sched_traditional_fairshare_runq_count,
469 sched_traditional_fairshare_runq_stats_count_sum,
470 sched_traditional_fairshare_enqueue,
471 sched_traditional_fairshare_dequeue,
472 sched_traditional_fairshare_queue_remove,
473 FALSE /* direct_dispatch_to_idle_processors */
474 };
475
476 #endif
477
478 const struct sched_dispatch_table *sched_current_dispatch = NULL;
479
480 /*
481 * Statically allocate a buffer to hold the longest possible
482 * scheduler description string, as currently implemented.
483 * bsd/kern/kern_sysctl.c has a corresponding definition in bsd/
484 * to export to userspace via sysctl(3). If either version
485 * changes, update the other.
486 *
487 * Note that in addition to being an upper bound on the strings
488 * in the kernel, it's also an exact parameter to PE_get_default(),
489 * which interrogates the device tree on some platforms. That
490 * API requires the caller know the exact size of the device tree
491 * property, so we need both a legacy size (32) and the current size
492 * (48) to deal with old and new device trees. The device tree property
493 * is similarly padded to a fixed size so that the same kernel image
494 * can run on multiple devices with different schedulers configured
495 * in the device tree.
496 */
497 #define SCHED_STRING_MAX_LENGTH (48)
498
499 char sched_string[SCHED_STRING_MAX_LENGTH];
500 static enum sched_enum _sched_enum __attribute__((used)) = sched_enum_unknown;
501
502 /* Global flag which indicates whether Background Stepper Context is enabled */
503 static int cpu_throttle_enabled = 1;
504
505 void
506 sched_init(void)
507 {
508 char sched_arg[SCHED_STRING_MAX_LENGTH] = { '\0' };
509
510 /* Check for runtime selection of the scheduler algorithm */
511 if (!PE_parse_boot_argn("sched", sched_arg, sizeof (sched_arg))) {
512 /* If no boot-args override, look in device tree */
513 if (!PE_get_default("kern.sched", sched_arg,
514 SCHED_STRING_MAX_LENGTH)) {
515 sched_arg[0] = '\0';
516 }
517 }
518
519 if (strlen(sched_arg) > 0) {
520 if (0) {
521 /* Allow pattern below */
522 #if defined(CONFIG_SCHED_TRADITIONAL)
523 } else if (0 == strcmp(sched_arg, kSchedTraditionalString)) {
524 sched_current_dispatch = &sched_traditional_dispatch;
525 _sched_enum = sched_enum_traditional;
526 strlcpy(sched_string, kSchedTraditionalString, sizeof(sched_string));
527 kprintf("Scheduler: Runtime selection of %s\n", kSchedTraditionalString);
528 } else if (0 == strcmp(sched_arg, kSchedTraditionalWithPsetRunqueueString)) {
529 sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
530 _sched_enum = sched_enum_traditional_with_pset_runqueue;
531 strlcpy(sched_string, kSchedTraditionalWithPsetRunqueueString, sizeof(sched_string));
532 kprintf("Scheduler: Runtime selection of %s\n", kSchedTraditionalWithPsetRunqueueString);
533 #endif
534 #if defined(CONFIG_SCHED_PROTO)
535 } else if (0 == strcmp(sched_arg, kSchedProtoString)) {
536 sched_current_dispatch = &sched_proto_dispatch;
537 _sched_enum = sched_enum_proto;
538 strlcpy(sched_string, kSchedProtoString, sizeof(sched_string));
539 kprintf("Scheduler: Runtime selection of %s\n", kSchedProtoString);
540 #endif
541 #if defined(CONFIG_SCHED_GRRR)
542 } else if (0 == strcmp(sched_arg, kSchedGRRRString)) {
543 sched_current_dispatch = &sched_grrr_dispatch;
544 _sched_enum = sched_enum_grrr;
545 strlcpy(sched_string, kSchedGRRRString, sizeof(sched_string));
546 kprintf("Scheduler: Runtime selection of %s\n", kSchedGRRRString);
547 #endif
548 #if defined(CONFIG_SCHED_FIXEDPRIORITY)
549 } else if (0 == strcmp(sched_arg, kSchedFixedPriorityString)) {
550 sched_current_dispatch = &sched_fixedpriority_dispatch;
551 _sched_enum = sched_enum_fixedpriority;
552 strlcpy(sched_string, kSchedFixedPriorityString, sizeof(sched_string));
553 kprintf("Scheduler: Runtime selection of %s\n", kSchedFixedPriorityString);
554 } else if (0 == strcmp(sched_arg, kSchedFixedPriorityWithPsetRunqueueString)) {
555 sched_current_dispatch = &sched_fixedpriority_with_pset_runqueue_dispatch;
556 _sched_enum = sched_enum_fixedpriority_with_pset_runqueue;
557 strlcpy(sched_string, kSchedFixedPriorityWithPsetRunqueueString, sizeof(sched_string));
558 kprintf("Scheduler: Runtime selection of %s\n", kSchedFixedPriorityWithPsetRunqueueString);
559 #endif
560 } else {
561 panic("Unrecognized scheduler algorithm: %s", sched_arg);
562 }
563 } else {
564 #if defined(CONFIG_SCHED_TRADITIONAL)
565 sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
566 _sched_enum = sched_enum_traditional_with_pset_runqueue;
567 strlcpy(sched_string, kSchedTraditionalWithPsetRunqueueString, sizeof(sched_string));
568 kprintf("Scheduler: Default of %s\n", kSchedTraditionalWithPsetRunqueueString);
569 #elif defined(CONFIG_SCHED_PROTO)
570 sched_current_dispatch = &sched_proto_dispatch;
571 _sched_enum = sched_enum_proto;
572 strlcpy(sched_string, kSchedProtoString, sizeof(sched_string));
573 kprintf("Scheduler: Default of %s\n", kSchedProtoString);
574 #elif defined(CONFIG_SCHED_GRRR)
575 sched_current_dispatch = &sched_grrr_dispatch;
576 _sched_enum = sched_enum_grrr;
577 strlcpy(sched_string, kSchedGRRRString, sizeof(sched_string));
578 kprintf("Scheduler: Default of %s\n", kSchedGRRRString);
579 #elif defined(CONFIG_SCHED_FIXEDPRIORITY)
580 sched_current_dispatch = &sched_fixedpriority_dispatch;
581 _sched_enum = sched_enum_fixedpriority;
582 strlcpy(sched_string, kSchedFixedPriorityString, sizeof(sched_string));
583 kprintf("Scheduler: Default of %s\n", kSchedFixedPriorityString);
584 #else
585 #error No default scheduler implementation
586 #endif
587 }
588
589 SCHED(init)();
590 SCHED(fairshare_init)();
591 sched_realtime_init();
592 ast_init();
593 sched_timer_deadline_tracking_init();
594
595 SCHED(pset_init)(&pset0);
596 SCHED(processor_init)(master_processor);
597 }
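/*
 * Usage example: booting with a "sched=<name>" boot-arg, where <name>
 * matches one of the kSched*String constants compared above, selects the
 * corresponding dispatch table; the same string may instead be supplied
 * through the "kern.sched" device tree property when no boot-arg is given.
 * Any other value causes sched_init() to panic.
 */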
598
599 void
600 sched_timebase_init(void)
601 {
602 uint64_t abstime;
603
604 clock_interval_to_absolutetime_interval(1, NSEC_PER_SEC, &abstime);
605 sched_one_second_interval = abstime;
606
607 SCHED(timebase_init)();
608 sched_realtime_timebase_init();
609 }
610
611 #if defined(CONFIG_SCHED_TRADITIONAL)
612
613 static void
614 sched_traditional_init(void)
615 {
616 /*
617 * Calculate the timeslicing quantum
618 * in us.
619 */
620 if (default_preemption_rate < 1)
621 default_preemption_rate = DEFAULT_PREEMPTION_RATE;
622 std_quantum_us = (1000 * 1000) / default_preemption_rate;
623
624 printf("standard timeslicing quantum is %d us\n", std_quantum_us);
625
626 if (default_bg_preemption_rate < 1)
627 default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
628 bg_quantum_us = (1000 * 1000) / default_bg_preemption_rate;
629
630 printf("standard background quantum is %d us\n", bg_quantum_us);
631
632 load_shift_init();
633 preempt_pri_init();
634 sched_tick = 0;
635 }
636
637 static void
638 sched_traditional_timebase_init(void)
639 {
640 uint64_t abstime;
641 uint32_t shift;
642
643 /* standard timeslicing quantum */
644 clock_interval_to_absolutetime_interval(
645 std_quantum_us, NSEC_PER_USEC, &abstime);
646 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
647 std_quantum = (uint32_t)abstime;
648
649 /* smallest remaining quantum (250 us) */
650 clock_interval_to_absolutetime_interval(250, NSEC_PER_USEC, &abstime);
651 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
652 min_std_quantum = (uint32_t)abstime;
653
654 /* quantum for background tasks */
655 clock_interval_to_absolutetime_interval(
656 bg_quantum_us, NSEC_PER_USEC, &abstime);
657 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
658 bg_quantum = (uint32_t)abstime;
659
660 /* scheduler tick interval */
661 clock_interval_to_absolutetime_interval(USEC_PER_SEC >> SCHED_TICK_SHIFT,
662 NSEC_PER_USEC, &abstime);
663 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
664 sched_tick_interval = (uint32_t)abstime;
665
666 /*
667 * Compute conversion factor from usage to
668 * timesharing priorities with 5/8 ** n aging.
669 */
670 abstime = (abstime * 5) / 3;
671 for (shift = 0; abstime > BASEPRI_DEFAULT; ++shift)
672 abstime >>= 1;
673 sched_fixed_shift = shift;
674
675 max_unsafe_computation = max_unsafe_quanta * std_quantum;
676 sched_safe_duration = 2 * max_unsafe_quanta * std_quantum;
677
678 max_poll_computation = max_poll_quanta * std_quantum;
679 thread_depress_time = 1 * std_quantum;
680 default_timeshare_computation = std_quantum / 2;
681 default_timeshare_constraint = std_quantum;
682
683 }
684
685 static void
686 sched_traditional_processor_init(processor_t processor)
687 {
688 if (!sched_traditional_use_pset_runqueue) {
689 run_queue_init(&processor->runq);
690 }
691 processor->runq_bound_count = 0;
692 }
693
694 static void
695 sched_traditional_pset_init(processor_set_t pset)
696 {
697 if (sched_traditional_use_pset_runqueue) {
698 run_queue_init(&pset->pset_runq);
699 }
700 pset->pset_runq_bound_count = 0;
701 }
702
703 static void
704 sched_traditional_with_pset_runqueue_init(void)
705 {
706 sched_traditional_init();
707 sched_traditional_use_pset_runqueue = TRUE;
708 }
709
710 #endif /* CONFIG_SCHED_TRADITIONAL */
711
712 #if defined(CONFIG_SCHED_TRADITIONAL) || defined(CONFIG_SCHED_PROTO) || defined(CONFIG_SCHED_GRRR) || defined(CONFIG_SCHED_FIXEDPRIORITY)
713 void
714 sched_traditional_fairshare_init(void)
715 {
716 simple_lock_init(&fs_lock, 0);
717
718 fs_runq.count = 0;
719 queue_init(&fs_runq.queue);
720 }
721 #endif
722
723 static void
724 sched_realtime_init(void)
725 {
726 simple_lock_init(&rt_lock, 0);
727
728 rt_runq.count = 0;
729 queue_init(&rt_runq.queue);
730 }
731
732 static void
733 sched_realtime_timebase_init(void)
734 {
735 uint64_t abstime;
736
737 /* smallest rt computation (50 us) */
738 clock_interval_to_absolutetime_interval(50, NSEC_PER_USEC, &abstime);
739 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
740 min_rt_quantum = (uint32_t)abstime;
741
742 /* maximum rt computation (50 ms) */
743 clock_interval_to_absolutetime_interval(
744 50, 1000*NSEC_PER_USEC, &abstime);
745 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
746 max_rt_quantum = (uint32_t)abstime;
747
748 }
749
750 #if defined(CONFIG_SCHED_TRADITIONAL)
751
752 /*
753 * Set up values for timeshare
754 * loading factors.
755 */
756 static void
757 load_shift_init(void)
758 {
759 int8_t k, *p = sched_load_shifts;
760 uint32_t i, j;
761
762 uint32_t sched_decay_penalty = 1;
763
764 if (PE_parse_boot_argn("sched_decay_penalty", &sched_decay_penalty, sizeof (sched_decay_penalty))) {
765 kprintf("Overriding scheduler decay penalty %u\n", sched_decay_penalty);
766 }
767
768 if (PE_parse_boot_argn("sched_decay_usage_age_factor", &sched_decay_usage_age_factor, sizeof (sched_decay_usage_age_factor))) {
769 kprintf("Overriding scheduler decay usage age factor %u\n", sched_decay_usage_age_factor);
770 }
771
772 if (PE_parse_boot_argn("sched_use_combined_fgbg_decay", &sched_use_combined_fgbg_decay, sizeof (sched_use_combined_fgbg_decay))) {
773 kprintf("Overriding schedule fg/bg decay calculation: %u\n", sched_use_combined_fgbg_decay);
774 }
775
776 if (sched_decay_penalty == 0) {
777 /*
778 * There is no penalty for timeshare threads for using too much
779 * CPU, so set all load shifts to INT8_MIN. Even under high load,
780 * sched_pri_shift will be >INT8_MAX, and there will be no
781 * penalty applied to threads (nor will sched_usage be updated per
782 * thread).
783 */
784 for (i = 0; i < NRQS; i++) {
785 sched_load_shifts[i] = INT8_MIN;
786 }
787
788 return;
789 }
790
791 *p++ = INT8_MIN; *p++ = 0;
792
793 /*
794 * For a given system load "i", the per-thread priority
795 * penalty per quantum of CPU usage is ~2^k priority
796 * levels. "sched_decay_penalty" can cause more
797 * array entries to be filled with smaller "k" values
798 */
799 for (i = 2, j = 1 << sched_decay_penalty, k = 1; i < NRQS; ++k) {
800 for (j <<= 1; (i < j) && (i < NRQS); ++i)
801 *p++ = k;
802 }
803 }
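/*
 * Worked example (assuming the default sched_decay_penalty of 1): the loop
 * above fills sched_load_shifts[i] with k = floor(log2(i)) for i >= 2,
 * i.e. loads 2-3 map to a shift of 1, 4-7 to 2, 8-15 to 3, and so on,
 * producing the ~2^k priority-level penalty per quantum described above.
 */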
804
805 static void
806 preempt_pri_init(void)
807 {
808 int i, *p = sched_preempt_pri;
809
810 for (i = BASEPRI_FOREGROUND; i < MINPRI_KERNEL; ++i)
811 setbit(i, p);
812
813 for (i = BASEPRI_PREEMPT; i <= MAXPRI; ++i)
814 setbit(i, p);
815 }
816
817 #endif /* CONFIG_SCHED_TRADITIONAL */
818
819 /*
820 * Thread wait timer expiration.
821 */
822 void
823 thread_timer_expire(
824 void *p0,
825 __unused void *p1)
826 {
827 thread_t thread = p0;
828 spl_t s;
829
830 s = splsched();
831 thread_lock(thread);
832 if (--thread->wait_timer_active == 0) {
833 if (thread->wait_timer_is_set) {
834 thread->wait_timer_is_set = FALSE;
835 clear_wait_internal(thread, THREAD_TIMED_OUT);
836 }
837 }
838 thread_unlock(thread);
839 splx(s);
840 }
841
842 /*
843 * thread_unblock:
844 *
845 * Unblock thread on wake up.
846 *
847 * Returns TRUE if the thread is still running.
848 *
849 * Thread must be locked.
850 */
851 boolean_t
852 thread_unblock(
853 thread_t thread,
854 wait_result_t wresult)
855 {
856 boolean_t result = FALSE;
857 thread_t cthread = current_thread();
858
859 /*
860 * Set wait_result.
861 */
862 thread->wait_result = wresult;
863
864 /*
865 * Cancel pending wait timer.
866 */
867 if (thread->wait_timer_is_set) {
868 if (timer_call_cancel(&thread->wait_timer))
869 thread->wait_timer_active--;
870 thread->wait_timer_is_set = FALSE;
871 }
872
873 /*
874 * Update scheduling state: not waiting,
875 * set running.
876 */
877 thread->state &= ~(TH_WAIT|TH_UNINT);
878
879 if (!(thread->state & TH_RUN)) {
880 thread->state |= TH_RUN;
881
882 (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
883
884 /*
885 * Update run counts.
886 */
887 sched_run_incr();
888 if (thread->sched_mode == TH_MODE_TIMESHARE) {
889 sched_share_incr();
890
891 if (thread->max_priority <= MAXPRI_THROTTLE)
892 sched_background_incr();
893 }
894 }
895 else {
896 /*
897 * Signal if idling on another processor.
898 */
899 #if CONFIG_SCHED_IDLE_IN_PLACE
900 if (thread->state & TH_IDLE) {
901 processor_t processor = thread->last_processor;
902
903 if (processor != current_processor())
904 machine_signal_idle(processor);
905 }
906 #else
907 assert((thread->state & TH_IDLE) == 0);
908 #endif
909
910 result = TRUE;
911 }
912
913 /*
914 * Calculate deadline for real-time threads.
915 */
916 if (thread->sched_mode == TH_MODE_REALTIME) {
917 thread->realtime.deadline = thread->realtime.constraint + mach_absolute_time();
918 }
919
920 /*
921 * Clear old quantum, fail-safe computation, etc.
922 */
923 thread->current_quantum = 0;
924 thread->computation_metered = 0;
925 thread->reason = AST_NONE;
926
927 /* Obtain power-relevant interrupt and "platform-idle exit" statistics.
928 * We also account for "double hop" thread signaling via
929 * the thread callout infrastructure.
930 * DRK: consider removing the callout wakeup counters in the future;
931 * they're present for verification at the moment.
932 */
933 boolean_t aticontext, pidle;
934 ml_get_power_state(&aticontext, &pidle);
935
936 if (__improbable(aticontext && !(thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT))) {
937 ledger_credit(thread->t_ledger, task_ledgers.interrupt_wakeups, 1);
938 DTRACE_SCHED2(iwakeup, struct thread *, thread, struct proc *, thread->task->bsd_info);
939
940 uint64_t ttd = PROCESSOR_DATA(current_processor(), timer_call_ttd);
941
942 if (ttd) {
943 if (ttd <= timer_deadline_tracking_bin_1)
944 thread->thread_timer_wakeups_bin_1++;
945 else
946 if (ttd <= timer_deadline_tracking_bin_2)
947 thread->thread_timer_wakeups_bin_2++;
948 }
949
950 if (pidle) {
951 ledger_credit(thread->t_ledger, task_ledgers.platform_idle_wakeups, 1);
952 }
953
954 } else if (thread_get_tag_internal(cthread) & THREAD_TAG_CALLOUT) {
955 if (cthread->callout_woken_from_icontext) {
956 ledger_credit(thread->t_ledger, task_ledgers.interrupt_wakeups, 1);
957 thread->thread_callout_interrupt_wakeups++;
958 if (cthread->callout_woken_from_platform_idle) {
959 ledger_credit(thread->t_ledger, task_ledgers.platform_idle_wakeups, 1);
960 thread->thread_callout_platform_idle_wakeups++;
961 }
962
963 cthread->callout_woke_thread = TRUE;
964 }
965 }
966
967 if (thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT) {
968 thread->callout_woken_from_icontext = aticontext;
969 thread->callout_woken_from_platform_idle = pidle;
970 thread->callout_woke_thread = FALSE;
971 }
972
973 /* Event should only be triggered if thread is not already running */
974 if (result == FALSE) {
975 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
976 MACHDBG_CODE(DBG_MACH_SCHED,MACH_MAKE_RUNNABLE) | DBG_FUNC_NONE,
977 (uintptr_t)thread_tid(thread), thread->sched_pri, thread->wait_result, 0, 0);
978 }
979
980 DTRACE_SCHED2(wakeup, struct thread *, thread, struct proc *, thread->task->bsd_info);
981
982 return (result);
983 }
984
985 /*
986 * Routine: thread_go
987 * Purpose:
988 * Unblock and dispatch thread.
989 * Conditions:
990 * thread lock held, IPC locks may be held.
991 * thread must have been pulled from wait queue under same lock hold.
992 * Returns:
993 * KERN_SUCCESS - Thread was set running
994 * KERN_NOT_WAITING - Thread was not waiting
995 */
996 kern_return_t
997 thread_go(
998 thread_t thread,
999 wait_result_t wresult)
1000 {
1001 assert(thread->at_safe_point == FALSE);
1002 assert(thread->wait_event == NO_EVENT64);
1003 assert(thread->wait_queue == WAIT_QUEUE_NULL);
1004
1005 if ((thread->state & (TH_WAIT|TH_TERMINATE)) == TH_WAIT) {
1006 if (!thread_unblock(thread, wresult))
1007 thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
1008
1009 return (KERN_SUCCESS);
1010 }
1011
1012 return (KERN_NOT_WAITING);
1013 }
1014
1015 /*
1016 * Routine: thread_mark_wait_locked
1017 * Purpose:
1018 * Mark a thread as waiting. If, given the circumstances,
1019 * it doesn't want to wait (i.e. already aborted), then
1020 * indicate that in the return value.
1021 * Conditions:
1022 * at splsched() and thread is locked.
1023 */
1024 __private_extern__
1025 wait_result_t
1026 thread_mark_wait_locked(
1027 thread_t thread,
1028 wait_interrupt_t interruptible)
1029 {
1030 boolean_t at_safe_point;
1031
1032 assert(thread == current_thread());
1033
1034 /*
1035 * The thread may have certain types of interrupts/aborts masked
1036 * off. Even if the wait location says these types of interrupts
1037 * are OK, we have to honor mask settings (outer-scoped code may
1038 * not be able to handle aborts at the moment).
1039 */
1040 if (interruptible > (thread->options & TH_OPT_INTMASK))
1041 interruptible = thread->options & TH_OPT_INTMASK;
1042
1043 at_safe_point = (interruptible == THREAD_ABORTSAFE);
1044
1045 if ( interruptible == THREAD_UNINT ||
1046 !(thread->sched_flags & TH_SFLAG_ABORT) ||
1047 (!at_safe_point &&
1048 (thread->sched_flags & TH_SFLAG_ABORTSAFELY))) {
1049
1050 if ( !(thread->state & TH_TERMINATE))
1051 DTRACE_SCHED(sleep);
1052
1053 thread->state |= (interruptible) ? TH_WAIT : (TH_WAIT | TH_UNINT);
1054 thread->at_safe_point = at_safe_point;
1055 return (thread->wait_result = THREAD_WAITING);
1056 }
1057 else
1058 if (thread->sched_flags & TH_SFLAG_ABORTSAFELY)
1059 thread->sched_flags &= ~TH_SFLAG_ABORTED_MASK;
1060
1061 return (thread->wait_result = THREAD_INTERRUPTED);
1062 }
1063
1064 /*
1065 * Routine: thread_interrupt_level
1066 * Purpose:
1067 * Set the maximum interruptible state for the
1068 * current thread. The effective value of any
1069 * interruptible flag passed into assert_wait
1070 * will never exceed this.
1071 *
1072 * Useful for code that must not be interrupted,
1073 * but which calls code that doesn't know that.
1074 * Returns:
1075 * The old interrupt level for the thread.
1076 */
1077 __private_extern__
1078 wait_interrupt_t
1079 thread_interrupt_level(
1080 wait_interrupt_t new_level)
1081 {
1082 thread_t thread = current_thread();
1083 wait_interrupt_t result = thread->options & TH_OPT_INTMASK;
1084
1085 thread->options = (thread->options & ~TH_OPT_INTMASK) | (new_level & TH_OPT_INTMASK);
1086
1087 return result;
1088 }
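/*
 * Sketch of typical use: code that must not be interrupted raises the level
 * around the critical region and restores it afterwards, e.g.
 *
 *	wait_interrupt_t prev = thread_interrupt_level(THREAD_UNINT);
 *	... code that may indirectly call assert_wait() ...
 *	thread_interrupt_level(prev);
 *
 * Any interruptible argument passed to assert_wait() in between is then
 * clamped by thread_mark_wait_locked() to THREAD_UNINT.
 */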
1089
1090 /*
1091 * Check to see if an assert wait is possible, without actually doing one.
1092 * This is used by debug code in locks and elsewhere to verify that it is
1093 * always OK to block when trying to take a blocking lock (since waiting
1094 * for the actual assert_wait to catch the case may make it hard to detect
1095 * this case).
1096 */
1097 boolean_t
1098 assert_wait_possible(void)
1099 {
1100
1101 thread_t thread;
1102
1103 #if DEBUG
1104 if(debug_mode) return TRUE; /* Always succeed in debug mode */
1105 #endif
1106
1107 thread = current_thread();
1108
1109 return (thread == NULL || wait_queue_assert_possible(thread));
1110 }
1111
1112 /*
1113 * assert_wait:
1114 *
1115 * Assert that the current thread is about to go to
1116 * sleep until the specified event occurs.
1117 */
1118 wait_result_t
1119 assert_wait(
1120 event_t event,
1121 wait_interrupt_t interruptible)
1122 {
1123 register wait_queue_t wq;
1124 register int index;
1125
1126 assert(event != NO_EVENT);
1127
1128 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1129 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
1130 VM_KERNEL_UNSLIDE(event), 0, 0, 0, 0);
1131
1132 index = wait_hash(event);
1133 wq = &wait_queues[index];
1134 return wait_queue_assert_wait(wq, event, interruptible, 0);
1135 }
1136
1137 wait_result_t
1138 assert_wait_timeout(
1139 event_t event,
1140 wait_interrupt_t interruptible,
1141 uint32_t interval,
1142 uint32_t scale_factor)
1143 {
1144 thread_t thread = current_thread();
1145 wait_result_t wresult;
1146 wait_queue_t wqueue;
1147 uint64_t deadline;
1148 spl_t s;
1149
1150 assert(event != NO_EVENT);
1151 wqueue = &wait_queues[wait_hash(event)];
1152
1153 s = splsched();
1154 wait_queue_lock(wqueue);
1155 thread_lock(thread);
1156
1157 clock_interval_to_deadline(interval, scale_factor, &deadline);
1158
1159 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1160 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
1161 VM_KERNEL_UNSLIDE(event), interruptible, deadline, 0, 0);
1162
1163 wresult = wait_queue_assert_wait64_locked(wqueue, CAST_DOWN(event64_t, event),
1164 interruptible,
1165 TIMEOUT_URGENCY_SYS_NORMAL,
1166 deadline, 0,
1167 thread);
1168
1169 thread_unlock(thread);
1170 wait_queue_unlock(wqueue);
1171 splx(s);
1172
1173 return (wresult);
1174 }
1175
1176 wait_result_t
1177 assert_wait_timeout_with_leeway(
1178 event_t event,
1179 wait_interrupt_t interruptible,
1180 wait_timeout_urgency_t urgency,
1181 uint32_t interval,
1182 uint32_t leeway,
1183 uint32_t scale_factor)
1184 {
1185 thread_t thread = current_thread();
1186 wait_result_t wresult;
1187 wait_queue_t wqueue;
1188 uint64_t deadline;
1189 uint64_t abstime;
1190 uint64_t slop;
1191 uint64_t now;
1192 spl_t s;
1193
1194 now = mach_absolute_time();
1195 clock_interval_to_absolutetime_interval(interval, scale_factor, &abstime);
1196 deadline = now + abstime;
1197
1198 clock_interval_to_absolutetime_interval(leeway, scale_factor, &slop);
1199
1200 assert(event != NO_EVENT);
1201 wqueue = &wait_queues[wait_hash(event)];
1202
1203 s = splsched();
1204 wait_queue_lock(wqueue);
1205 thread_lock(thread);
1206
1207 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1208 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
1209 VM_KERNEL_UNSLIDE(event), interruptible, deadline, 0, 0);
1210
1211 wresult = wait_queue_assert_wait64_locked(wqueue, CAST_DOWN(event64_t, event),
1212 interruptible,
1213 urgency, deadline, slop,
1214 thread);
1215
1216 thread_unlock(thread);
1217 wait_queue_unlock(wqueue);
1218 splx(s);
1219
1220 return (wresult);
1221 }
1222
1223 wait_result_t
1224 assert_wait_deadline(
1225 event_t event,
1226 wait_interrupt_t interruptible,
1227 uint64_t deadline)
1228 {
1229 thread_t thread = current_thread();
1230 wait_result_t wresult;
1231 wait_queue_t wqueue;
1232 spl_t s;
1233
1234 assert(event != NO_EVENT);
1235 wqueue = &wait_queues[wait_hash(event)];
1236
1237 s = splsched();
1238 wait_queue_lock(wqueue);
1239 thread_lock(thread);
1240
1241 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1242 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
1243 VM_KERNEL_UNSLIDE(event), interruptible, deadline, 0, 0);
1244
1245 wresult = wait_queue_assert_wait64_locked(wqueue, CAST_DOWN(event64_t,event),
1246 interruptible,
1247 TIMEOUT_URGENCY_SYS_NORMAL, deadline, 0,
1248 thread);
1249
1250 thread_unlock(thread);
1251 wait_queue_unlock(wqueue);
1252 splx(s);
1253
1254 return (wresult);
1255 }
1256
1257 wait_result_t
1258 assert_wait_deadline_with_leeway(
1259 event_t event,
1260 wait_interrupt_t interruptible,
1261 wait_timeout_urgency_t urgency,
1262 uint64_t deadline,
1263 uint64_t leeway)
1264 {
1265 thread_t thread = current_thread();
1266 wait_result_t wresult;
1267 wait_queue_t wqueue;
1268 spl_t s;
1269
1270 assert(event != NO_EVENT);
1271 wqueue = &wait_queues[wait_hash(event)];
1272
1273 s = splsched();
1274 wait_queue_lock(wqueue);
1275 thread_lock(thread);
1276
1277 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1278 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
1279 VM_KERNEL_UNSLIDE(event), interruptible, deadline, 0, 0);
1280
1281 wresult = wait_queue_assert_wait64_locked(wqueue, CAST_DOWN(event64_t,event),
1282 interruptible,
1283 urgency, deadline, leeway,
1284 thread);
1285
1286 thread_unlock(thread);
1287 wait_queue_unlock(wqueue);
1288 splx(s);
1289
1290 return (wresult);
1291 }
1292
1293 /*
1294 * thread_sleep_fast_usimple_lock:
1295 *
1296 * Cause the current thread to wait until the specified event
1297 * occurs. The specified simple_lock is unlocked before releasing
1298 * the cpu and re-acquired as part of waking up.
1299 *
1300 * This is the simple lock sleep interface for components that use a
1301 * faster version of simple_lock() than is provided by usimple_lock().
1302 */
1303 __private_extern__ wait_result_t
1304 thread_sleep_fast_usimple_lock(
1305 event_t event,
1306 simple_lock_t lock,
1307 wait_interrupt_t interruptible)
1308 {
1309 wait_result_t res;
1310
1311 res = assert_wait(event, interruptible);
1312 if (res == THREAD_WAITING) {
1313 simple_unlock(lock);
1314 res = thread_block(THREAD_CONTINUE_NULL);
1315 simple_lock(lock);
1316 }
1317 return res;
1318 }
1319
1320
1321 /*
1322 * thread_sleep_usimple_lock:
1323 *
1324 * Cause the current thread to wait until the specified event
1325 * occurs. The specified usimple_lock is unlocked before releasing
1326 * the cpu and re-acquired as part of waking up.
1327 *
1328 * This is the simple lock sleep interface for components where
1329 * simple_lock() is defined in terms of usimple_lock().
1330 */
1331 wait_result_t
1332 thread_sleep_usimple_lock(
1333 event_t event,
1334 usimple_lock_t lock,
1335 wait_interrupt_t interruptible)
1336 {
1337 wait_result_t res;
1338
1339 res = assert_wait(event, interruptible);
1340 if (res == THREAD_WAITING) {
1341 usimple_unlock(lock);
1342 res = thread_block(THREAD_CONTINUE_NULL);
1343 usimple_lock(lock);
1344 }
1345 return res;
1346 }
1347
1348 /*
1349 * thread_sleep_lock_write:
1350 *
1351 * Cause the current thread to wait until the specified event
1352 * occurs. The specified (write) lock is unlocked before releasing
1353 * the cpu. The (write) lock will be re-acquired before returning.
1354 */
1355 wait_result_t
1356 thread_sleep_lock_write(
1357 event_t event,
1358 lock_t *lock,
1359 wait_interrupt_t interruptible)
1360 {
1361 wait_result_t res;
1362
1363 res = assert_wait(event, interruptible);
1364 if (res == THREAD_WAITING) {
1365 lock_write_done(lock);
1366 res = thread_block(THREAD_CONTINUE_NULL);
1367 lock_write(lock);
1368 }
1369 return res;
1370 }
1371
1372 /*
1373 * thread_isoncpu:
1374 *
1375 * Return TRUE if a thread is running on a processor such that an AST
1376 * is needed to pull it out of userspace execution, or if executing in
1377 * the kernel, bring to a context switch boundary that would cause
1378 * thread state to be serialized in the thread PCB.
1379 *
1380 * Thread locked, returns the same way. While locked, fields
1381 * like "state" and "runq" cannot change.
1382 */
1383 static inline boolean_t
1384 thread_isoncpu(thread_t thread)
1385 {
1386 /* Not running or runnable */
1387 if (!(thread->state & TH_RUN))
1388 return (FALSE);
1389
1390 /* Waiting on a runqueue, not currently running */
1391 if (thread->runq != PROCESSOR_NULL)
1392 return (FALSE);
1393
1394 /*
1395 * Thread must be running on a processor, or
1396 * about to run, or just did run. In all these
1397 * cases, an AST to the processor is needed
1398 * to guarantee that the thread is kicked out
1399 * of userspace and the processor has
1400 * context switched (and saved register state).
1401 */
1402 return (TRUE);
1403 }
1404
1405 /*
1406 * thread_stop:
1407 *
1408 * Force a preemption point for a thread and wait
1409 * for it to stop running on a CPU. If a stronger
1410 * guarantee is requested, wait until no longer
1411 * runnable. Arbitrates access among
1412 * multiple stop requests. (released by unstop)
1413 *
1414 * The thread must enter a wait state and stop via a
1415 * separate means.
1416 *
1417 * Returns FALSE if interrupted.
1418 */
1419 boolean_t
1420 thread_stop(
1421 thread_t thread,
1422 boolean_t until_not_runnable)
1423 {
1424 wait_result_t wresult;
1425 spl_t s = splsched();
1426 boolean_t oncpu;
1427
1428 wake_lock(thread);
1429 thread_lock(thread);
1430
1431 while (thread->state & TH_SUSP) {
1432 thread->wake_active = TRUE;
1433 thread_unlock(thread);
1434
1435 wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
1436 wake_unlock(thread);
1437 splx(s);
1438
1439 if (wresult == THREAD_WAITING)
1440 wresult = thread_block(THREAD_CONTINUE_NULL);
1441
1442 if (wresult != THREAD_AWAKENED)
1443 return (FALSE);
1444
1445 s = splsched();
1446 wake_lock(thread);
1447 thread_lock(thread);
1448 }
1449
1450 thread->state |= TH_SUSP;
1451
1452 while ((oncpu = thread_isoncpu(thread)) ||
1453 (until_not_runnable && (thread->state & TH_RUN))) {
1454 processor_t processor;
1455
1456 if (oncpu) {
1457 assert(thread->state & TH_RUN);
1458 processor = thread->chosen_processor;
1459 cause_ast_check(processor);
1460 }
1461
1462 thread->wake_active = TRUE;
1463 thread_unlock(thread);
1464
1465 wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
1466 wake_unlock(thread);
1467 splx(s);
1468
1469 if (wresult == THREAD_WAITING)
1470 wresult = thread_block(THREAD_CONTINUE_NULL);
1471
1472 if (wresult != THREAD_AWAKENED) {
1473 thread_unstop(thread);
1474 return (FALSE);
1475 }
1476
1477 s = splsched();
1478 wake_lock(thread);
1479 thread_lock(thread);
1480 }
1481
1482 thread_unlock(thread);
1483 wake_unlock(thread);
1484 splx(s);
1485
1486 /*
1487 * We return with the thread unlocked. To prevent it from
1488 * transitioning to a runnable state (or from TH_RUN to
1489 * being on the CPU), the caller must ensure the thread
1490 * is stopped via an external means (such as an AST)
1491 */
1492
1493 return (TRUE);
1494 }
1495
1496 /*
1497 * thread_unstop:
1498 *
1499 * Release a previous stop request and set
1500 * the thread running if appropriate.
1501 *
1502 * Use only after a successful stop operation.
1503 */
1504 void
1505 thread_unstop(
1506 thread_t thread)
1507 {
1508 spl_t s = splsched();
1509
1510 wake_lock(thread);
1511 thread_lock(thread);
1512
1513 if ((thread->state & (TH_RUN|TH_WAIT|TH_SUSP)) == TH_SUSP) {
1514 thread->state &= ~TH_SUSP;
1515 thread_unblock(thread, THREAD_AWAKENED);
1516
1517 thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
1518 }
1519 else
1520 if (thread->state & TH_SUSP) {
1521 thread->state &= ~TH_SUSP;
1522
1523 if (thread->wake_active) {
1524 thread->wake_active = FALSE;
1525 thread_unlock(thread);
1526
1527 thread_wakeup(&thread->wake_active);
1528 wake_unlock(thread);
1529 splx(s);
1530
1531 return;
1532 }
1533 }
1534
1535 thread_unlock(thread);
1536 wake_unlock(thread);
1537 splx(s);
1538 }
1539
1540 /*
1541 * thread_wait:
1542 *
1543 * Wait for a thread to stop running. (non-interruptible)
1544 *
1545 */
1546 void
1547 thread_wait(
1548 thread_t thread,
1549 boolean_t until_not_runnable)
1550 {
1551 wait_result_t wresult;
1552 boolean_t oncpu;
1553 processor_t processor;
1554 spl_t s = splsched();
1555
1556 wake_lock(thread);
1557 thread_lock(thread);
1558
1559 /*
1560 * Wait until not running on a CPU. If stronger requirement
1561 * desired, wait until not runnable. Assumption: if thread is
1562 * on CPU, then TH_RUN is set, so we're not waiting in any case
1563 * where the original, pure "TH_RUN" check would have let us
1564 * finish.
1565 */
1566 while ((oncpu = thread_isoncpu(thread)) ||
1567 (until_not_runnable && (thread->state & TH_RUN))) {
1568
1569 if (oncpu) {
1570 assert(thread->state & TH_RUN);
1571 processor = thread->chosen_processor;
1572 cause_ast_check(processor);
1573 }
1574
1575 thread->wake_active = TRUE;
1576 thread_unlock(thread);
1577
1578 wresult = assert_wait(&thread->wake_active, THREAD_UNINT);
1579 wake_unlock(thread);
1580 splx(s);
1581
1582 if (wresult == THREAD_WAITING)
1583 thread_block(THREAD_CONTINUE_NULL);
1584
1585 s = splsched();
1586 wake_lock(thread);
1587 thread_lock(thread);
1588 }
1589
1590 thread_unlock(thread);
1591 wake_unlock(thread);
1592 splx(s);
1593 }
1594
1595 /*
1596 * Routine: clear_wait_internal
1597 *
1598 * Clear the wait condition for the specified thread.
1599 * Start the thread executing if that is appropriate.
1600 * Arguments:
1601 * thread thread to awaken
1602 * result Wakeup result the thread should see
1603 * Conditions:
1604 * At splsched
1605 * the thread is locked.
1606 * Returns:
1607 * KERN_SUCCESS thread was rousted out a wait
1608 * KERN_FAILURE thread was waiting but could not be rousted
1609 * KERN_NOT_WAITING thread was not waiting
1610 */
1611 __private_extern__ kern_return_t
1612 clear_wait_internal(
1613 thread_t thread,
1614 wait_result_t wresult)
1615 {
1616 wait_queue_t wq = thread->wait_queue;
1617 uint32_t i = LockTimeOut;
1618
1619 do {
1620 if (wresult == THREAD_INTERRUPTED && (thread->state & TH_UNINT))
1621 return (KERN_FAILURE);
1622
1623 if (wq != WAIT_QUEUE_NULL) {
1624 if (wait_queue_lock_try(wq)) {
1625 wait_queue_pull_thread_locked(wq, thread, TRUE);
1626 /* wait queue unlocked, thread still locked */
1627 }
1628 else {
1629 thread_unlock(thread);
1630 delay(1);
1631
1632 thread_lock(thread);
1633 if (wq != thread->wait_queue)
1634 return (KERN_NOT_WAITING);
1635
1636 continue;
1637 }
1638 }
1639
1640 return (thread_go(thread, wresult));
1641 } while ((--i > 0) || machine_timeout_suspended());
1642
1643 panic("clear_wait_internal: deadlock: thread=%p, wq=%p, cpu=%d\n",
1644 thread, wq, cpu_number());
1645
1646 return (KERN_FAILURE);
1647 }
1648
1649
1650 /*
1651 * clear_wait:
1652 *
1653 * Clear the wait condition for the specified thread. Start the thread
1654 * executing if that is appropriate.
1655 *
1656 * parameters:
1657 * thread thread to awaken
1658 * result Wakeup result the thread should see
1659 */
1660 kern_return_t
1661 clear_wait(
1662 thread_t thread,
1663 wait_result_t result)
1664 {
1665 kern_return_t ret;
1666 spl_t s;
1667
1668 s = splsched();
1669 thread_lock(thread);
1670 ret = clear_wait_internal(thread, result);
1671 thread_unlock(thread);
1672 splx(s);
1673 return ret;
1674 }
1675
1676
1677 /*
1678 * thread_wakeup_prim:
1679 *
1680 * Common routine for thread_wakeup, thread_wakeup_with_result,
1681 * and thread_wakeup_one.
1682 *
1683 */
1684 kern_return_t
1685 thread_wakeup_prim(
1686 event_t event,
1687 boolean_t one_thread,
1688 wait_result_t result)
1689 {
1690 return (thread_wakeup_prim_internal(event, one_thread, result, -1));
1691 }
1692
1693
1694 kern_return_t
1695 thread_wakeup_prim_internal(
1696 event_t event,
1697 boolean_t one_thread,
1698 wait_result_t result,
1699 int priority)
1700 {
1701 register wait_queue_t wq;
1702 register int index;
1703
1704 index = wait_hash(event);
1705 wq = &wait_queues[index];
1706 if (one_thread)
1707 return (wait_queue_wakeup_one(wq, event, result, priority));
1708 else
1709 return (wait_queue_wakeup_all(wq, event, result));
1710 }
1711
1712 /*
1713 * thread_bind:
1714 *
1715 * Force the current thread to execute on the specified processor.
1716 *
1717 * Returns the previous binding. PROCESSOR_NULL means
1718 * not bound.
1719 *
1720 * XXX - DO NOT export this to users - XXX
1721 */
1722 processor_t
1723 thread_bind(
1724 processor_t processor)
1725 {
1726 thread_t self = current_thread();
1727 processor_t prev;
1728 spl_t s;
1729
1730 s = splsched();
1731 thread_lock(self);
1732
1733 prev = self->bound_processor;
1734 self->bound_processor = processor;
1735
1736 thread_unlock(self);
1737 splx(s);
1738
1739 return (prev);
1740 }
1741
1742 /*
1743 * thread_select:
1744 *
1745 * Select a new thread for the current processor to execute.
1746 *
1747 * May select the current thread, which must be locked.
1748 */
1749 static thread_t
1750 thread_select(
1751 thread_t thread,
1752 processor_t processor)
1753 {
1754 processor_set_t pset = processor->processor_set;
1755 thread_t new_thread = THREAD_NULL;
1756 boolean_t inactive_state;
1757
1758 assert(processor == current_processor());
1759
1760 do {
1761 /*
1762 * Update the priority.
1763 */
1764 if (SCHED(can_update_priority)(thread))
1765 SCHED(update_priority)(thread);
1766
1767 processor->current_pri = thread->sched_pri;
1768 processor->current_thmode = thread->sched_mode;
1769
1770 pset_lock(pset);
1771
1772 assert(pset->low_count);
1773 assert(pset->low_pri);
1774
1775 if (processor->processor_meta != PROCESSOR_META_NULL && processor->processor_meta->primary != processor) {
1776 /*
1777 * Should this secondary SMT processor attempt to find work? For pset runqueue systems,
1778 * we should look for work only under the same conditions that choose_processor()
1779 * would have assigned work, which is when all primary processors have been assigned work.
1780 *
1781 * An exception is that bound threads are dispatched to a processor without going through
1782 * choose_processor(), so in those cases we should continue trying to dequeue work.
1783 */
1784 if (!processor->runq_bound_count && !queue_empty(&pset->idle_queue) && !rt_runq.count) {
1785 goto idle;
1786 }
1787 }
1788
1789 inactive_state = processor->state != PROCESSOR_SHUTDOWN && machine_processor_is_inactive(processor);
1790
1791 simple_lock(&rt_lock);
1792
1793 /*
1794 * Test to see if the current thread should continue
1795 * to run on this processor. Must be runnable, and not
1796 * bound to a different processor, nor be in the wrong
1797 * processor set.
1798 */
1799 if ( ((thread->state & ~TH_SUSP) == TH_RUN) &&
1800 (thread->sched_pri >= BASEPRI_RTQUEUES ||
1801 processor->processor_meta == PROCESSOR_META_NULL ||
1802 processor->processor_meta->primary == processor) &&
1803 (thread->bound_processor == PROCESSOR_NULL ||
1804 thread->bound_processor == processor) &&
1805 (thread->affinity_set == AFFINITY_SET_NULL ||
1806 thread->affinity_set->aset_pset == pset)) {
1807 if (thread->sched_pri >= BASEPRI_RTQUEUES &&
1808 first_timeslice(processor)) {
1809 if (rt_runq.count > 0) {
1810 register queue_t q;
1811
1812 q = &rt_runq.queue;
1813 if (((thread_t)q->next)->realtime.deadline <
1814 processor->deadline) {
1815 if ((((thread_t)q->next)->bound_processor == PROCESSOR_NULL) || (((thread_t)q->next)->bound_processor == processor)) {
1816 thread = (thread_t)dequeue_head(q);
1817 thread->runq = PROCESSOR_NULL;
1818 SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count);
1819 rt_runq.count--;
1820 }
1821 }
1822 }
1823
1824 simple_unlock(&rt_lock);
1825
1826 processor->deadline = thread->realtime.deadline;
1827
1828 pset_unlock(pset);
1829
1830 return (thread);
1831 }
1832
1833 if (!inactive_state && (thread->sched_mode != TH_MODE_FAIRSHARE || SCHED(fairshare_runq_count)() == 0) && (rt_runq.count == 0 || BASEPRI_RTQUEUES < thread->sched_pri) &&
1834 (new_thread = SCHED(choose_thread)(processor, thread->sched_mode == TH_MODE_FAIRSHARE ? MINPRI : thread->sched_pri)) == THREAD_NULL) {
1835
1836 simple_unlock(&rt_lock);
1837
1838 /* I am the highest priority runnable (non-idle) thread */
1839
1840 pset_pri_hint(pset, processor, processor->current_pri);
1841
1842 pset_count_hint(pset, processor, SCHED(processor_runq_count)(processor));
1843
1844 processor->deadline = UINT64_MAX;
1845
1846 pset_unlock(pset);
1847
1848 return (thread);
1849 }
1850 }
1851
1852 if (new_thread != THREAD_NULL ||
1853 (SCHED(processor_queue_has_priority)(processor, rt_runq.count == 0 ? IDLEPRI : BASEPRI_RTQUEUES, TRUE) &&
1854 (new_thread = SCHED(choose_thread)(processor, MINPRI)) != THREAD_NULL)) {
1855 simple_unlock(&rt_lock);
1856
1857 if (!inactive_state) {
1858 pset_pri_hint(pset, processor, new_thread->sched_pri);
1859
1860 pset_count_hint(pset, processor, SCHED(processor_runq_count)(processor));
1861 }
1862
1863 processor->deadline = UINT64_MAX;
1864 pset_unlock(pset);
1865
1866 return (new_thread);
1867 }
1868
1869 if (rt_runq.count > 0) {
1870 thread = (thread_t)dequeue_head(&rt_runq.queue);
1871
1872 if (__probable((thread->bound_processor == NULL || (thread->bound_processor == processor)))) {
1873 thread->runq = PROCESSOR_NULL;
1874 SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count);
1875 rt_runq.count--;
1876
1877 simple_unlock(&rt_lock);
1878
1879 processor->deadline = thread->realtime.deadline;
1880 pset_unlock(pset);
1881
1882 return (thread);
1883 } else {
1884 enqueue_head(&rt_runq.queue, (queue_entry_t)thread);
1885 }
1886 }
1887
1888 simple_unlock(&rt_lock);
1889
1890 /* No realtime threads and no normal threads on the per-processor
1891 * runqueue. Finally check for global fairshare threads.
1892 */
1893 if ((new_thread = SCHED(fairshare_dequeue)()) != THREAD_NULL) {
1894
1895 processor->deadline = UINT64_MAX;
1896 pset_unlock(pset);
1897
1898 return (new_thread);
1899 }
1900
1901 processor->deadline = UINT64_MAX;
1902
1903 /*
1904 * Set processor inactive based on
1905 * indication from the platform code.
1906 */
1907 if (inactive_state) {
1908 if (processor->state == PROCESSOR_RUNNING)
1909 remqueue((queue_entry_t)processor);
1910 else
1911 if (processor->state == PROCESSOR_IDLE)
1912 remqueue((queue_entry_t)processor);
1913
1914 processor->state = PROCESSOR_INACTIVE;
1915
1916 pset_unlock(pset);
1917
1918 return (processor->idle_thread);
1919 }
1920
1921 /*
1922 * No runnable threads, attempt to steal
1923 * from other processors.
1924 */
1925 new_thread = SCHED(steal_thread)(pset);
1926 if (new_thread != THREAD_NULL) {
1927 return (new_thread);
1928 }
1929
1930 /*
1931 * If other threads have appeared, shortcut
1932 * around again.
1933 */
1934 if (!SCHED(processor_queue_empty)(processor) || rt_runq.count > 0 || SCHED(fairshare_runq_count)() > 0)
1935 continue;
1936
1937 pset_lock(pset);
1938
1939 idle:
1940 /*
1941 * Nothing is runnable, so set this processor idle if it
1942 * was running.
1943 */
1944 if (processor->state == PROCESSOR_RUNNING) {
1945 remqueue((queue_entry_t)processor);
1946 processor->state = PROCESSOR_IDLE;
1947
1948 if (processor->processor_meta == PROCESSOR_META_NULL || processor->processor_meta->primary == processor) {
1949 enqueue_head(&pset->idle_queue, (queue_entry_t)processor);
1950 pset_pri_init_hint(pset, processor);
1951 pset_count_init_hint(pset, processor);
1952 }
1953 else {
1954 enqueue_head(&processor->processor_meta->idle_queue, (queue_entry_t)processor);
1955 pset_unlock(pset);
1956 return (processor->idle_thread);
1957 }
1958 }
1959
1960 pset_unlock(pset);
1961
1962 #if CONFIG_SCHED_IDLE_IN_PLACE
1963 /*
1964 * Choose idle thread if fast idle is not possible.
1965 */
1966 if ((thread->state & (TH_IDLE|TH_TERMINATE|TH_SUSP)) || !(thread->state & TH_WAIT) || thread->wake_active || thread->sched_pri >= BASEPRI_RTQUEUES)
1967 return (processor->idle_thread);
1968
1969 /*
1970 * Perform idling activities directly without a
1971 * context switch. Return the dispatched thread,
1972 * otherwise check again for a runnable thread.
1973 */
1974 new_thread = thread_select_idle(thread, processor);
1975
1976 #else /* !CONFIG_SCHED_IDLE_IN_PLACE */
1977
1978 /*
1979 * Do a full context switch to idle so that the current
1980 * thread can start running on another processor without
1981 * waiting for the fast-idled processor to wake up.
1982 */
1983 return (processor->idle_thread);
1984
1985 #endif /* !CONFIG_SCHED_IDLE_IN_PLACE */
1986
1987 } while (new_thread == THREAD_NULL);
1988
1989 return (new_thread);
1990 }
1991
1992 #if CONFIG_SCHED_IDLE_IN_PLACE
1993 /*
1994 * thread_select_idle:
1995 *
1996 * Idle the processor using the current thread context.
1997 *
1998 * Called with thread locked, then dropped and relocked.
1999 */
2000 static thread_t
2001 thread_select_idle(
2002 thread_t thread,
2003 processor_t processor)
2004 {
2005 thread_t new_thread;
2006 uint64_t arg1, arg2;
2007 int urgency;
2008
2009 if (thread->sched_mode == TH_MODE_TIMESHARE) {
2010 if (thread->max_priority <= MAXPRI_THROTTLE)
2011 sched_background_decr();
2012
2013 sched_share_decr();
2014 }
2015 sched_run_decr();
2016
2017 thread->state |= TH_IDLE;
2018 processor->current_pri = IDLEPRI;
2019 processor->current_thmode = TH_MODE_NONE;
2020
2021 /* Reload precise timing global policy to thread-local policy */
2022 thread->precise_user_kernel_time = use_precise_user_kernel_time(thread);
2023
2024 thread_unlock(thread);
2025
2026 /*
2027 * Switch execution timing to processor idle thread.
2028 */
2029 processor->last_dispatch = mach_absolute_time();
2030 thread->last_run_time = processor->last_dispatch;
2031 thread_timer_event(processor->last_dispatch, &processor->idle_thread->system_timer);
2032 PROCESSOR_DATA(processor, kernel_timer) = &processor->idle_thread->system_timer;
2033
2034 /*
2035 * Cancel the quantum timer while idling.
2036 */
2037 timer_call_cancel(&processor->quantum_timer);
2038 processor->timeslice = 0;
2039
2040 (*thread->sched_call)(SCHED_CALL_BLOCK, thread);
2041
2042 thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, NULL);
2043
2044 /*
2045 * Enable interrupts and perform idling activities. No
2046 * preemption due to TH_IDLE being set.
2047 */
2048 spllo(); new_thread = processor_idle(thread, processor);
2049
2050 /*
2051 * Return at splsched.
2052 */
2053 (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
2054
2055 thread_lock(thread);
2056
2057 /*
2058 * If awakened, switch to thread timer and start a new quantum.
2059 * Otherwise skip; we will context switch to another thread or return here.
2060 */
2061 if (!(thread->state & TH_WAIT)) {
2062 processor->last_dispatch = mach_absolute_time();
2063 thread_timer_event(processor->last_dispatch, &thread->system_timer);
2064 PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
2065
2066 thread_quantum_init(thread);
2067 thread->last_quantum_refill_time = processor->last_dispatch;
2068
2069 processor->quantum_end = processor->last_dispatch + thread->current_quantum;
2070 timer_call_enter1(&processor->quantum_timer, thread, processor->quantum_end, TIMER_CALL_SYS_CRITICAL);
2071 processor->timeslice = 1;
2072
2073 thread->computation_epoch = processor->last_dispatch;
2074 }
2075
2076 thread->state &= ~TH_IDLE;
2077
2078 /*
2079 * If we idled in place, simulate a context switch back
2080 * to the original priority of the thread so that the
2081 * platform layer cannot distinguish this from a true
2082 * switch to the idle thread.
2083 */
2084
2085 urgency = thread_get_urgency(thread, &arg1, &arg2);
2086
2087 thread_tell_urgency(urgency, arg1, arg2, new_thread);
2088
2089 sched_run_incr();
2090 if (thread->sched_mode == TH_MODE_TIMESHARE) {
2091 sched_share_incr();
2092
2093 if (thread->max_priority <= MAXPRI_THROTTLE)
2094 sched_background_incr();
2095 }
2096
2097 return (new_thread);
2098 }
2099 #endif /* CONFIG_SCHED_IDLE_IN_PLACE */
2100
2101 #if defined(CONFIG_SCHED_TRADITIONAL)
2102 static thread_t
2103 sched_traditional_choose_thread(
2104 processor_t processor,
2105 int priority)
2106 {
2107 thread_t thread;
2108
2109 thread = choose_thread(processor, runq_for_processor(processor), priority);
2110 if (thread != THREAD_NULL) {
2111 runq_consider_decr_bound_count(processor, thread);
2112 }
2113
2114 return thread;
2115 }
2116
2117 #endif /* defined(CONFIG_SCHED_TRADITIONAL) */
2118
2119 #if defined(CONFIG_SCHED_TRADITIONAL) || defined(CONFIG_SCHED_FIXEDPRIORITY)
2120
2121 /*
2122 * choose_thread:
2123 *
2124 * Locate a thread to execute from the processor run queue
2125 * and return it. Only choose a thread with priority greater
2126 * than or equal to the given priority.
2127 *
2128 * Associated pset must be locked. Returns THREAD_NULL
2129 * on failure.
2130 */
2131 thread_t
2132 choose_thread(
2133 processor_t processor,
2134 run_queue_t rq,
2135 int priority)
2136 {
2137 queue_t queue = rq->queues + rq->highq;
2138 int pri = rq->highq, count = rq->count;
2139 thread_t thread;
2140
2141 while (count > 0 && pri >= priority) {
2142 thread = (thread_t)queue_first(queue);
2143 while (!queue_end(queue, (queue_entry_t)thread)) {
2144 if (thread->bound_processor == PROCESSOR_NULL ||
2145 thread->bound_processor == processor) {
2146 remqueue((queue_entry_t)thread);
2147
2148 thread->runq = PROCESSOR_NULL;
2149 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
2150 rq->count--;
2151 if (SCHED(priority_is_urgent)(pri)) {
2152 rq->urgency--; assert(rq->urgency >= 0);
2153 }
2154 if (queue_empty(queue)) {
2155 if (pri != IDLEPRI)
2156 clrbit(MAXPRI - pri, rq->bitmap);
2157 rq->highq = MAXPRI - ffsbit(rq->bitmap);
2158 }
2159
2160 return (thread);
2161 }
2162 count--;
2163
2164 thread = (thread_t)queue_next((queue_entry_t)thread);
2165 }
2166
2167 queue--; pri--;
2168 }
2169
2170 return (THREAD_NULL);
2171 }
2172
2173 #endif /* defined(CONFIG_SCHED_TRADITIONAL) || defined(CONFIG_SCHED_FIXEDPRIORITY) */
2174
2175 /*
2176 * Perform a context switch and start executing the new thread.
2177 *
2178 * Returns FALSE on failure, and the thread is re-dispatched.
2179 *
2180 * Called at splsched.
2181 */
2182
2183 #define funnel_release_check(thread, debug) \
2184 MACRO_BEGIN \
2185 if ((thread)->funnel_state & TH_FN_OWNED) { \
2186 (thread)->funnel_state = TH_FN_REFUNNEL; \
2187 KERNEL_DEBUG(0x603242c | DBG_FUNC_NONE, \
2188 (thread)->funnel_lock, (debug), 0, 0, 0); \
2189 funnel_unlock((thread)->funnel_lock); \
2190 } \
2191 MACRO_END
2192
2193 #define funnel_refunnel_check(thread, debug) \
2194 MACRO_BEGIN \
2195 if ((thread)->funnel_state & TH_FN_REFUNNEL) { \
2196 kern_return_t result = (thread)->wait_result; \
2197 \
2198 (thread)->funnel_state = 0; \
2199 KERNEL_DEBUG(0x6032428 | DBG_FUNC_NONE, \
2200 (thread)->funnel_lock, (debug), 0, 0, 0); \
2201 funnel_lock((thread)->funnel_lock); \
2202 KERNEL_DEBUG(0x6032430 | DBG_FUNC_NONE, \
2203 (thread)->funnel_lock, (debug), 0, 0, 0); \
2204 (thread)->funnel_state = TH_FN_OWNED; \
2205 (thread)->wait_result = result; \
2206 } \
2207 MACRO_END
2208
2209 /*
2210 * thread_invoke
2211 *
2212 * "self" is what is currently running on the processor,
2213 * "thread" is the new thread to context switch to
2214 * (which may be the same thread in some cases)
2215 */
2216 static boolean_t
2217 thread_invoke(
2218 thread_t self,
2219 thread_t thread,
2220 ast_t reason)
2221 {
2222 thread_continue_t continuation = self->continuation;
2223 void *parameter = self->parameter;
2224 processor_t processor;
2225 uint64_t ctime = mach_absolute_time();
2226
2227 if (__improbable(get_preemption_level() != 0)) {
2228 int pl = get_preemption_level();
2229 panic("thread_invoke: preemption_level %d, possible cause: %s",
2230 pl, (pl < 0 ? "unlocking an unlocked mutex or spinlock" :
2231 "blocking while holding a spinlock, or within interrupt context"));
2232 }
2233
2234 assert(self == current_thread());
2235
2236 #if defined(CONFIG_SCHED_TRADITIONAL)
2237 sched_traditional_consider_maintenance(ctime);
2238 #endif /* CONFIG_SCHED_TRADITIONAL */
2239
2240 /*
2241 * Mark thread interruptible.
2242 */
2243 thread_lock(thread);
2244 thread->state &= ~TH_UNINT;
2245
2246 #if DEBUG
2247 assert(thread_runnable(thread));
2248 #endif
2249
2250 /* Reload precise timing global policy to thread-local policy */
2251 thread->precise_user_kernel_time = use_precise_user_kernel_time(thread);
2252
2253 /*
2254 * Allow time constraint threads to hang onto
2255 * a stack.
2256 */
2257 if ((self->sched_mode == TH_MODE_REALTIME) && !self->reserved_stack)
2258 self->reserved_stack = self->kernel_stack;
2259
2260 if (continuation != NULL) {
2261 if (!thread->kernel_stack) {
2262 /*
2263 * If we are using a privileged stack,
2264 * check to see whether we can exchange it with
2265 * that of the other thread.
2266 */
2267 if (self->kernel_stack == self->reserved_stack && !thread->reserved_stack)
2268 goto need_stack;
2269
2270 /*
2271 * Context switch by performing a stack handoff.
2272 */
2273 continuation = thread->continuation;
2274 parameter = thread->parameter;
2275
2276 processor = current_processor();
2277 processor->active_thread = thread;
2278 processor->current_pri = thread->sched_pri;
2279 processor->current_thmode = thread->sched_mode;
2280 if (thread->last_processor != processor && thread->last_processor != NULL) {
2281 if (thread->last_processor->processor_set != processor->processor_set)
2282 thread->ps_switch++;
2283 thread->p_switch++;
2284 }
2285 thread->last_processor = processor;
2286 thread->c_switch++;
2287 ast_context(thread);
2288 thread_unlock(thread);
2289
2290 self->reason = reason;
2291
2292 processor->last_dispatch = ctime;
2293 self->last_run_time = ctime;
2294 thread_timer_event(ctime, &thread->system_timer);
2295 PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
2296
2297 /*
2298 * Since non-precise user/kernel time doesn't update the state timer
2299 * during privilege transitions, synthesize an event now.
2300 */
2301 if (!thread->precise_user_kernel_time) {
2302 timer_switch(PROCESSOR_DATA(processor, current_state),
2303 ctime,
2304 PROCESSOR_DATA(processor, current_state));
2305 }
2306
2307 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2308 MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_HANDOFF)|DBG_FUNC_NONE,
2309 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
2310
2311 if ((thread->chosen_processor != processor) && (thread->chosen_processor != PROCESSOR_NULL)) {
2312 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED)|DBG_FUNC_NONE,
2313 (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
2314 }
2315
2316 DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, thread->task->bsd_info);
2317
2318 SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
2319
2320 TLOG(1, "thread_invoke: calling stack_handoff\n");
2321 stack_handoff(self, thread);
2322
2323 DTRACE_SCHED(on__cpu);
2324
2325 thread_dispatch(self, thread);
2326
2327 thread->continuation = thread->parameter = NULL;
2328
2329 counter(c_thread_invoke_hits++);
2330
2331 funnel_refunnel_check(thread, 2);
2332 (void) spllo();
2333
2334 assert(continuation);
2335 call_continuation(continuation, parameter, thread->wait_result);
2336 /*NOTREACHED*/
2337 }
2338 else if (thread == self) {
2339 /* same thread but with continuation */
2340 ast_context(self);
2341 counter(++c_thread_invoke_same);
2342 thread_unlock(self);
2343
2344 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2345 MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE,
2346 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
2347
2348 self->continuation = self->parameter = NULL;
2349
2350 funnel_refunnel_check(self, 3);
2351 (void) spllo();
2352
2353 call_continuation(continuation, parameter, self->wait_result);
2354 /*NOTREACHED*/
2355 }
2356 }
2357 else {
2358 /*
2359 * Check that the other thread has a stack
2360 */
2361 if (!thread->kernel_stack) {
2362 need_stack:
2363 if (!stack_alloc_try(thread)) {
2364 counter(c_thread_invoke_misses++);
2365 thread_unlock(thread);
2366 thread_stack_enqueue(thread);
2367 return (FALSE);
2368 }
2369 }
2370 else if (thread == self) {
2371 ast_context(self);
2372 counter(++c_thread_invoke_same);
2373 thread_unlock(self);
2374
2375 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2376 MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE,
2377 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
2378
2379 return (TRUE);
2380 }
2381 }
2382
2383 /*
2384 * Context switch by full context save.
2385 */
2386 processor = current_processor();
2387 processor->active_thread = thread;
2388 processor->current_pri = thread->sched_pri;
2389 processor->current_thmode = thread->sched_mode;
2390 if (thread->last_processor != processor && thread->last_processor != NULL) {
2391 if (thread->last_processor->processor_set != processor->processor_set)
2392 thread->ps_switch++;
2393 thread->p_switch++;
2394 }
2395 thread->last_processor = processor;
2396 thread->c_switch++;
2397 ast_context(thread);
2398 thread_unlock(thread);
2399
2400 counter(c_thread_invoke_csw++);
2401
2402 assert(self->runq == PROCESSOR_NULL);
2403 self->reason = reason;
2404
2405 processor->last_dispatch = ctime;
2406 self->last_run_time = ctime;
2407 thread_timer_event(ctime, &thread->system_timer);
2408 PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
2409
2410 /*
2411 * Since non-precise user/kernel time doesn't update the state timer
2412 * during privilege transitions, synthesize an event now.
2413 */
2414 if (!thread->precise_user_kernel_time) {
2415 timer_switch(PROCESSOR_DATA(processor, current_state),
2416 ctime,
2417 PROCESSOR_DATA(processor, current_state));
2418 }
2419
2420
2421 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2422 MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE,
2423 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
2424
2425 if ((thread->chosen_processor != processor) && (thread->chosen_processor != PROCESSOR_NULL)) {
2426 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED)|DBG_FUNC_NONE,
2427 (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
2428 }
2429
2430 DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, thread->task->bsd_info);
2431
2432 SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
2433
2434 /*
2435 * This is where we actually switch register context,
2436 * and address space if required. We will next run
2437 * as a result of a subsequent context switch.
2438 */
2439 assert(continuation == self->continuation);
2440 thread = machine_switch_context(self, continuation, thread);
2441 assert(self == current_thread());
2442 TLOG(1,"thread_invoke: returning machine_switch_context: self %p continuation %p thread %p\n", self, continuation, thread);
2443
2444 DTRACE_SCHED(on__cpu);
2445
2446 /*
2447 * We have been resumed and are set to run.
2448 */
2449 thread_dispatch(thread, self);
2450
2451 if (continuation) {
2452 self->continuation = self->parameter = NULL;
2453
2454 funnel_refunnel_check(self, 3);
2455 (void) spllo();
2456
2457 call_continuation(continuation, parameter, self->wait_result);
2458 /*NOTREACHED*/
2459 }
2460
2461 return (TRUE);
2462 }
2463
2464 /*
2465 * thread_dispatch:
2466 *
2467 * Handle threads at context switch. Re-dispatch other thread
2468 * if still running, otherwise update run state and perform
2469 * special actions. Update quantum for other thread and begin
2470 * the quantum for ourselves.
2471 *
2472 * "self" is our new current thread that we have context switched
2473 * to, "thread" is the old thread that we have switched away from.
2474 *
2475 * Called at splsched.
2476 */
2477 void
2478 thread_dispatch(
2479 thread_t thread,
2480 thread_t self)
2481 {
2482 processor_t processor = self->last_processor;
2483
2484 if (thread != THREAD_NULL) {
2485 /*
2486 * If blocked at a continuation, discard
2487 * the stack.
2488 */
2489 if (thread->continuation != NULL && thread->kernel_stack != 0)
2490 stack_free(thread);
2491
2492 if (!(thread->state & TH_IDLE)) {
2493 int64_t consumed;
2494 int64_t remainder = 0;
2495
2496 if (processor->quantum_end > processor->last_dispatch)
2497 remainder = processor->quantum_end -
2498 processor->last_dispatch;
2499
2500 consumed = thread->current_quantum - remainder;
2501
2502 if ((thread->reason & AST_LEDGER) == 0) {
2503 /*
2504 * Bill CPU time to both the task and
2505 * the individual thread.
2506 */
2507 ledger_credit(thread->t_ledger,
2508 task_ledgers.cpu_time, consumed);
2509 ledger_credit(thread->t_threadledger,
2510 thread_ledgers.cpu_time, consumed);
2511 }
2512
2513 wake_lock(thread);
2514 thread_lock(thread);
2515
2516 /*
2517 * Compute remainder of current quantum.
2518 */
2519 if (first_timeslice(processor) &&
2520 processor->quantum_end > processor->last_dispatch)
2521 thread->current_quantum = (uint32_t)remainder;
2522 else
2523 thread->current_quantum = 0;
2524
2525 if (thread->sched_mode == TH_MODE_REALTIME) {
2526 /*
2527 * Cancel the deadline if the thread has
2528 * consumed the entire quantum.
2529 */
2530 if (thread->current_quantum == 0) {
2531 thread->realtime.deadline = UINT64_MAX;
2532 thread->reason |= AST_QUANTUM;
2533 }
2534 } else {
2535 #if defined(CONFIG_SCHED_TRADITIONAL)
2536 /*
2537 * For non-realtime threads, treat a tiny
2538 * remaining quantum as an expired quantum,
2539 * but carry what's left over into the next one.
2540 */
2541 if (thread->current_quantum < min_std_quantum) {
2542 thread->reason |= AST_QUANTUM;
2543 thread->current_quantum += SCHED(initial_quantum_size)(thread);
2544 }
2545 #endif
2546 }
2547
2548 /*
2549 * If we are doing a direct handoff then
2550 * take the remainder of the quantum.
2551 */
2552 if ((thread->reason & (AST_HANDOFF|AST_QUANTUM)) == AST_HANDOFF) {
2553 self->current_quantum = thread->current_quantum;
2554 thread->reason |= AST_QUANTUM;
2555 thread->current_quantum = 0;
2556 }
2557
2558 thread->computation_metered += (processor->last_dispatch - thread->computation_epoch);
2559
2560 if ((thread->rwlock_count != 0) && !(LcksOpts & disLkRWPrio)) {
2561 integer_t priority;
2562
2563 priority = thread->sched_pri;
2564
2565 if (priority < thread->priority)
2566 priority = thread->priority;
2567 if (priority < BASEPRI_BACKGROUND)
2568 priority = BASEPRI_BACKGROUND;
2569
2570 if ((thread->sched_pri < priority) || !(thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
2571 KERNEL_DEBUG_CONSTANT(
2572 MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_PROMOTE) | DBG_FUNC_NONE,
2573 (uintptr_t)thread_tid(thread), thread->sched_pri, thread->priority, priority, 0);
2574
2575 thread->sched_flags |= TH_SFLAG_RW_PROMOTED;
2576
2577 if (thread->sched_pri < priority)
2578 set_sched_pri(thread, priority);
2579 }
2580 }
2581
2582 if (!(thread->state & TH_WAIT)) {
2583 /*
2584 * Still running.
2585 */
2586 if (thread->reason & AST_QUANTUM)
2587 thread_setrun(thread, SCHED_TAILQ);
2588 else
2589 if (thread->reason & AST_PREEMPT)
2590 thread_setrun(thread, SCHED_HEADQ);
2591 else
2592 thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
2593
2594 thread->reason = AST_NONE;
2595
2596 if (thread->wake_active) {
2597 thread->wake_active = FALSE;
2598 thread_unlock(thread);
2599
2600 thread_wakeup(&thread->wake_active);
2601 }
2602 else
2603 thread_unlock(thread);
2604
2605 wake_unlock(thread);
2606 }
2607 else {
2608 /*
2609 * Waiting.
2610 */
2611 boolean_t should_terminate = FALSE;
2612
2613 /* Only the first call to thread_dispatch
2614 * after explicit termination should add
2615 * the thread to the termination queue
2616 */
2617 if ((thread->state & (TH_TERMINATE|TH_TERMINATE2)) == TH_TERMINATE) {
2618 should_terminate = TRUE;
2619 thread->state |= TH_TERMINATE2;
2620 }
2621
2622 thread->state &= ~TH_RUN;
2623 thread->chosen_processor = PROCESSOR_NULL;
2624
2625 if (thread->sched_mode == TH_MODE_TIMESHARE) {
2626 if (thread->max_priority <= MAXPRI_THROTTLE)
2627 sched_background_decr();
2628
2629 sched_share_decr();
2630 }
2631 sched_run_decr();
2632
2633 (*thread->sched_call)(SCHED_CALL_BLOCK, thread);
2634
2635 if (thread->wake_active) {
2636 thread->wake_active = FALSE;
2637 thread_unlock(thread);
2638
2639 thread_wakeup(&thread->wake_active);
2640 }
2641 else
2642 thread_unlock(thread);
2643
2644 wake_unlock(thread);
2645
2646 if (should_terminate)
2647 thread_terminate_enqueue(thread);
2648 }
2649 }
2650 }
2651
2652 if (!(self->state & TH_IDLE)) {
2653 uint64_t arg1, arg2;
2654 int urgency;
2655
2656 urgency = thread_get_urgency(self, &arg1, &arg2);
2657
2658 thread_tell_urgency(urgency, arg1, arg2, self);
2659
2660 /*
2661 * Get a new quantum if none remaining.
2662 */
2663 if (self->current_quantum == 0) {
2664 thread_quantum_init(self);
2665 self->last_quantum_refill_time = processor->last_dispatch;
2666 }
2667
2668 /*
2669 * Set up quantum timer and timeslice.
2670 */
2671 processor->quantum_end = (processor->last_dispatch + self->current_quantum);
2672 timer_call_enter1(&processor->quantum_timer, self, processor->quantum_end, TIMER_CALL_SYS_CRITICAL);
2673
2674 processor->timeslice = 1;
2675
2676 self->computation_epoch = processor->last_dispatch;
2677 }
2678 else {
2679 timer_call_cancel(&processor->quantum_timer);
2680 processor->timeslice = 0;
2681
2682 thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, NULL);
2683 }
2684 }
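/*
 * Worked example (illustrative): suppose a thread was dispatched with a
 * quantum worth 10ms and blocks with 4ms left before quantum_end.  Then
 * remainder == 4ms, consumed == 6ms is credited to the task and thread
 * cpu_time ledgers (unless AST_LEDGER is set), and the thread keeps
 * current_quantum == 4ms for its next dispatch while the processor is
 * still on its first timeslice.  If the remainder were below
 * min_std_quantum, a non-realtime thread would instead have its quantum
 * marked expired (AST_QUANTUM) and topped up with a fresh quantum for
 * next time.
 */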
2685
2686 #include <libkern/OSDebug.h>
2687
2688 uint32_t kdebug_thread_block = 0;
2689
2690
2691 /*
2692 * thread_block_reason:
2693 *
2694 * Forces a reschedule, blocking the caller if a wait
2695 * has been asserted.
2696 *
2697 * If a continuation is specified, then thread_invoke will
2698 * attempt to discard the thread's kernel stack. When the
2699 * thread resumes, it will execute the continuation function
2700 * on a new kernel stack.
2701 */
2702 counter(mach_counter_t c_thread_block_calls = 0;)
2703
2704 wait_result_t
2705 thread_block_reason(
2706 thread_continue_t continuation,
2707 void *parameter,
2708 ast_t reason)
2709 {
2710 register thread_t self = current_thread();
2711 register processor_t processor;
2712 register thread_t new_thread;
2713 spl_t s;
2714
2715 counter(++c_thread_block_calls);
2716
2717 s = splsched();
2718
2719 if (!(reason & AST_PREEMPT))
2720 funnel_release_check(self, 2);
2721
2722 processor = current_processor();
2723
2724 /* If we're explicitly yielding, force a subsequent quantum */
2725 if (reason & AST_YIELD)
2726 processor->timeslice = 0;
2727
2728 /* We're handling all scheduling ASTs */
2729 ast_off(AST_SCHEDULING);
2730
2731 self->continuation = continuation;
2732 self->parameter = parameter;
2733
2734 if (__improbable(kdebug_thread_block && kdebug_enable && self->state != TH_RUN)) {
2735 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2736 MACHDBG_CODE(DBG_MACH_SCHED,MACH_BLOCK),
2737 reason, VM_KERNEL_UNSLIDE(continuation), 0, 0, 0);
2738 }
2739
2740 do {
2741 thread_lock(self);
2742 new_thread = thread_select(self, processor);
2743 thread_unlock(self);
2744 } while (!thread_invoke(self, new_thread, reason));
2745
2746 funnel_refunnel_check(self, 5);
2747 splx(s);
2748
2749 return (self->wait_result);
2750 }
2751
2752 /*
2753 * thread_block:
2754 *
2755 * Block the current thread if a wait has been asserted.
2756 */
2757 wait_result_t
2758 thread_block(
2759 thread_continue_t continuation)
2760 {
2761 return thread_block_reason(continuation, NULL, AST_NONE);
2762 }
2763
2764 wait_result_t
2765 thread_block_parameter(
2766 thread_continue_t continuation,
2767 void *parameter)
2768 {
2769 return thread_block_reason(continuation, parameter, AST_NONE);
2770 }
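/*
 * Illustrative sketch of the usual caller-side wait pattern built on
 * thread_block(); not compiled, and the flag example_done is a
 * hypothetical wait condition used only for illustration.  The wait is
 * asserted before the condition is tested so a wakeup cannot be lost.
 */
#if 0
static boolean_t example_done;		/* hypothetical condition set by the waker */

static wait_result_t
example_wait(void)
{
	wait_result_t wres = THREAD_AWAKENED;

	assert_wait((event_t)&example_done, THREAD_UNINT);

	if (!example_done)
		wres = thread_block(THREAD_CONTINUE_NULL);	/* sleeps until thread_wakeup(&example_done) */
	else
		clear_wait(current_thread(), THREAD_AWAKENED);	/* condition already true, cancel the wait */

	return (wres);
}
#endif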
2771
2772 /*
2773 * thread_run:
2774 *
2775 * Switch directly from the current thread to the
2776 * new thread, handing off our quantum if appropriate.
2777 *
2778 * New thread must be runnable, and not on a run queue.
2779 *
2780 * Called at splsched.
2781 */
2782 int
2783 thread_run(
2784 thread_t self,
2785 thread_continue_t continuation,
2786 void *parameter,
2787 thread_t new_thread)
2788 {
2789 ast_t handoff = AST_HANDOFF;
2790
2791 funnel_release_check(self, 3);
2792
2793 self->continuation = continuation;
2794 self->parameter = parameter;
2795
2796 while (!thread_invoke(self, new_thread, handoff)) {
2797 processor_t processor = current_processor();
2798
2799 thread_lock(self);
2800 new_thread = thread_select(self, processor);
2801 thread_unlock(self);
2802 handoff = AST_NONE;
2803 }
2804
2805 funnel_refunnel_check(self, 6);
2806
2807 return (self->wait_result);
2808 }
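/*
 * Example (illustrative): a waker that has just made exactly one thread
 * runnable, and is itself about to block, can hand off directly with
 * thread_run(self, continuation, NULL, woken_thread).  The AST_HANDOFF
 * reason lets the woken thread inherit the remainder of the waker's
 * quantum in thread_dispatch() above.
 */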
2809
2810 /*
2811 * thread_continue:
2812 *
2813 * Called at splsched when a thread first receives
2814 * a new stack after a continuation.
2815 */
2816 void
2817 thread_continue(
2818 register thread_t thread)
2819 {
2820 register thread_t self = current_thread();
2821 register thread_continue_t continuation;
2822 register void *parameter;
2823
2824 DTRACE_SCHED(on__cpu);
2825
2826 continuation = self->continuation;
2827 parameter = self->parameter;
2828
2829 thread_dispatch(thread, self);
2830
2831 self->continuation = self->parameter = NULL;
2832
2833 funnel_refunnel_check(self, 4);
2834
2835 if (thread != THREAD_NULL)
2836 (void)spllo();
2837
2838 TLOG(1, "thread_continue: calling call_continuation \n");
2839 call_continuation(continuation, parameter, self->wait_result);
2840 /*NOTREACHED*/
2841 }
2842
2843 void
2844 thread_quantum_init(thread_t thread)
2845 {
2846 if (thread->sched_mode == TH_MODE_REALTIME) {
2847 thread->current_quantum = thread->realtime.computation;
2848 } else {
2849 thread->current_quantum = SCHED(initial_quantum_size)(thread);
2850 }
2851 }
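/*
 * Example (illustrative): a realtime thread receives exactly its declared
 * computation time (thread->realtime.computation) as its quantum, while
 * under the traditional scheduler below a timeshare thread receives
 * std_quantum unless its priority is at or below MAXPRI_THROTTLE, in
 * which case it receives the background quantum bg_quantum.
 */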
2852
2853 #if defined(CONFIG_SCHED_TRADITIONAL)
2854 static uint32_t
2855 sched_traditional_initial_quantum_size(thread_t thread)
2856 {
2857 if ((thread == THREAD_NULL) || thread->priority > MAXPRI_THROTTLE)
2858 return std_quantum;
2859 else
2860 return bg_quantum;
2861 }
2862
2863 static sched_mode_t
2864 sched_traditional_initial_thread_sched_mode(task_t parent_task)
2865 {
2866 if (parent_task == kernel_task)
2867 return TH_MODE_FIXED;
2868 else
2869 return TH_MODE_TIMESHARE;
2870 }
2871
2872 static boolean_t
2873 sched_traditional_supports_timeshare_mode(void)
2874 {
2875 return TRUE;
2876 }
2877
2878 #endif /* CONFIG_SCHED_TRADITIONAL */
2879
2880 /*
2881 * run_queue_init:
2882 *
2883 * Initialize a run queue before first use.
2884 */
2885 void
2886 run_queue_init(
2887 run_queue_t rq)
2888 {
2889 int i;
2890
2891 rq->highq = IDLEPRI;
2892 for (i = 0; i < NRQBM; i++)
2893 rq->bitmap[i] = 0;
2894 setbit(MAXPRI - IDLEPRI, rq->bitmap);
2895 rq->urgency = rq->count = 0;
2896 for (i = 0; i < NRQS; i++)
2897 queue_init(&rq->queues[i]);
2898 }
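/*
 * Illustrative sketch of the bitmap convention shared by the run-queue
 * routines below; not compiled, and the helper name is hypothetical.
 * Bit (MAXPRI - pri) is set whenever the queue at priority pri is
 * non-empty, so the lowest-numbered set bit found by ffsbit() always
 * corresponds to the highest occupied priority.
 */
#if 0
static int
example_runq_highest_pri(run_queue_t rq)
{
	/*
	 * e.g. with only priorities 80 and IDLEPRI occupied, bits
	 * (MAXPRI - 80) and (MAXPRI - IDLEPRI) are set, ffsbit()
	 * returns (MAXPRI - 80), and the result is 80.
	 */
	return (MAXPRI - ffsbit(rq->bitmap));
}
#endif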
2899
2900 #if defined(CONFIG_SCHED_TRADITIONAL) || defined(CONFIG_SCHED_PROTO) || defined(CONFIG_SCHED_GRRR) || defined(CONFIG_SCHED_FIXEDPRIORITY)
2901 int
2902 sched_traditional_fairshare_runq_count(void)
2903 {
2904 return fs_runq.count;
2905 }
2906
2907 uint64_t
2908 sched_traditional_fairshare_runq_stats_count_sum(void)
2909 {
2910 return fs_runq.runq_stats.count_sum;
2911 }
2912
2913 void
2914 sched_traditional_fairshare_enqueue(thread_t thread)
2915 {
2916 queue_t queue = &fs_runq.queue;
2917
2918 simple_lock(&fs_lock);
2919
2920 enqueue_tail(queue, (queue_entry_t)thread);
2921
2922 thread->runq = FS_RUNQ;
2923 SCHED_STATS_RUNQ_CHANGE(&fs_runq.runq_stats, fs_runq.count);
2924 fs_runq.count++;
2925
2926 simple_unlock(&fs_lock);
2927 }
2928
2929 thread_t
2930 sched_traditional_fairshare_dequeue(void)
2931 {
2932 thread_t thread;
2933
2934 simple_lock(&fs_lock);
2935 if (fs_runq.count > 0) {
2936 thread = (thread_t)dequeue_head(&fs_runq.queue);
2937
2938 thread->runq = PROCESSOR_NULL;
2939 SCHED_STATS_RUNQ_CHANGE(&fs_runq.runq_stats, fs_runq.count);
2940 fs_runq.count--;
2941
2942 simple_unlock(&fs_lock);
2943
2944 return (thread);
2945 }
2946 simple_unlock(&fs_lock);
2947
2948 return THREAD_NULL;
2949 }
2950
2951 boolean_t
2952 sched_traditional_fairshare_queue_remove(thread_t thread)
2953 {
2954 queue_t q;
2955
2956 simple_lock(&fs_lock);
2957 q = &fs_runq.queue;
2958
2959 if (FS_RUNQ == thread->runq) {
2960 remqueue((queue_entry_t)thread);
2961 SCHED_STATS_RUNQ_CHANGE(&fs_runq.runq_stats, fs_runq.count);
2962 fs_runq.count--;
2963
2964 thread->runq = PROCESSOR_NULL;
2965 simple_unlock(&fs_lock);
2966 return (TRUE);
2967 }
2968 else {
2969 /*
2970 * The thread left the run queue before we could
2971 * lock the run queue.
2972 */
2973 assert(thread->runq == PROCESSOR_NULL);
2974 simple_unlock(&fs_lock);
2975 return (FALSE);
2976 }
2977 }
2978
2979 #endif /* defined(CONFIG_SCHED_TRADITIONAL) || defined(CONFIG_SCHED_PROTO) || defined(CONFIG_SCHED_GRRR) || defined(CONFIG_SCHED_FIXEDPRIORITY) */
2980
2981 /*
2982 * run_queue_dequeue:
2983 *
2984 * Perform a dequeue operation on a run queue,
2985 * and return the resulting thread.
2986 *
2987 * The run queue must be locked (see thread_run_queue_remove()
2988 * for more info), and not empty.
2989 */
2990 thread_t
2991 run_queue_dequeue(
2992 run_queue_t rq,
2993 integer_t options)
2994 {
2995 thread_t thread;
2996 queue_t queue = rq->queues + rq->highq;
2997
2998 if (options & SCHED_HEADQ) {
2999 thread = (thread_t)dequeue_head(queue);
3000 }
3001 else {
3002 thread = (thread_t)dequeue_tail(queue);
3003 }
3004
3005 thread->runq = PROCESSOR_NULL;
3006 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
3007 rq->count--;
3008 if (SCHED(priority_is_urgent)(rq->highq)) {
3009 rq->urgency--; assert(rq->urgency >= 0);
3010 }
3011 if (queue_empty(queue)) {
3012 if (rq->highq != IDLEPRI)
3013 clrbit(MAXPRI - rq->highq, rq->bitmap);
3014 rq->highq = MAXPRI - ffsbit(rq->bitmap);
3015 }
3016
3017 return (thread);
3018 }
3019
3020 /*
3021 * run_queue_enqueue:
3022 *
3023 * Perform an enqueue operation on a run queue.
3024 *
3025 * The run queue must be locked (see thread_run_queue_remove()
3026 * for more info).
3027 */
3028 boolean_t
3029 run_queue_enqueue(
3030 run_queue_t rq,
3031 thread_t thread,
3032 integer_t options)
3033 {
3034 queue_t queue = rq->queues + thread->sched_pri;
3035 boolean_t result = FALSE;
3036
3037 if (queue_empty(queue)) {
3038 enqueue_tail(queue, (queue_entry_t)thread);
3039
3040 setbit(MAXPRI - thread->sched_pri, rq->bitmap);
3041 if (thread->sched_pri > rq->highq) {
3042 rq->highq = thread->sched_pri;
3043 result = TRUE;
3044 }
3045 }
3046 else
3047 if (options & SCHED_TAILQ)
3048 enqueue_tail(queue, (queue_entry_t)thread);
3049 else
3050 enqueue_head(queue, (queue_entry_t)thread);
3051
3052 if (SCHED(priority_is_urgent)(thread->sched_pri))
3053 rq->urgency++;
3054 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
3055 rq->count++;
3056
3057 return (result);
3058
3059 }
3060
3061 /*
3062 * run_queue_remove:
3063 *
3064 * Remove a specific thread from a runqueue.
3065 *
3066 * The run queue must be locked.
3067 */
3068 void
3069 run_queue_remove(
3070 run_queue_t rq,
3071 thread_t thread)
3072 {
3073
3074 remqueue((queue_entry_t)thread);
3075 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
3076 rq->count--;
3077 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
3078 rq->urgency--; assert(rq->urgency >= 0);
3079 }
3080
3081 if (queue_empty(rq->queues + thread->sched_pri)) {
3082 /* update run queue status */
3083 if (thread->sched_pri != IDLEPRI)
3084 clrbit(MAXPRI - thread->sched_pri, rq->bitmap);
3085 rq->highq = MAXPRI - ffsbit(rq->bitmap);
3086 }
3087
3088 thread->runq = PROCESSOR_NULL;
3089 }
3090
3091 /*
3092 * fairshare_setrun:
3093 *
3094 * Dispatch a thread for round-robin execution.
3095 *
3096 * Thread must be locked. Associated pset must
3097 * be locked, and is returned unlocked.
3098 */
3099 static void
3100 fairshare_setrun(
3101 processor_t processor,
3102 thread_t thread)
3103 {
3104 processor_set_t pset = processor->processor_set;
3105
3106 thread->chosen_processor = processor;
3107
3108 SCHED(fairshare_enqueue)(thread);
3109
3110 if (processor != current_processor())
3111 machine_signal_idle(processor);
3112
3113 pset_unlock(pset);
3114
3115 }
3116
3117 /*
3118 * realtime_queue_insert:
3119 *
3120 * Enqueue a thread for realtime execution.
3121 */
3122 static boolean_t
3123 realtime_queue_insert(
3124 thread_t thread)
3125 {
3126 queue_t queue = &rt_runq.queue;
3127 uint64_t deadline = thread->realtime.deadline;
3128 boolean_t preempt = FALSE;
3129
3130 simple_lock(&rt_lock);
3131
3132 if (queue_empty(queue)) {
3133 enqueue_tail(queue, (queue_entry_t)thread);
3134 preempt = TRUE;
3135 }
3136 else {
3137 register thread_t entry = (thread_t)queue_first(queue);
3138
3139 while (TRUE) {
3140 if ( queue_end(queue, (queue_entry_t)entry) ||
3141 deadline < entry->realtime.deadline ) {
3142 entry = (thread_t)queue_prev((queue_entry_t)entry);
3143 break;
3144 }
3145
3146 entry = (thread_t)queue_next((queue_entry_t)entry);
3147 }
3148
3149 if ((queue_entry_t)entry == queue)
3150 preempt = TRUE;
3151
3152 insque((queue_entry_t)thread, (queue_entry_t)entry);
3153 }
3154
3155 thread->runq = RT_RUNQ;
3156 SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count);
3157 rt_runq.count++;
3158
3159 simple_unlock(&rt_lock);
3160
3161 return (preempt);
3162 }
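/*
 * Example (illustrative): with realtime threads queued in deadline order
 * (say deadlines 10, 20, 30), a new thread with deadline 15 is inserted
 * between 10 and 20 and preempt stays FALSE, while a thread with
 * deadline 5 is inserted at the head of the queue and preempt is
 * reported as TRUE.
 */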
3163
3164 /*
3165 * realtime_setrun:
3166 *
3167 * Dispatch a thread for realtime execution.
3168 *
3169 * Thread must be locked. Associated pset must
3170 * be locked, and is returned unlocked.
3171 */
3172 static void
3173 realtime_setrun(
3174 processor_t processor,
3175 thread_t thread)
3176 {
3177 processor_set_t pset = processor->processor_set;
3178 ast_t preempt;
3179
3180 thread->chosen_processor = processor;
3181
3182 /*
3183 * Dispatch directly onto idle processor.
3184 */
3185 if ( (thread->bound_processor == processor)
3186 && processor->state == PROCESSOR_IDLE) {
3187 remqueue((queue_entry_t)processor);
3188 enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
3189
3190 processor->next_thread = thread;
3191 processor->current_pri = thread->sched_pri;
3192 processor->current_thmode = thread->sched_mode;
3193 processor->deadline = thread->realtime.deadline;
3194 processor->state = PROCESSOR_DISPATCHING;
3195
3196 if (processor != current_processor()) {
3197 if (!(pset->pending_AST_cpu_mask & (1U << processor->cpu_id))) {
3198 /* cleared on exit from main processor_idle() loop */
3199 pset->pending_AST_cpu_mask |= (1U << processor->cpu_id);
3200 machine_signal_idle(processor);
3201 }
3202 }
3203
3204 pset_unlock(pset);
3205 return;
3206 }
3207
3208 if (processor->current_pri < BASEPRI_RTQUEUES)
3209 preempt = (AST_PREEMPT | AST_URGENT);
3210 else if (thread->realtime.deadline < processor->deadline)
3211 preempt = (AST_PREEMPT | AST_URGENT);
3212 else
3213 preempt = AST_NONE;
3214
3215 realtime_queue_insert(thread);
3216
3217 if (preempt != AST_NONE) {
3218 if (processor->state == PROCESSOR_IDLE) {
3219 remqueue((queue_entry_t)processor);
3220 enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
3221 processor->next_thread = THREAD_NULL;
3222 processor->current_pri = thread->sched_pri;
3223 processor->current_thmode = thread->sched_mode;
3224 processor->deadline = thread->realtime.deadline;
3225 processor->state = PROCESSOR_DISPATCHING;
3226 if (processor == current_processor()) {
3227 ast_on(preempt);
3228 } else {
3229 if (!(pset->pending_AST_cpu_mask & (1U << processor->cpu_id))) {
3230 /* cleared on exit from main processor_idle() loop */
3231 pset->pending_AST_cpu_mask |= (1U << processor->cpu_id);
3232 machine_signal_idle(processor);
3233 }
3234 }
3235 } else if (processor->state == PROCESSOR_DISPATCHING) {
3236 if ((processor->next_thread == THREAD_NULL) && ((processor->current_pri < thread->sched_pri) || (processor->deadline > thread->realtime.deadline))) {
3237 processor->current_pri = thread->sched_pri;
3238 processor->current_thmode = thread->sched_mode;
3239 processor->deadline = thread->realtime.deadline;
3240 }
3241 } else {
3242 if (processor == current_processor()) {
3243 ast_on(preempt);
3244 } else {
3245 if (!(pset->pending_AST_cpu_mask & (1U << processor->cpu_id))) {
3246 /* cleared after IPI causes csw_check() to be called */
3247 pset->pending_AST_cpu_mask |= (1U << processor->cpu_id);
3248 cause_ast_check(processor);
3249 }
3250 }
3251 }
3252 } else {
3253 /* Selected processor was too busy, just keep thread enqueued and let other processors drain it naturally. */
3254 }
3255
3256 pset_unlock(pset);
3257 }
3258
3259 #if defined(CONFIG_SCHED_TRADITIONAL)
3260
3261 static boolean_t
3262 priority_is_urgent(int priority)
3263 {
3264 return testbit(priority, sched_preempt_pri) ? TRUE : FALSE;
3265 }
3266
3267 /*
3268 * processor_enqueue:
3269 *
3270 * Enqueue thread on a processor run queue. Thread must be locked,
3271 * and not already be on a run queue.
3272 *
3273 * Returns TRUE if a preemption is indicated based on the state
3274 * of the run queue.
3275 *
3276 * The run queue must be locked (see thread_run_queue_remove()
3277 * for more info).
3278 */
3279 static boolean_t
3280 processor_enqueue(
3281 processor_t processor,
3282 thread_t thread,
3283 integer_t options)
3284 {
3285 run_queue_t rq = runq_for_processor(processor);
3286 boolean_t result;
3287
3288 result = run_queue_enqueue(rq, thread, options);
3289 thread->runq = processor;
3290 runq_consider_incr_bound_count(processor, thread);
3291
3292 return (result);
3293 }
3294
3295 #endif /* CONFIG_SCHED_TRADITIONAL */
3296
3297 /*
3298 * processor_setrun:
3299 *
3300 * Dispatch a thread for execution on a
3301 * processor.
3302 *
3303 * Thread must be locked. Associated pset must
3304 * be locked, and is returned unlocked.
3305 */
3306 static void
3307 processor_setrun(
3308 processor_t processor,
3309 thread_t thread,
3310 integer_t options)
3311 {
3312 processor_set_t pset = processor->processor_set;
3313 ast_t preempt;
3314 enum { eExitIdle, eInterruptRunning, eDoNothing } ipi_action = eDoNothing;
3315
3316 thread->chosen_processor = processor;
3317
3318 /*
3319 * Dispatch directly onto idle processor.
3320 */
3321 if ( (SCHED(direct_dispatch_to_idle_processors) ||
3322 thread->bound_processor == processor)
3323 && processor->state == PROCESSOR_IDLE) {
3324 remqueue((queue_entry_t)processor);
3325 enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
3326
3327 processor->next_thread = thread;
3328 processor->current_pri = thread->sched_pri;
3329 processor->current_thmode = thread->sched_mode;
3330 processor->deadline = UINT64_MAX;
3331 processor->state = PROCESSOR_DISPATCHING;
3332
3333 if (!(pset->pending_AST_cpu_mask & (1U << processor->cpu_id))) {
3334 /* cleared on exit from main processor_idle() loop */
3335 pset->pending_AST_cpu_mask |= (1U << processor->cpu_id);
3336 machine_signal_idle(processor);
3337 }
3338
3339 pset_unlock(pset);
3340 return;
3341 }
3342
3343 /*
3344 * Set preemption mode.
3345 */
3346 if (SCHED(priority_is_urgent)(thread->sched_pri) && thread->sched_pri > processor->current_pri)
3347 preempt = (AST_PREEMPT | AST_URGENT);
3348 else if(processor->active_thread && thread_eager_preemption(processor->active_thread))
3349 preempt = (AST_PREEMPT | AST_URGENT);
3350 else if ((thread->sched_mode == TH_MODE_TIMESHARE) && (thread->sched_pri < thread->priority)) {
3351 if(SCHED(priority_is_urgent)(thread->priority) && thread->sched_pri > processor->current_pri) {
3352 preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
3353 } else {
3354 preempt = AST_NONE;
3355 }
3356 } else
3357 preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
3358
3359 SCHED(processor_enqueue)(processor, thread, options);
3360
3361 if (preempt != AST_NONE) {
3362 if (processor->state == PROCESSOR_IDLE) {
3363 remqueue((queue_entry_t)processor);
3364 enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
3365 processor->next_thread = THREAD_NULL;
3366 processor->current_pri = thread->sched_pri;
3367 processor->current_thmode = thread->sched_mode;
3368 processor->deadline = UINT64_MAX;
3369 processor->state = PROCESSOR_DISPATCHING;
3370
3371 ipi_action = eExitIdle;
3372 } else if ( processor->state == PROCESSOR_DISPATCHING) {
3373 if ((processor->next_thread == THREAD_NULL) && (processor->current_pri < thread->sched_pri)) {
3374 processor->current_pri = thread->sched_pri;
3375 processor->current_thmode = thread->sched_mode;
3376 processor->deadline = UINT64_MAX;
3377 }
3378 } else if ( (processor->state == PROCESSOR_RUNNING ||
3379 processor->state == PROCESSOR_SHUTDOWN) &&
3380 (thread->sched_pri >= processor->current_pri ||
3381 processor->current_thmode == TH_MODE_FAIRSHARE)) {
3382 ipi_action = eInterruptRunning;
3383 }
3384 } else {
3385 /*
3386 * The new thread is not important enough to preempt what is running, but
3387 * special processor states may need special handling.
3388 */
3389 if (processor->state == PROCESSOR_SHUTDOWN &&
3390 thread->sched_pri >= processor->current_pri ) {
3391 ipi_action = eInterruptRunning;
3392 } else if ( processor->state == PROCESSOR_IDLE &&
3393 processor != current_processor() ) {
3394 remqueue((queue_entry_t)processor);
3395 enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
3396 processor->next_thread = THREAD_NULL;
3397 processor->current_pri = thread->sched_pri;
3398 processor->current_thmode = thread->sched_mode;
3399 processor->deadline = UINT64_MAX;
3400 processor->state = PROCESSOR_DISPATCHING;
3401
3402 ipi_action = eExitIdle;
3403 }
3404 }
3405
3406 switch (ipi_action) {
3407 case eDoNothing:
3408 break;
3409 case eExitIdle:
3410 if (processor == current_processor()) {
3411 if (csw_check_locked(processor, pset) != AST_NONE)
3412 ast_on(preempt);
3413 } else {
3414 if (!(pset->pending_AST_cpu_mask & (1U << processor->cpu_id))) {
3415 /* cleared on exit from main processor_idle() loop */
3416 pset->pending_AST_cpu_mask |= (1U << processor->cpu_id);
3417 machine_signal_idle(processor);
3418 }
3419 }
3420 break;
3421 case eInterruptRunning:
3422 if (processor == current_processor()) {
3423 if (csw_check_locked(processor, pset) != AST_NONE)
3424 ast_on(preempt);
3425 } else {
3426 if (!(pset->pending_AST_cpu_mask & (1U << processor->cpu_id))) {
3427 /* cleared after IPI causes csw_check() to be called */
3428 pset->pending_AST_cpu_mask |= (1U << processor->cpu_id);
3429 cause_ast_check(processor);
3430 }
3431 }
3432 break;
3433 }
3434
3435 pset_unlock(pset);
3436 }
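/*
 * Illustrative sketch of the pending-AST bookkeeping used above; not
 * compiled, and the helper name is hypothetical.  Each processor owns
 * one bit of pset->pending_AST_cpu_mask, so the idle wakeup or cross-CPU
 * AST is signalled at most once until the target clears its bit.
 */
#if 0
static boolean_t
example_mark_ast_pending(processor_set_t pset, processor_t processor)
{
	uint32_t bit = (1U << processor->cpu_id);

	/* Caller is expected to hold the pset lock. */
	if (pset->pending_AST_cpu_mask & bit)
		return (FALSE);			/* already signalled, nothing to do */

	pset->pending_AST_cpu_mask |= bit;
	return (TRUE);				/* caller should signal the processor */
}
#endif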
3437
3438 #if defined(CONFIG_SCHED_TRADITIONAL)
3439
3440 static boolean_t
3441 processor_queue_empty(processor_t processor)
3442 {
3443 return runq_for_processor(processor)->count == 0;
3444
3445 }
3446
3447 static boolean_t
3448 sched_traditional_with_pset_runqueue_processor_queue_empty(processor_t processor)
3449 {
3450 processor_set_t pset = processor->processor_set;
3451 int count = runq_for_processor(processor)->count;
3452
3453 /*
3454 * The pset runq contains the count of all runnable threads
3455 * for all processors in the pset. However, for threads that
3456 * are bound to another processor, the current "processor"
3457 * is not eligible to execute the thread. So we only
3458 * include bound threads that are bound to the current
3459 * "processor". This allows the processor to idle when the
3460 * count of eligible threads drops to 0, even if there's
3461 * a runnable thread bound to a different processor in the
3462 * shared runq.
3463 */
3464
3465 count -= pset->pset_runq_bound_count;
3466 count += processor->runq_bound_count;
3467
3468 return count == 0;
3469 }
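/*
 * Worked example (illustrative): if the shared pset runq holds 3
 * runnable threads, 2 of which are bound to other processors
 * (pset_runq_bound_count == 2) and none bound here
 * (runq_bound_count == 0), the eligible count is 3 - 2 + 0 == 1,
 * so this processor does not report its queue as empty.
 */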
3470
3471 static ast_t
3472 processor_csw_check(processor_t processor)
3473 {
3474 run_queue_t runq;
3475 boolean_t has_higher;
3476
3477 assert(processor->active_thread != NULL);
3478
3479 runq = runq_for_processor(processor);
3480 if (first_timeslice(processor)) {
3481 has_higher = (runq->highq > processor->current_pri);
3482 } else {
3483 has_higher = (runq->highq >= processor->current_pri);
3484 }
3485 if (has_higher) {
3486 if (runq->urgency > 0)
3487 return (AST_PREEMPT | AST_URGENT);
3488
3489 if (processor->active_thread && thread_eager_preemption(processor->active_thread))
3490 return (AST_PREEMPT | AST_URGENT);
3491
3492 return AST_PREEMPT;
3493 }
3494
3495 return AST_NONE;
3496 }
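/*
 * Example (illustrative): while the running thread is still within its
 * first timeslice, only a strictly higher-priority thread on the run
 * queue triggers preemption (highq > current_pri); once the first
 * timeslice is used up, an equal-priority thread suffices
 * (highq >= current_pri), which yields round-robin behavior within a
 * priority level at quantum boundaries.
 */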
3497
3498 static boolean_t
3499 processor_queue_has_priority(processor_t processor,
3500 int priority,
3501 boolean_t gte)
3502 {
3503 if (gte)
3504 return runq_for_processor(processor)->highq >= priority;
3505 else
3506 return runq_for_processor(processor)->highq > priority;
3507 }
3508
3509 static boolean_t
3510 should_current_thread_rechoose_processor(processor_t processor)
3511 {
3512 return (processor->current_pri < BASEPRI_RTQUEUES
3513 && processor->processor_meta != PROCESSOR_META_NULL
3514 && processor->processor_meta->primary != processor);
3515 }
3516
3517 static int
3518 sched_traditional_processor_runq_count(processor_t processor)
3519 {
3520 return runq_for_processor(processor)->count;
3521 }
3522
3523
3524 static uint64_t
3525 sched_traditional_processor_runq_stats_count_sum(processor_t processor)
3526 {
3527 return runq_for_processor(processor)->runq_stats.count_sum;
3528 }
3529
3530 static uint64_t
3531 sched_traditional_with_pset_runqueue_processor_runq_stats_count_sum(processor_t processor)
3532 {
3533 if (processor->cpu_id == processor->processor_set->cpu_set_low)
3534 return runq_for_processor(processor)->runq_stats.count_sum;
3535 else
3536 return 0ULL;
3537 }
3538
3539 #endif /* CONFIG_SCHED_TRADITIONAL */
3540
3541 #define next_pset(p) (((p)->pset_list != PROCESSOR_SET_NULL)? (p)->pset_list: (p)->node->psets)
3542
3543 /*
3544 * choose_next_pset:
3545 *
3546 * Return the next sibling pset containing
3547 * available processors.
3548 *
3549 * Returns the original pset if none other is
3550 * suitable.
3551 */
3552 static processor_set_t
3553 choose_next_pset(
3554 processor_set_t pset)
3555 {
3556 processor_set_t nset = pset;
3557
3558 do {
3559 nset = next_pset(nset);
3560 } while (nset->online_processor_count < 1 && nset != pset);
3561
3562 return (nset);
3563 }
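/*
 * Example (illustrative): with processor sets linked A -> B -> C on a
 * node, starting from A the walk visits B and then C, returning the
 * first set that has at least one online processor; if none qualifies,
 * the loop wraps around and A itself is returned.
 */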
3564
3565 /*
3566 * choose_processor:
3567 *
3568 * Choose a processor for the thread, beginning at
3569 * the pset. Accepts an optional processor hint in
3570 * the pset.
3571 *
3572 * Returns a processor, possibly from a different pset.
3573 *
3574 * The thread must be locked. The pset must be locked,
3575 * and the resulting pset is locked on return.
3576 */
3577 processor_t
3578 choose_processor(
3579 processor_set_t pset,
3580 processor_t processor,
3581 thread_t thread)
3582 {
3583 processor_set_t nset, cset = pset;
3584 processor_meta_t pmeta = PROCESSOR_META_NULL;
3585 processor_t mprocessor;
3586
3587 /*
3588 * Prefer the hinted processor, when appropriate.
3589 */
3590
3591 if (processor != PROCESSOR_NULL) {
3592 if (processor->processor_meta != PROCESSOR_META_NULL)
3593 processor = processor->processor_meta->primary;
3594 }
3595
3596 mprocessor = machine_choose_processor(pset, processor);
3597 if (mprocessor != PROCESSOR_NULL)
3598 processor = mprocessor;
3599
3600 if (processor != PROCESSOR_NULL) {
3601 if (processor->processor_set != pset ||
3602 processor->state == PROCESSOR_INACTIVE ||
3603 processor->state == PROCESSOR_SHUTDOWN ||
3604 processor->state == PROCESSOR_OFF_LINE)
3605 processor = PROCESSOR_NULL;
3606 else
3607 if (processor->state == PROCESSOR_IDLE ||
3608 ((thread->sched_pri >= BASEPRI_RTQUEUES) &&
3609 (processor->current_pri < BASEPRI_RTQUEUES)))
3610 return (processor);
3611 }
3612
3613 /*
3614 * Iterate through the processor sets to locate
3615 * an appropriate processor.
3616 */
3617 do {
3618 /*
3619 * Choose an idle processor.
3620 */
3621 if (!queue_empty(&cset->idle_queue))
3622 return ((processor_t)queue_first(&cset->idle_queue));
3623
3624 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
3625 integer_t lowest_priority = MAXPRI + 1;
3626 integer_t lowest_unpaired = MAXPRI + 1;
3627 uint64_t furthest_deadline = 1;
3628 processor_t lp_processor = PROCESSOR_NULL;
3629 processor_t lp_unpaired = PROCESSOR_NULL;
3630 processor_t fd_processor = PROCESSOR_NULL;
3631
3632 lp_processor = cset->low_pri;
3633 /* Consider hinted processor */
3634 if (lp_processor != PROCESSOR_NULL &&
3635 ((lp_processor->processor_meta == PROCESSOR_META_NULL) ||
3636 ((lp_processor == lp_processor->processor_meta->primary) &&
3637 !queue_empty(&lp_processor->processor_meta->idle_queue))) &&
3638 lp_processor->state != PROCESSOR_INACTIVE &&
3639 lp_processor->state != PROCESSOR_SHUTDOWN &&
3640 lp_processor->state != PROCESSOR_OFF_LINE &&
3641 (lp_processor->current_pri < thread->sched_pri))
3642 return lp_processor;
3643
3644 processor = (processor_t)queue_first(&cset->active_queue);
3645 while (!queue_end(&cset->active_queue, (queue_entry_t)processor)) {
3646 /* Discover the processor executing the
3647 * thread with the lowest priority within
3648 * this pset, or the one with the furthest
3649 * deadline
3650 */
3651 integer_t cpri = processor->current_pri;
3652 if (cpri < lowest_priority) {
3653 lowest_priority = cpri;
3654 lp_processor = processor;
3655 }
3656
3657 if ((cpri >= BASEPRI_RTQUEUES) && (processor->deadline > furthest_deadline)) {
3658 furthest_deadline = processor->deadline;
3659 fd_processor = processor;
3660 }
3661
3662
3663 if (processor->processor_meta != PROCESSOR_META_NULL &&
3664 !queue_empty(&processor->processor_meta->idle_queue)) {
3665 if (cpri < lowest_unpaired) {
3666 lowest_unpaired = cpri;
3667 lp_unpaired = processor;
3668 pmeta = processor->processor_meta;
3669 }
3670 else
3671 if (pmeta == PROCESSOR_META_NULL)
3672 pmeta = processor->processor_meta;
3673 }
3674 processor = (processor_t)queue_next((queue_entry_t)processor);
3675 }
3676
3677 if (thread->sched_pri > lowest_unpaired)
3678 return lp_unpaired;
3679
3680 if (pmeta != PROCESSOR_META_NULL)
3681 return ((processor_t)queue_first(&pmeta->idle_queue));
3682 if (thread->sched_pri > lowest_priority)
3683 return lp_processor;
3684 if (thread->realtime.deadline < furthest_deadline)
3685 return fd_processor;
3686
3687 processor = PROCESSOR_NULL;
3688 }
3689 else {
3690 /*
3691 * Check any hinted processors in the processor set if available.
3692 */
3693 if (cset->low_pri != PROCESSOR_NULL && cset->low_pri->state != PROCESSOR_INACTIVE &&
3694 cset->low_pri->state != PROCESSOR_SHUTDOWN && cset->low_pri->state != PROCESSOR_OFF_LINE &&
3695 (processor == PROCESSOR_NULL ||
3696 (thread->sched_pri > BASEPRI_DEFAULT && cset->low_pri->current_pri < thread->sched_pri))) {
3697 processor = cset->low_pri;
3698 }
3699 else
3700 if (cset->low_count != PROCESSOR_NULL && cset->low_count->state != PROCESSOR_INACTIVE &&
3701 cset->low_count->state != PROCESSOR_SHUTDOWN && cset->low_count->state != PROCESSOR_OFF_LINE &&
3702 (processor == PROCESSOR_NULL || (thread->sched_pri <= BASEPRI_DEFAULT &&
3703 SCHED(processor_runq_count)(cset->low_count) < SCHED(processor_runq_count)(processor)))) {
3704 processor = cset->low_count;
3705 }
3706
3707 /*
3708 * Otherwise, choose an available processor in the set.
3709 */
3710 if (processor == PROCESSOR_NULL) {
3711 processor = (processor_t)dequeue_head(&cset->active_queue);
3712 if (processor != PROCESSOR_NULL)
3713 enqueue_tail(&cset->active_queue, (queue_entry_t)processor);
3714 }
3715
3716 if (processor != PROCESSOR_NULL && pmeta == PROCESSOR_META_NULL) {
3717 if (processor->processor_meta != PROCESSOR_META_NULL &&
3718 !queue_empty(&processor->processor_meta->idle_queue))
3719 pmeta = processor->processor_meta;
3720 }
3721 }
3722
3723 /*
3724 * Move onto the next processor set.
3725 */
3726 nset = next_pset(cset);
3727
3728 if (nset != pset) {
3729 pset_unlock(cset);
3730
3731 cset = nset;
3732 pset_lock(cset);
3733 }
3734 } while (nset != pset);
3735
3736 /*
3737 * Make sure that we pick a running processor,
3738 * and that the correct processor set is locked.
3739 */
3740 do {
3741 if (pmeta != PROCESSOR_META_NULL) {
3742 if (cset != pmeta->primary->processor_set) {
3743 pset_unlock(cset);
3744
3745 cset = pmeta->primary->processor_set;
3746 pset_lock(cset);
3747 }
3748
3749 if (!queue_empty(&pmeta->idle_queue))
3750 return ((processor_t)queue_first(&pmeta->idle_queue));
3751
3752 pmeta = PROCESSOR_META_NULL;
3753 }
3754
3755 /*
3756 * If we haven't been able to choose a processor,
3757 * pick the boot processor and return it.
3758 */
3759 if (processor == PROCESSOR_NULL) {
3760 processor = master_processor;
3761
3762 /*
3763 * Check that the correct processor set is
3764 * returned locked.
3765 */
3766 if (cset != processor->processor_set) {
3767 pset_unlock(cset);
3768
3769 cset = processor->processor_set;
3770 pset_lock(cset);
3771 }
3772
3773 return (processor);
3774 }
3775
3776 /*
3777 * Check that the processor set for the chosen
3778 * processor is locked.
3779 */
3780 if (cset != processor->processor_set) {
3781 pset_unlock(cset);
3782
3783 cset = processor->processor_set;
3784 pset_lock(cset);
3785 }
3786
3787 /*
3788 * We must verify that the chosen processor is still available.
3789 */
3790 if (processor->state == PROCESSOR_INACTIVE ||
3791 processor->state == PROCESSOR_SHUTDOWN || processor->state == PROCESSOR_OFF_LINE)
3792 processor = PROCESSOR_NULL;
3793 } while (processor == PROCESSOR_NULL);
3794
3795 return (processor);
3796 }
3797
3798 /*
3799 * thread_setrun:
3800 *
3801 * Dispatch thread for execution, onto an idle
3802 * processor or run queue, and signal a preemption
3803 * as appropriate.
3804 *
3805 * Thread must be locked.
3806 */
3807 void
3808 thread_setrun(
3809 thread_t thread,
3810 integer_t options)
3811 {
3812 processor_t processor;
3813 processor_set_t pset;
3814
3815 #if DEBUG
3816 assert(thread_runnable(thread));
3817 #endif
3818
3819 /*
3820 * Update priority if needed.
3821 */
3822 if (SCHED(can_update_priority)(thread))
3823 SCHED(update_priority)(thread);
3824
3825 assert(thread->runq == PROCESSOR_NULL);
3826
3827 if (thread->bound_processor == PROCESSOR_NULL) {
3828 /*
3829 * Unbound case.
3830 */
3831 if (thread->affinity_set != AFFINITY_SET_NULL) {
3832 /*
3833 * Use affinity set policy hint.
3834 */
3835 pset = thread->affinity_set->aset_pset;
3836 pset_lock(pset);
3837
3838 processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread);
3839
3840 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
3841 (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
3842 }
3843 else
3844 if (thread->last_processor != PROCESSOR_NULL) {
3845 /*
3846 * Simple (last processor) affinity case.
3847 */
3848 processor = thread->last_processor;
3849 pset = processor->processor_set;
3850 pset_lock(pset);
3851 processor = SCHED(choose_processor)(pset, processor, thread);
3852
3853 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
3854 (uintptr_t)thread_tid(thread), thread->last_processor->cpu_id, processor->cpu_id, processor->state, 0);
3855 }
3856 else {
3857 /*
3858 * No Affinity case:
3859 *
3860 * Utilize a per-task hint to spread threads
3861 * among the available processor sets.
3862 */
3863 task_t task = thread->task;
3864
3865 pset = task->pset_hint;
3866 if (pset == PROCESSOR_SET_NULL)
3867 pset = current_processor()->processor_set;
3868
3869 pset = choose_next_pset(pset);
3870 pset_lock(pset);
3871
3872 processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread);
3873 task->pset_hint = processor->processor_set;
3874
3875 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
3876 (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
3877 }
3878 }
3879 else {
3880 /*
3881 * Bound case:
3882 *
3883 * Unconditionally dispatch on the processor.
3884 */
3885 processor = thread->bound_processor;
3886 pset = processor->processor_set;
3887 pset_lock(pset);
3888
3889 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
3890 (uintptr_t)thread_tid(thread), (uintptr_t)-2, processor->cpu_id, processor->state, 0);
3891 }
3892
3893 /*
3894 * Dispatch the thread on the chosen processor.
3895 */
3896 if (thread->sched_pri >= BASEPRI_RTQUEUES)
3897 realtime_setrun(processor, thread);
3898 else if (thread->sched_mode == TH_MODE_FAIRSHARE)
3899 fairshare_setrun(processor, thread);
3900 else
3901 processor_setrun(processor, thread, options);
3902 }
3903
3904 processor_set_t
3905 task_choose_pset(
3906 task_t task)
3907 {
3908 processor_set_t pset = task->pset_hint;
3909
3910 if (pset != PROCESSOR_SET_NULL)
3911 pset = choose_next_pset(pset);
3912
3913 return (pset);
3914 }
3915
3916 #if defined(CONFIG_SCHED_TRADITIONAL)
3917
3918 /*
3919 * processor_queue_shutdown:
3920 *
3921 * Shutdown a processor run queue by
3922 * re-dispatching non-bound threads.
3923 *
3924 * Associated pset must be locked, and is
3925 * returned unlocked.
3926 */
3927 void
3928 processor_queue_shutdown(
3929 processor_t processor)
3930 {
3931 processor_set_t pset = processor->processor_set;
3932 run_queue_t rq = runq_for_processor(processor);
3933 queue_t queue = rq->queues + rq->highq;
3934 int pri = rq->highq, count = rq->count;
3935 thread_t next, thread;
3936 queue_head_t tqueue;
3937
3938 queue_init(&tqueue);
3939
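	/*
	 * Collect every unbound thread onto a local queue while the pset
	 * lock is still held; the lock is then dropped and the collected
	 * threads are redispatched one at a time via thread_setrun().
	 * Bound threads are left on the processor's run queue.
	 */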
3940 while (count > 0) {
3941 thread = (thread_t)queue_first(queue);
3942 while (!queue_end(queue, (queue_entry_t)thread)) {
3943 next = (thread_t)queue_next((queue_entry_t)thread);
3944
3945 if (thread->bound_processor == PROCESSOR_NULL) {
3946 remqueue((queue_entry_t)thread);
3947
3948 thread->runq = PROCESSOR_NULL;
3949 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
3950 runq_consider_decr_bound_count(processor, thread);
3951 rq->count--;
3952 if (SCHED(priority_is_urgent)(pri)) {
3953 rq->urgency--; assert(rq->urgency >= 0);
3954 }
3955 if (queue_empty(queue)) {
3956 if (pri != IDLEPRI)
3957 clrbit(MAXPRI - pri, rq->bitmap);
3958 rq->highq = MAXPRI - ffsbit(rq->bitmap);
3959 }
3960
3961 enqueue_tail(&tqueue, (queue_entry_t)thread);
3962 }
3963 count--;
3964
3965 thread = next;
3966 }
3967
3968 queue--; pri--;
3969 }
3970
3971 pset_unlock(pset);
3972
3973 while ((thread = (thread_t)dequeue_head(&tqueue)) != THREAD_NULL) {
3974 thread_lock(thread);
3975
3976 thread_setrun(thread, SCHED_TAILQ);
3977
3978 thread_unlock(thread);
3979 }
3980 }
3981
3982 #endif /* CONFIG_SCHED_TRADITIONAL */
3983
3984 /*
3985 * Check for a preemption point in
3986 * the current context.
3987 *
3988 * Called at splsched.
3989 */
3990 ast_t
3991 csw_check(
3992 processor_t processor)
3993 {
3994 processor_set_t pset = processor->processor_set;
3995 ast_t result;
3996
3997 pset_lock(pset);
3998
3999 /* If we were sent a remote AST and interrupted a running processor, acknowledge it here with pset lock held */
4000 pset->pending_AST_cpu_mask &= ~(1U << processor->cpu_id);
4001
4002 result = csw_check_locked(processor, pset);
4003
4004 pset_unlock(pset);
4005
4006 return result;
4007 }
4008
4009 /*
4010 * Check for preemption at splsched with
4011 * pset locked
4012 */
4013 ast_t
4014 csw_check_locked(
4015 processor_t processor,
4016 processor_set_t pset __unused)
4017 {
4018 ast_t result = AST_NONE;
4019 thread_t thread = processor->active_thread;
4020
4021 if (first_timeslice(processor)) {
4022 if (rt_runq.count > 0)
4023 return (AST_PREEMPT | AST_URGENT);
4024 }
4025 else {
4026 if (rt_runq.count > 0) {
4027 if (BASEPRI_RTQUEUES > processor->current_pri)
4028 return (AST_PREEMPT | AST_URGENT);
4029 else
4030 return (AST_PREEMPT);
4031 }
4032 }
4033
4034 result = SCHED(processor_csw_check)(processor);
4035 if (result != AST_NONE)
4036 return (result);
4037
4038 if (SCHED(should_current_thread_rechoose_processor)(processor))
4039 return (AST_PREEMPT);
4040
4041 if (machine_processor_is_inactive(processor))
4042 return (AST_PREEMPT);
4043
4044 if (thread->state & TH_SUSP)
4045 return (AST_PREEMPT);
4046
4047 return (AST_NONE);
4048 }
4049
4050 /*
4051 * set_sched_pri:
4052 *
4053 * Set the scheduled priority of the specified thread.
4054 *
4055 * This may cause the thread to change queues.
4056 *
4057 * Thread must be locked.
4058 */
4059 void
4060 set_sched_pri(
4061 thread_t thread,
4062 int priority)
4063 {
4064 boolean_t removed = thread_run_queue_remove(thread);
4065
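	/*
	 * If the thread was sitting on a run queue it is re-enqueued below at
	 * the new priority.  If it is currently running, the processor's
	 * cached priority is refreshed and a preemption check is performed:
	 * locally for the current thread, or via an AST check for a thread
	 * running on another processor.
	 */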
4066 thread->sched_pri = priority;
4067 if (removed)
4068 thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
4069 else
4070 if (thread->state & TH_RUN) {
4071 processor_t processor = thread->last_processor;
4072
4073 if (thread == current_thread()) {
4074 ast_t preempt;
4075
4076 processor->current_pri = priority;
4077 processor->current_thmode = thread->sched_mode;
4078 if ((preempt = csw_check(processor)) != AST_NONE)
4079 ast_on(preempt);
4080 }
4081 else
4082 if ( processor != PROCESSOR_NULL &&
4083 processor->active_thread == thread )
4084 cause_ast_check(processor);
4085 }
4086 }
4087
4088 #if 0
4089
4090 static void
4091 run_queue_check(
4092 run_queue_t rq,
4093 thread_t thread)
4094 {
4095 queue_t q;
4096 queue_entry_t qe;
4097
4098 if (rq != thread->runq)
4099 panic("run_queue_check: thread runq");
4100
4101 if (thread->sched_pri > MAXPRI || thread->sched_pri < MINPRI)
4102 panic("run_queue_check: thread sched_pri");
4103
4104 q = &rq->queues[thread->sched_pri];
4105 qe = queue_first(q);
4106 while (!queue_end(q, qe)) {
4107 if (qe == (queue_entry_t)thread)
4108 return;
4109
4110 qe = queue_next(qe);
4111 }
4112
4113 panic("run_queue_check: end");
4114 }
4115
4116 #endif /* 0 */
4117
4118 #if defined(CONFIG_SCHED_TRADITIONAL)
4119
4120 /* locks the runqueue itself */
4121
4122 static boolean_t
4123 processor_queue_remove(
4124 processor_t processor,
4125 thread_t thread)
4126 {
4127 void * rqlock;
4128 run_queue_t rq;
4129
4130 rqlock = &processor->processor_set->sched_lock;
4131 rq = runq_for_processor(processor);
4132
4133 simple_lock(rqlock);
4134 if (processor == thread->runq) {
4135 /*
4136 * Thread is on a run queue and we have a lock on
4137 * that run queue.
4138 */
4139 runq_consider_decr_bound_count(processor, thread);
4140 run_queue_remove(rq, thread);
4141 }
4142 else {
4143 /*
4144 * The thread left the run queue before we could
4145 * lock the run queue.
4146 */
4147 assert(thread->runq == PROCESSOR_NULL);
4148 processor = PROCESSOR_NULL;
4149 }
4150
4151 simple_unlock(rqlock);
4152
4153 return (processor != PROCESSOR_NULL);
4154 }
4155
4156 #endif /* CONFIG_SCHED_TRADITIONAL */
4157
4158 /*
4159 * thread_run_queue_remove:
4160 *
4161 * Remove a thread from a current run queue and
4162 * return TRUE if successful.
4163 *
4164 * Thread must be locked.
4165 */
4166 boolean_t
4167 thread_run_queue_remove(
4168 thread_t thread)
4169 {
4170 processor_t processor = thread->runq;
4171
4172 /*
4173 * If processor is PROCESSOR_NULL, the thread will stay out of the
4174 * run queues because the caller locked the thread. Otherwise
4175 * the thread is on a run queue, but could be chosen for dispatch
4176 * and removed.
4177 */
4178 if (processor != PROCESSOR_NULL) {
4179 queue_t q;
4180
4181 /*
4182 * The processor run queues are locked by the
4183 * processor set. Real-time priorities use a
4184 * global queue with a dedicated lock.
4185 */
4186 if (thread->sched_mode == TH_MODE_FAIRSHARE) {
4187 return SCHED(fairshare_queue_remove)(thread);
4188 }
4189
4190 if (thread->sched_pri < BASEPRI_RTQUEUES) {
4191 return SCHED(processor_queue_remove)(processor, thread);
4192 }
4193
4194 simple_lock(&rt_lock);
4195 q = &rt_runq.queue;
4196
4197 if (processor == thread->runq) {
4198 /*
4199 * Thread is on a run queue and we have a lock on
4200 * that run queue.
4201 */
4202 remqueue((queue_entry_t)thread);
4203 SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count);
4204 rt_runq.count--;
4205
4206 thread->runq = PROCESSOR_NULL;
4207 }
4208 else {
4209 /*
4210 * The thread left the run queue before we could
4211 * lock the run queue.
4212 */
4213 assert(thread->runq == PROCESSOR_NULL);
4214 processor = PROCESSOR_NULL;
4215 }
4216
4217 simple_unlock(&rt_lock);
4218 }
4219
4220 return (processor != PROCESSOR_NULL);
4221 }
4222
4223 #if defined(CONFIG_SCHED_TRADITIONAL)
4224
4225 /*
4226 * steal_processor_thread:
4227 *
4228 * Locate a thread to steal from the processor and
4229 * return it.
4230 *
4231 * Associated pset must be locked. Returns THREAD_NULL
4232 * on failure.
4233 */
4234 static thread_t
4235 steal_processor_thread(
4236 processor_t processor)
4237 {
4238 run_queue_t rq = runq_for_processor(processor);
4239 queue_t queue = rq->queues + rq->highq;
4240 int pri = rq->highq, count = rq->count;
4241 thread_t thread;
4242
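	/*
	 * Scan from the highest-priority queue downward; the first unbound
	 * thread found is removed from the run queue and returned.
	 */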
4243 while (count > 0) {
4244 thread = (thread_t)queue_first(queue);
4245 while (!queue_end(queue, (queue_entry_t)thread)) {
4246 if (thread->bound_processor == PROCESSOR_NULL) {
4247 remqueue((queue_entry_t)thread);
4248
4249 thread->runq = PROCESSOR_NULL;
4250 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
4251 runq_consider_decr_bound_count(processor, thread);
4252 rq->count--;
4253 if (SCHED(priority_is_urgent)(pri)) {
4254 rq->urgency--; assert(rq->urgency >= 0);
4255 }
4256 if (queue_empty(queue)) {
4257 if (pri != IDLEPRI)
4258 clrbit(MAXPRI - pri, rq->bitmap);
4259 rq->highq = MAXPRI - ffsbit(rq->bitmap);
4260 }
4261
4262 return (thread);
4263 }
4264 count--;
4265
4266 thread = (thread_t)queue_next((queue_entry_t)thread);
4267 }
4268
4269 queue--; pri--;
4270 }
4271
4272 return (THREAD_NULL);
4273 }
4274
4275 /*
4276 * Locate and steal a thread, beginning
4277 * at the pset.
4278 *
4279 * The pset must be locked, and is returned
4280 * unlocked.
4281 *
4282 * Returns the stolen thread, or THREAD_NULL on
4283 * failure.
4284 */
4285 static thread_t
4286 steal_thread(
4287 processor_set_t pset)
4288 {
4289 processor_set_t nset, cset = pset;
4290 processor_t processor;
4291 thread_t thread;
4292
4293 do {
4294 processor = (processor_t)queue_first(&cset->active_queue);
4295 while (!queue_end(&cset->active_queue, (queue_entry_t)processor)) {
4296 if (runq_for_processor(processor)->count > 0) {
4297 thread = steal_processor_thread(processor);
4298 if (thread != THREAD_NULL) {
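				/*
				 * Requeue the victim processor at the tail of the
				 * active queue so that later scans start with a
				 * different processor.
				 */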
4299 remqueue((queue_entry_t)processor);
4300 enqueue_tail(&cset->active_queue, (queue_entry_t)processor);
4301
4302 pset_unlock(cset);
4303
4304 return (thread);
4305 }
4306 }
4307
4308 processor = (processor_t)queue_next((queue_entry_t)processor);
4309 }
4310
4311 nset = next_pset(cset);
4312
4313 if (nset != pset) {
4314 pset_unlock(cset);
4315
4316 cset = nset;
4317 pset_lock(cset);
4318 }
4319 } while (nset != pset);
4320
4321 pset_unlock(cset);
4322
4323 return (THREAD_NULL);
4324 }
4325
4326 static thread_t steal_thread_disabled(
4327 processor_set_t pset)
4328 {
4329 pset_unlock(pset);
4330
4331 return (THREAD_NULL);
4332 }
4333
4334 #endif /* CONFIG_SCHED_TRADITIONAL */
4335
4336
4337 void
4338 sys_override_cpu_throttle(int flag)
4339 {
4340 if (flag == CPU_THROTTLE_ENABLE)
4341 cpu_throttle_enabled = 1;
4342 if (flag == CPU_THROTTLE_DISABLE)
4343 cpu_throttle_enabled = 0;
4344 }
4345
4346 int
4347 thread_get_urgency(thread_t thread, uint64_t *arg1, uint64_t *arg2)
4348 {
4349 if (thread == NULL || (thread->state & TH_IDLE)) {
4350 *arg1 = 0;
4351 *arg2 = 0;
4352
4353 return (THREAD_URGENCY_NONE);
4354 } else if (thread->sched_mode == TH_MODE_REALTIME) {
4355 *arg1 = thread->realtime.period;
4356 *arg2 = thread->realtime.deadline;
4357
4358 return (THREAD_URGENCY_REAL_TIME);
4359 } else if (cpu_throttle_enabled &&
4360 ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->priority <= MAXPRI_THROTTLE))) {
4361 /*
4362 * Background urgency is applied when both the thread's base and scheduled priorities are at or below MAXPRI_THROTTLE (i.e. the thread is not promoted)
4363 */
4364 *arg1 = thread->sched_pri;
4365 *arg2 = thread->priority;
4366
4367 return (THREAD_URGENCY_BACKGROUND);
4368 } else {
4369 *arg1 = thread->sched_pri;
4370 *arg2 = thread->priority;
4371
4372 return (THREAD_URGENCY_NORMAL);
4373 }
4374 }
4375
4376
4377 /*
4378 * This is the processor idle loop, which just looks for other threads
4379 * to execute. Dedicated processor idle threads invoke this without supplying
4380 * a current thread; a thread may also idle in place here without an asserted wait state.
4381 *
4382 * Returns the next thread to execute if one is dispatched directly.
4383 */
4384
4385 #if 0
4386 #define IDLE_KERNEL_DEBUG_CONSTANT(...) KERNEL_DEBUG_CONSTANT(__VA_ARGS__)
4387 #else
4388 #define IDLE_KERNEL_DEBUG_CONSTANT(...) do { } while(0)
4389 #endif
4390
4391 thread_t
4392 processor_idle(
4393 thread_t thread,
4394 processor_t processor)
4395 {
4396 processor_set_t pset = processor->processor_set;
4397 thread_t new_thread;
4398 int state;
4399 (void)splsched();
4400
4401 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4402 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_START,
4403 (uintptr_t)thread_tid(thread), 0, 0, 0, 0);
4404
4405 SCHED_STATS_CPU_IDLE_START(processor);
4406
4407 timer_switch(&PROCESSOR_DATA(processor, system_state),
4408 mach_absolute_time(), &PROCESSOR_DATA(processor, idle_state));
4409 PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, idle_state);
4410
4411 while (1) {
4412
4413 if (processor->state != PROCESSOR_IDLE) /* unsafe, but worst case we loop around once */
4414 break;
4415 if (pset->pending_AST_cpu_mask & (1U << processor->cpu_id))
4416 break;
4417 if (!SCHED(processor_queue_empty)(processor))
4418 break;
4419 if (rt_runq.count)
4420 break;
4421 #if CONFIG_SCHED_IDLE_IN_PLACE
4422 if (thread != THREAD_NULL) {
4423 /* Did the idle-in-place thread wake up? */
4424 if ((thread->state & (TH_WAIT|TH_SUSP)) != TH_WAIT || thread->wake_active)
4425 break;
4426 }
4427 #endif
4428
4429 IDLE_KERNEL_DEBUG_CONSTANT(
4430 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq.count, SCHED(processor_runq_count)(processor), -1, 0);
4431
4432 machine_track_platform_idle(TRUE);
4433
4434 machine_idle();
4435
4436 machine_track_platform_idle(FALSE);
4437
4438 (void)splsched();
4439
4440 IDLE_KERNEL_DEBUG_CONSTANT(
4441 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq.count, SCHED(processor_runq_count)(processor), -2, 0);
4442
4443 if (processor->state == PROCESSOR_INACTIVE && !machine_processor_is_inactive(processor))
4444 break;
4445 }
4446
4447 timer_switch(&PROCESSOR_DATA(processor, idle_state),
4448 mach_absolute_time(), &PROCESSOR_DATA(processor, system_state));
4449 PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, system_state);
4450
4451 pset_lock(pset);
4452
4453 /* If we were sent a remote AST and came out of idle, acknowledge it here with pset lock held */
4454 pset->pending_AST_cpu_mask &= ~(1U << processor->cpu_id);
4455
4456 state = processor->state;
4457 if (state == PROCESSOR_DISPATCHING) {
4458 /*
4459 * Common case -- cpu dispatched.
4460 */
4461 new_thread = processor->next_thread;
4462 processor->next_thread = THREAD_NULL;
4463 processor->state = PROCESSOR_RUNNING;
4464
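		/*
		 * If a thread of equal or higher urgency became runnable while
		 * we were coming out of idle, push the dispatched thread back
		 * onto a run queue and return THREAD_NULL so the caller
		 * re-selects.
		 */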
4465 if ((new_thread != THREAD_NULL) && (SCHED(processor_queue_has_priority)(processor, new_thread->sched_pri, FALSE) ||
4466 (rt_runq.count > 0 && BASEPRI_RTQUEUES >= new_thread->sched_pri)) ) {
4467 processor->current_pri = IDLEPRI;
4468 processor->current_thmode = TH_MODE_FIXED;
4469 processor->deadline = UINT64_MAX;
4470
4471 pset_unlock(pset);
4472
4473 thread_lock(new_thread);
4474 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REDISPATCH), (uintptr_t)thread_tid(new_thread), new_thread->sched_pri, rt_runq.count, 0, 0);
4475 thread_setrun(new_thread, SCHED_HEADQ);
4476 thread_unlock(new_thread);
4477
4478 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4479 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4480 (uintptr_t)thread_tid(thread), state, 0, 0, 0);
4481
4482 return (THREAD_NULL);
4483 }
4484
4485 pset_unlock(pset);
4486
4487 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4488 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4489 (uintptr_t)thread_tid(thread), state, (uintptr_t)thread_tid(new_thread), 0, 0);
4490
4491 return (new_thread);
4492 }
4493 else
4494 if (state == PROCESSOR_IDLE) {
4495 remqueue((queue_entry_t)processor);
4496
4497 processor->state = PROCESSOR_RUNNING;
4498 processor->current_pri = IDLEPRI;
4499 processor->current_thmode = TH_MODE_FIXED;
4500 processor->deadline = UINT64_MAX;
4501 enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
4502 }
4503 else
4504 if (state == PROCESSOR_INACTIVE) {
4505 processor->state = PROCESSOR_RUNNING;
4506 enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
4507 }
4508 else
4509 if (state == PROCESSOR_SHUTDOWN) {
4510 /*
4511 * Going off-line. Force a
4512 * reschedule.
4513 */
4514 if ((new_thread = processor->next_thread) != THREAD_NULL) {
4515 processor->next_thread = THREAD_NULL;
4516 processor->current_pri = IDLEPRI;
4517 processor->current_thmode = TH_MODE_FIXED;
4518 processor->deadline = UINT64_MAX;
4519
4520 pset_unlock(pset);
4521
4522 thread_lock(new_thread);
4523 thread_setrun(new_thread, SCHED_HEADQ);
4524 thread_unlock(new_thread);
4525
4526 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4527 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4528 (uintptr_t)thread_tid(thread), state, 0, 0, 0);
4529
4530 return (THREAD_NULL);
4531 }
4532 }
4533
4534 pset_unlock(pset);
4535
4536 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4537 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4538 (uintptr_t)thread_tid(thread), state, 0, 0, 0);
4539
4540 return (THREAD_NULL);
4541 }
4542
4543 /*
4544 * Each processor has a dedicated thread which
4545 * executes the idle loop when there is no suitable
4546 * previous context.
4547 */
4548 void
4549 idle_thread(void)
4550 {
4551 processor_t processor = current_processor();
4552 thread_t new_thread;
4553
4554 new_thread = processor_idle(THREAD_NULL, processor);
4555 if (new_thread != THREAD_NULL) {
4556 thread_run(processor->idle_thread, (thread_continue_t)idle_thread, NULL, new_thread);
4557 /*NOTREACHED*/
4558 }
4559
4560 thread_block((thread_continue_t)idle_thread);
4561 /*NOTREACHED*/
4562 }
4563
4564 kern_return_t
4565 idle_thread_create(
4566 processor_t processor)
4567 {
4568 kern_return_t result;
4569 thread_t thread;
4570 spl_t s;
4571
4572 result = kernel_thread_create((thread_continue_t)idle_thread, NULL, MAXPRI_KERNEL, &thread);
4573 if (result != KERN_SUCCESS)
4574 return (result);
4575
4576 s = splsched();
4577 thread_lock(thread);
4578 thread->bound_processor = processor;
4579 processor->idle_thread = thread;
4580 thread->sched_pri = thread->priority = IDLEPRI;
4581 thread->state = (TH_RUN | TH_IDLE);
4582 thread->options |= TH_OPT_IDLE_THREAD;
4583 thread_unlock(thread);
4584 splx(s);
4585
4586 thread_deallocate(thread);
4587
4588 return (KERN_SUCCESS);
4589 }
4590
4591 /*
4592 * sched_startup:
4593 *
4594 * Kicks off scheduler services.
4595 *
4596 * Called at splsched.
4597 */
4598 void
4599 sched_startup(void)
4600 {
4601 kern_return_t result;
4602 thread_t thread;
4603
4604 result = kernel_thread_start_priority((thread_continue_t)sched_init_thread,
4605 (void *)SCHED(maintenance_continuation), MAXPRI_KERNEL, &thread);
4606 if (result != KERN_SUCCESS)
4607 panic("sched_startup");
4608
4609 thread_deallocate(thread);
4610
4611 /*
4612 * Yield to the sched_init_thread once, to
4613 * initialize our own thread after being switched
4614 * back to.
4615 *
4616 * The current thread is the only other thread
4617 * active at this point.
4618 */
4619 thread_block(THREAD_CONTINUE_NULL);
4620 }
4621
4622 #if defined(CONFIG_SCHED_TRADITIONAL)
4623
4624 static volatile uint64_t sched_maintenance_deadline;
4625 static uint64_t sched_tick_last_abstime;
4626 static uint64_t sched_tick_delta;
4627 uint64_t sched_tick_max_delta;
4628 /*
4629 * sched_traditional_maintenance_continue:
4630 *
4631 * Perform periodic bookkeeping functions about ten
4632 * times per second.
4633 */
4634 static void
4635 sched_traditional_maintenance_continue(void)
4636 {
4637 uint64_t sched_tick_ctime;
4638 sched_tick_ctime = mach_absolute_time();
4639
4640 if (__improbable(sched_tick_last_abstime == 0)) {
4641 sched_tick_last_abstime = sched_tick_ctime;
4642 sched_tick_delta = 1;
4643 } else {
4644 sched_tick_delta = (sched_tick_ctime - sched_tick_last_abstime) / sched_tick_interval;
4645 /* Ensure a minimum delta of 1, since the measured interval could be
4646 * slightly smaller than sched_tick_interval due to dispatch
4647 * latencies.
4648 */
4649 sched_tick_delta = MAX(sched_tick_delta, 1);
4650
4651 /* In the event interrupt latencies or platform
4652 * idle events that advanced the timebase resulted
4653 * in periods where no threads were dispatched,
4654 * cap the maximum "tick delta" at SCHED_TICK_MAX_DELTA
4655 * iterations.
4656 */
4657 sched_tick_delta = MIN(sched_tick_delta, SCHED_TICK_MAX_DELTA);
4658
4659 sched_tick_last_abstime = sched_tick_ctime;
4660 sched_tick_max_delta = MAX(sched_tick_delta, sched_tick_max_delta);
4661 }
4662
4663 /* Add a number of pseudo-ticks corresponding to the elapsed interval.
4664 * This could be greater than 1 when substantial intervals pass with
4665 * all processors idle, which is rare in practice.
4666 */
4667
4668 sched_tick += sched_tick_delta;
4669
4670 /*
4671 * Compute various averages.
4672 */
4673 compute_averages(sched_tick_delta);
4674
4675 /*
4676 * Scan the run queues for threads which
4677 * may need to be updated.
4678 */
4679 thread_update_scan();
4680
4681 assert_wait((event_t)sched_traditional_maintenance_continue, THREAD_UNINT);
4682 thread_block((thread_continue_t)sched_traditional_maintenance_continue);
4683 /*NOTREACHED*/
4684 }
4685
4686 static uint64_t sched_maintenance_wakeups;
4687
4688 /*
4689 * Determine if the set of routines formerly driven by a maintenance timer
4690 * must be invoked, based on a deadline comparison. Signals the scheduler
4691 * maintenance thread on deadline expiration. Must be invoked at an interval
4692 * lower than the "sched_tick_interval", currently accomplished by
4693 * invocation via the quantum expiration timer and at context switch time.
4694 * Performance matters: this routine reuses a timestamp approximating the
4695 * current absolute time received from the caller, and should perform
4696 * no more than a comparison against the deadline in the common case.
4697 */
4698 void
4699 sched_traditional_consider_maintenance(uint64_t ctime) {
4700 uint64_t ndeadline, deadline = sched_maintenance_deadline;
4701
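	/*
	 * Only one caller can successfully advance the deadline via the
	 * compare-and-swap below; that caller also issues the wakeup, so the
	 * maintenance thread is signalled at most once per interval.
	 */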
4702 if (__improbable(ctime >= deadline)) {
4703 if (__improbable(current_thread() == sched_maintenance_thread))
4704 return;
4705 OSMemoryBarrier();
4706
4707 ndeadline = ctime + sched_tick_interval;
4708
4709 if (__probable(__sync_bool_compare_and_swap(&sched_maintenance_deadline, deadline, ndeadline))) {
4710 thread_wakeup((event_t)sched_traditional_maintenance_continue);
4711 sched_maintenance_wakeups++;
4712 }
4713 }
4714 }
4715
4716 #endif /* CONFIG_SCHED_TRADITIONAL */
4717
4718 void
4719 sched_init_thread(void (*continuation)(void))
4720 {
4721 thread_block(THREAD_CONTINUE_NULL);
4722
4723 sched_maintenance_thread = current_thread();
4724 continuation();
4725
4726 /*NOTREACHED*/
4727 }
4728
4729 #if defined(CONFIG_SCHED_TRADITIONAL)
4730
4731 /*
4732 * thread_update_scan / runq_scan:
4733 *
4734 * Scan the run queues to account for timesharing threads
4735 * which need to be updated.
4736 *
4737 * Scanner runs in two passes. Pass one squirrels likely
4738 * threads away in an array, pass two does the update.
4739 *
4740 * This is necessary because the run queue is locked for
4741 * the candidate scan, but the thread is locked for the update.
4742 *
4743 * Array should be sized to make forward progress, without
4744 * disabling preemption for long periods.
4745 */
4746
4747 #define THREAD_UPDATE_SIZE 128
4748
4749 static thread_t thread_update_array[THREAD_UPDATE_SIZE];
4750 static int thread_update_count = 0;
4751
4752 /*
4753 * Scan a runq for candidate threads.
4754 *
4755 * Returns TRUE if retry is needed.
4756 */
4757 static boolean_t
4758 runq_scan(
4759 run_queue_t runq)
4760 {
4761 register int count;
4762 register queue_t q;
4763 register thread_t thread;
4764
4765 if ((count = runq->count) > 0) {
4766 q = runq->queues + runq->highq;
4767 while (count > 0) {
4768 queue_iterate(q, thread, thread_t, links) {
4769 if ( thread->sched_stamp != sched_tick &&
4770 (thread->sched_mode == TH_MODE_TIMESHARE) ) {
4771 if (thread_update_count == THREAD_UPDATE_SIZE)
4772 return (TRUE);
4773
4774 thread_update_array[thread_update_count++] = thread;
4775 thread_reference_internal(thread);
4776 }
4777
4778 count--;
4779 }
4780
4781 q--;
4782 }
4783 }
4784
4785 return (FALSE);
4786 }
4787
4788 static void
4789 thread_update_scan(void)
4790 {
4791 boolean_t restart_needed = FALSE;
4792 processor_t processor = processor_list;
4793 processor_set_t pset;
4794 thread_t thread;
4795 spl_t s;
4796
4797 do {
4798 do {
4799 pset = processor->processor_set;
4800
4801 s = splsched();
4802 pset_lock(pset);
4803
4804 restart_needed = runq_scan(runq_for_processor(processor));
4805
4806 pset_unlock(pset);
4807 splx(s);
4808
4809 if (restart_needed)
4810 break;
4811
4812 thread = processor->idle_thread;
4813 if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) {
4814 if (thread_update_count == THREAD_UPDATE_SIZE) {
4815 restart_needed = TRUE;
4816 break;
4817 }
4818
4819 thread_update_array[thread_update_count++] = thread;
4820 thread_reference_internal(thread);
4821 }
4822 } while ((processor = processor->processor_list) != NULL);
4823
4824 /*
4825 * Ok, we now have a collection of candidates -- fix them.
4826 */
4827 while (thread_update_count > 0) {
4828 thread = thread_update_array[--thread_update_count];
4829 thread_update_array[thread_update_count] = THREAD_NULL;
4830
4831 s = splsched();
4832 thread_lock(thread);
4833 if ( !(thread->state & (TH_WAIT)) ) {
4834 if (SCHED(can_update_priority)(thread))
4835 SCHED(update_priority)(thread);
4836 }
4837 thread_unlock(thread);
4838 splx(s);
4839
4840 thread_deallocate(thread);
4841 }
4842 } while (restart_needed);
4843 }
4844
4845 #endif /* CONFIG_SCHED_TRADITIONAL */
4846
4847 boolean_t
4848 thread_eager_preemption(thread_t thread)
4849 {
4850 return ((thread->sched_flags & TH_SFLAG_EAGERPREEMPT) != 0);
4851 }
4852
4853 void
4854 thread_set_eager_preempt(thread_t thread)
4855 {
4856 spl_t x;
4857 processor_t p;
4858 ast_t ast = AST_NONE;
4859
4860 x = splsched();
4861 p = current_processor();
4862
4863 thread_lock(thread);
4864 thread->sched_flags |= TH_SFLAG_EAGERPREEMPT;
4865
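	/*
	 * If the flag was set on the current thread, check for and take any
	 * pending preemption immediately; otherwise poke the processor the
	 * thread was last seen running on so it re-evaluates via an AST check.
	 */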
4866 if (thread == current_thread()) {
4867 thread_unlock(thread);
4868
4869 ast = csw_check(p);
4870 if (ast != AST_NONE) {
4871 (void) thread_block_reason(THREAD_CONTINUE_NULL, NULL, ast);
4872 }
4873 } else {
4874 p = thread->last_processor;
4875
4876 if (p != PROCESSOR_NULL && p->state == PROCESSOR_RUNNING &&
4877 p->active_thread == thread) {
4878 cause_ast_check(p);
4879 }
4880
4881 thread_unlock(thread);
4882 }
4883
4884 splx(x);
4885 }
4886
4887 void
4888 thread_clear_eager_preempt(thread_t thread)
4889 {
4890 spl_t x;
4891
4892 x = splsched();
4893 thread_lock(thread);
4894
4895 thread->sched_flags &= ~TH_SFLAG_EAGERPREEMPT;
4896
4897 thread_unlock(thread);
4898 splx(x);
4899 }
4900 /*
4901 * Scheduling statistics
4902 */
4903 void
4904 sched_stats_handle_csw(processor_t processor, int reasons, int selfpri, int otherpri)
4905 {
4906 struct processor_sched_statistics *stats;
4907 boolean_t to_realtime = FALSE;
4908
4909 stats = &processor->processor_data.sched_stats;
4910 stats->csw_count++;
4911
4912 if (otherpri >= BASEPRI_REALTIME) {
4913 stats->rt_sched_count++;
4914 to_realtime = TRUE;
4915 }
4916
4917 if ((reasons & AST_PREEMPT) != 0) {
4918 stats->preempt_count++;
4919
4920 if (selfpri >= BASEPRI_REALTIME) {
4921 stats->preempted_rt_count++;
4922 }
4923
4924 if (to_realtime) {
4925 stats->preempted_by_rt_count++;
4926 }
4927
4928 }
4929 }
4930
4931 void
4932 sched_stats_handle_runq_change(struct runq_stats *stats, int old_count)
4933 {
4934 uint64_t timestamp = mach_absolute_time();
4935
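	/*
	 * Accumulate a time-weighted sum of run-queue depth: the previous
	 * depth is weighted by the time it was in effect, so an average depth
	 * can later be derived by dividing by total elapsed time.
	 */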
4936 stats->count_sum += (timestamp - stats->last_change_timestamp) * old_count;
4937 stats->last_change_timestamp = timestamp;
4938 }
4939
4940 /*
4941 * For calls from assembly code
4942 */
4943 #undef thread_wakeup
4944 void
4945 thread_wakeup(
4946 event_t x);
4947
4948 void
4949 thread_wakeup(
4950 event_t x)
4951 {
4952 thread_wakeup_with_result(x, THREAD_AWAKENED);
4953 }
4954
4955 boolean_t
4956 preemption_enabled(void)
4957 {
4958 return (get_preemption_level() == 0 && ml_get_interrupts_enabled());
4959 }
4960
4961 #if DEBUG
4962 static boolean_t
4963 thread_runnable(
4964 thread_t thread)
4965 {
4966 return ((thread->state & (TH_RUN|TH_WAIT)) == TH_RUN);
4967 }
4968 #endif /* DEBUG */
4969
4970 static void
4971 sched_timer_deadline_tracking_init(void) {
4972 nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT, &timer_deadline_tracking_bin_1);
4973 nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT, &timer_deadline_tracking_bin_2);
4974 }