osfmk/kern/syscall_subr.c

   1 /*
   2  * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * @OSF_COPYRIGHT@
  30  */
  31 /*
  32  * Mach Operating System
  33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
  34  * All Rights Reserved.
  35  *
  36  * Permission to use, copy, modify and distribute this software and its
  37  * documentation is hereby granted, provided that both the copyright
  38  * notice and this permission notice appear in all copies of the
  39  * software, derivative works or modified versions, and any portions
  40  * thereof, and that both notices appear in supporting documentation.
  41  *
  42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  45  *
  46  * Carnegie Mellon requests users of this software to return to
  47  *
  48  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  49  *  School of Computer Science
  50  *  Carnegie Mellon University
  51  *  Pittsburgh PA 15213-3890
  52  *
  53  * any improvements or extensions that they make and grant Carnegie Mellon
  54  * the rights to redistribute these changes.
  55  */
  56
  57 #include <mach/boolean.h>
  58 #include <mach/thread_switch.h>
  59 #include <ipc/ipc_port.h>
  60 #include <ipc/ipc_space.h>
  61 #include <kern/counters.h>
  62 #include <kern/ipc_kobject.h>
  63 #include <kern/processor.h>
  64 #include <kern/sched.h>
  65 #include <kern/sched_prim.h>
  66 #include <kern/spl.h>
  67 #include <kern/task.h>
  68 #include <kern/thread.h>
  69 #include <kern/policy_internal.h>
  70
  71 #include <mach/policy.h>
  72
  73 #include <kern/syscall_subr.h>
  74 #include <mach/mach_host_server.h>
  75 #include <mach/mach_syscalls.h>
  76 #include <sys/kdebug.h>
  77 #include <kern/ast.h>
  78
/*
 * File-local helpers, defined further down in this file, that apply a
 * priority depression to the calling thread.  They differ only in the
 * unit of the interval: absolute-time units versus milliseconds.
 */
static void thread_depress_abstime(uint64_t interval);
static void thread_depress_ms(mach_msg_timeout_t interval);
  81
  82 /* Called from commpage to take a delayed preemption when exiting
  83  * the "Preemption Free Zone" (PFZ).
  84  */
  85 kern_return_t
  86 pfz_exit(
  87         __unused        struct pfz_exit_args *args)
  88 {
  89         /* For now, nothing special to do.  We'll pick up the ASTs on kernel exit. */
  90
  91         return KERN_SUCCESS;
  92 }
  93
  94
  95 /*
  96  *      swtch and swtch_pri both attempt to context switch (logic in
  97  *      thread_block no-ops the context switch if nothing would happen).
  98  *      A boolean is returned that indicates whether there is anything
  99  *      else runnable.  That's no excuse to spin, though.
 100  */
 101
 102 static void
 103 swtch_continue(void)
 104 {
 105         processor_t     myprocessor;
 106         boolean_t       result;
 107
 108         disable_preemption();
 109         myprocessor = current_processor();
 110         result = SCHED(thread_should_yield)(myprocessor, current_thread());
 111         enable_preemption();
 112
 113         ml_delay_on_yield();
 114
 115         thread_syscall_return(result);
 116         /*NOTREACHED*/
 117 }
 118
 119 boolean_t
 120 swtch(
 121         __unused struct swtch_args *args)
 122 {
 123         processor_t     myprocessor;
 124
 125         disable_preemption();
 126         myprocessor = current_processor();
 127         if (!SCHED(thread_should_yield)(myprocessor, current_thread())) {
 128                 mp_enable_preemption();
 129
 130                 return FALSE;
 131         }
 132         enable_preemption();
 133
 134         counter(c_swtch_block++);
 135
 136         thread_yield_with_continuation((thread_continue_t)swtch_continue, NULL);
 137 }
 138
 139 static void
 140 swtch_pri_continue(void)
 141 {
 142         processor_t     myprocessor;
 143         boolean_t       result;
 144
 145         thread_depress_abort(current_thread());
 146
 147         disable_preemption();
 148         myprocessor = current_processor();
 149         result = SCHED(thread_should_yield)(myprocessor, current_thread());
 150         mp_enable_preemption();
 151
 152         ml_delay_on_yield();
 153
 154         thread_syscall_return(result);
 155         /*NOTREACHED*/
 156 }
 157
 158 boolean_t
 159 swtch_pri(
 160         __unused        struct swtch_pri_args *args)
 161 {
 162         processor_t     myprocessor;
 163
 164         disable_preemption();
 165         myprocessor = current_processor();
 166         if (!SCHED(thread_should_yield)(myprocessor, current_thread())) {
 167                 mp_enable_preemption();
 168
 169                 return FALSE;
 170         }
 171         enable_preemption();
 172
 173         counter(c_swtch_pri_block++);
 174
 175         thread_depress_abstime(thread_depress_time);
 176
 177         thread_yield_with_continuation((thread_continue_t)swtch_pri_continue, NULL);
 178 }
 179
 180 static void
 181 thread_switch_continue(void *parameter, __unused int ret)
 182 {
 183         thread_t self = current_thread();
 184         int option = (int)(intptr_t)parameter;
 185
 186         if (option == SWITCH_OPTION_DEPRESS || option == SWITCH_OPTION_OSLOCK_DEPRESS) {
 187                 thread_depress_abort(self);
 188         }
 189
 190         ml_delay_on_yield();
 191
 192         thread_syscall_return(KERN_SUCCESS);
 193         /*NOTREACHED*/
 194 }
 195
 196 /*
 197  *      thread_switch:
 198  *
 199  *      Context switch.  User may supply thread hint.
 200  */
 201 kern_return_t
 202 thread_switch(
 203         struct thread_switch_args *args)
 204 {
 205         thread_t                        thread = THREAD_NULL;
 206         thread_t                        self = current_thread();
 207         mach_port_name_t                thread_name = args->thread_name;
 208         int                             option = args->option;
 209         mach_msg_timeout_t              option_time = args->option_time;
 210         uint32_t                        scale_factor = NSEC_PER_MSEC;
 211         boolean_t                       depress_option = FALSE;
 212         boolean_t                       wait_option = FALSE;
 213         wait_interrupt_t                interruptible = THREAD_ABORTSAFE;
 214         port_to_thread_options_t        ptt_options = PORT_TO_THREAD_NOT_CURRENT_THREAD;
 215
 216         /*
 217          *      Validate and process option.
 218          *
 219          * OSLock boosting only applies to other threads
 220          * in your same task (even if you have a port for
 221          * a thread in another task)
 222          */
 223         switch (option) {
 224         case SWITCH_OPTION_NONE:
 225                 break;
 226         case SWITCH_OPTION_WAIT:
 227                 wait_option = TRUE;
 228                 break;
 229         case SWITCH_OPTION_DEPRESS:
 230                 depress_option = TRUE;
 231                 break;
 232         case SWITCH_OPTION_DISPATCH_CONTENTION:
 233                 scale_factor = NSEC_PER_USEC;
 234                 wait_option = TRUE;
 235                 interruptible |= THREAD_WAIT_NOREPORT;
 236                 break;
 237         case SWITCH_OPTION_OSLOCK_DEPRESS:
 238                 depress_option = TRUE;
 239                 interruptible |= THREAD_WAIT_NOREPORT;
 240                 ptt_options |= PORT_TO_THREAD_IN_CURRENT_TASK;
 241                 break;
 242         case SWITCH_OPTION_OSLOCK_WAIT:
 243                 wait_option = TRUE;
 244                 interruptible |= THREAD_WAIT_NOREPORT;
 245                 ptt_options |= PORT_TO_THREAD_IN_CURRENT_TASK;
 246                 break;
 247         default:
 248                 return KERN_INVALID_ARGUMENT;
 249         }
 250
 251         /*
 252          * Translate the port name if supplied.
 253          */
 254         if (thread_name != MACH_PORT_NULL) {
 255                 thread = port_name_to_thread(thread_name, ptt_options);
 256         }
 257
 258         if (option == SWITCH_OPTION_OSLOCK_DEPRESS || option == SWITCH_OPTION_OSLOCK_WAIT) {
 259                 if (thread != THREAD_NULL) {
 260                         /*
 261                          * Attempt to kick the lock owner up to our same IO throttling tier.
 262                          * If the thread is currently blocked in throttle_lowpri_io(),
 263                          * it will immediately break out.
 264                          *
 265                          * TODO: SFI break out?
 266                          */
 267                         int new_policy = proc_get_effective_thread_policy(self, TASK_POLICY_IO);
 268
 269                         set_thread_iotier_override(thread, new_policy);
 270                 }
 271         }
 272
 273         /*
 274          * Try to handoff if supplied.
 275          */
 276         if (thread != THREAD_NULL) {
 277                 spl_t s = splsched();
 278
 279                 /* This may return a different thread if the target is pushing on something */
 280                 thread_t pulled_thread = thread_run_queue_remove_for_handoff(thread);
 281
 282                 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SWITCH) | DBG_FUNC_NONE,
 283                     thread_tid(thread), thread->state,
 284                     pulled_thread ? TRUE : FALSE, 0, 0);
 285
 286                 if (pulled_thread != THREAD_NULL) {
 287                         /* We can't be dropping the last ref here */
 288                         thread_deallocate_safe(thread);
 289
 290                         if (wait_option) {
 291                                 assert_wait_timeout((event_t)assert_wait_timeout, interruptible,
 292                                     option_time, scale_factor);
 293                         } else if (depress_option) {
 294                                 thread_depress_ms(option_time);
 295                         }
 296
 297                         thread_run(self, thread_switch_continue, (void *)(intptr_t)option, pulled_thread);
 298                         __builtin_unreachable();
 299                 }
 300
 301                 splx(s);
 302
 303                 thread_deallocate(thread);
 304         }
 305
 306         if (wait_option) {
 307                 assert_wait_timeout((event_t)assert_wait_timeout, interruptible, option_time, scale_factor);
 308         } else {
 309                 disable_preemption();
 310                 bool should_yield = SCHED(thread_should_yield)(current_processor(), current_thread());
 311                 enable_preemption();
 312
 313                 if (should_yield == false) {
 314                         /* Early-return if yielding to the scheduler will not be beneficial */
 315                         return KERN_SUCCESS;
 316                 }
 317
 318                 if (depress_option) {
 319                         thread_depress_ms(option_time);
 320                 }
 321         }
 322
 323         thread_yield_with_continuation(thread_switch_continue, (void *)(intptr_t)option);
 324         __builtin_unreachable();
 325 }
 326
 327 void
 328 thread_yield_with_continuation(
 329         thread_continue_t       continuation,
 330         void                            *parameter)
 331 {
 332         assert(continuation);
 333         thread_block_reason(continuation, parameter, AST_YIELD);
 334         __builtin_unreachable();
 335 }
 336
 337 /* This function is called after an assert_wait(), therefore it must not
 338  * cause another wait until after the thread_run() or thread_block()
 339  *
 340  * Following are the calling convention for thread ref deallocation.
 341  *
 342  * 1) If no continuation is provided, then thread ref is consumed.
 343  * (thread_handoff_deallocate convention).
 344  *
 345  * 2) If continuation is provided with option THREAD_HANDOFF_SETRUN_NEEDED
 346  * then thread ref is always consumed.
 347  *
 348  * 3) If continuation is provided with option THREAD_HANDOFF_NONE then thread
 349  * ref is not consumed and it is upto the continuation to deallocate
 350  * the thread reference.
 351  */
 352 static wait_result_t
 353 thread_handoff_internal(thread_t thread, thread_continue_t continuation,
 354     void *parameter, thread_handoff_option_t option)
 355 {
 356         thread_t self = current_thread();
 357
 358         /*
 359          * Try to handoff if supplied.
 360          */
 361         if (thread != THREAD_NULL) {
 362                 spl_t s = splsched();
 363
 364                 thread_t pulled_thread = thread_prepare_for_handoff(thread, option);
 365
 366                 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SWITCH) | DBG_FUNC_NONE,
 367                     thread_tid(thread), thread->state,
 368                     pulled_thread ? TRUE : FALSE, 0, 0);
 369
 370                 /* Deallocate thread ref if needed */
 371                 if (continuation == NULL || (option & THREAD_HANDOFF_SETRUN_NEEDED)) {
 372                         /* Use the safe version of thread deallocate */
 373                         thread_deallocate_safe(thread);
 374                 }
 375
 376                 if (pulled_thread != THREAD_NULL) {
 377                         int result = thread_run(self, continuation, parameter, pulled_thread);
 378
 379                         splx(s);
 380                         return result;
 381                 }
 382
 383                 splx(s);
 384         }
 385
 386         int result = thread_block_parameter(continuation, parameter);
 387         return result;
 388 }
 389
 390 void
 391 thread_handoff_parameter(thread_t thread, thread_continue_t continuation,
 392     void *parameter, thread_handoff_option_t option)
 393 {
 394         thread_handoff_internal(thread, continuation, parameter, option);
 395         panic("NULL continuation passed to %s", __func__);
 396         __builtin_unreachable();
 397 }
 398
 399 wait_result_t
 400 thread_handoff_deallocate(thread_t thread, thread_handoff_option_t option)
 401 {
 402         return thread_handoff_internal(thread, NULL, NULL, option);
 403 }
 404
/*
 * Thread depression
 *
 * This mechanism drops a thread to priority 0 in order for it to yield to
 * all other runnable threads on the system.  It can be canceled or timed out,
 * whereupon the thread goes back to where it was.
 *
 * Note that TH_SFLAG_DEPRESS and TH_SFLAG_POLLDEPRESS are never set at the
 * same time.  DEPRESS always defers to POLLDEPRESS.
 *
 * DEPRESS only lasts across a single thread_block call, and never returns
 * to userspace.
 * POLLDEPRESS can be active anywhere up until thread termination.
 */
 419
 420 /*
 421  * Depress thread's priority to lowest possible for the specified interval,
 422  * with an interval of zero resulting in no timeout being scheduled.
 423  *
 424  * Must block with AST_YIELD afterwards to take effect
 425  */
 426 void
 427 thread_depress_abstime(uint64_t interval)
 428 {
 429         thread_t self = current_thread();
 430
 431         spl_t s = splsched();
 432         thread_lock(self);
 433
 434         assert((self->sched_flags & TH_SFLAG_DEPRESS) == 0);
 435
 436         if ((self->sched_flags & TH_SFLAG_POLLDEPRESS) == 0) {
 437                 self->sched_flags |= TH_SFLAG_DEPRESS;
 438                 thread_recompute_sched_pri(self, SETPRI_LAZY);
 439
 440                 if (interval != 0) {
 441                         uint64_t deadline;
 442
 443                         clock_absolutetime_interval_to_deadline(interval, &deadline);
 444                         if (!timer_call_enter(&self->depress_timer, deadline, TIMER_CALL_USER_CRITICAL)) {
 445                                 self->depress_timer_active++;
 446                         }
 447                 }
 448         }
 449
 450         thread_unlock(self);
 451         splx(s);
 452 }
 453
 454 void
 455 thread_depress_ms(mach_msg_timeout_t interval)
 456 {
 457         uint64_t abstime;
 458
 459         clock_interval_to_absolutetime_interval(interval, NSEC_PER_MSEC, &abstime);
 460         thread_depress_abstime(abstime);
 461 }
 462
 463 /*
 464  *      Priority depression expiration.
 465  */
 466 void
 467 thread_depress_expire(void      *p0,
 468     __unused void      *p1)
 469 {
 470         thread_t thread = (thread_t)p0;
 471
 472         spl_t s = splsched();
 473         thread_lock(thread);
 474
 475         assert((thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) != TH_SFLAG_DEPRESSED_MASK);
 476
 477         if (--thread->depress_timer_active == 0) {
 478                 thread->sched_flags &= ~TH_SFLAG_DEPRESSED_MASK;
 479                 if ((thread->state & TH_RUN) == TH_RUN) {
 480                         thread->last_basepri_change_time = mach_absolute_time();
 481                 }
 482                 thread_recompute_sched_pri(thread, SETPRI_DEFAULT);
 483         }
 484
 485         thread_unlock(thread);
 486         splx(s);
 487 }
 488
 489 /*
 490  * Prematurely abort priority depression if there is one.
 491  */
 492 kern_return_t
 493 thread_depress_abort(thread_t thread)
 494 {
 495         kern_return_t result = KERN_NOT_DEPRESSED;
 496
 497         spl_t s = splsched();
 498         thread_lock(thread);
 499
 500         assert((thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) != TH_SFLAG_DEPRESSED_MASK);
 501
 502         /*
 503          * User-triggered depress-aborts should not get out
 504          * of the poll-depress, but they should cancel a regular depress.
 505          */
 506         if ((thread->sched_flags & TH_SFLAG_POLLDEPRESS) == 0) {
 507                 result = thread_depress_abort_locked(thread);
 508         }
 509
 510         thread_unlock(thread);
 511         splx(s);
 512
 513         return result;
 514 }
 515
 516 /*
 517  * Prematurely abort priority depression or poll depression if one is active.
 518  * Called with the thread locked.
 519  */
 520 kern_return_t
 521 thread_depress_abort_locked(thread_t thread)
 522 {
 523         if ((thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) == 0) {
 524                 return KERN_NOT_DEPRESSED;
 525         }
 526
 527         assert((thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) != TH_SFLAG_DEPRESSED_MASK);
 528
 529         thread->sched_flags &= ~TH_SFLAG_DEPRESSED_MASK;
 530         if ((thread->state & TH_RUN) == TH_RUN) {
 531                 thread->last_basepri_change_time = mach_absolute_time();
 532         }
 533
 534         thread_recompute_sched_pri(thread, SETPRI_LAZY);
 535
 536         if (timer_call_cancel(&thread->depress_timer)) {
 537                 thread->depress_timer_active--;
 538         }
 539
 540         return KERN_SUCCESS;
 541 }
 542
 543 /*
 544  * Invoked as part of a polling operation like a no-timeout port receive
 545  *
 546  * Forces a fixpri thread to yield if it is detected polling without blocking for too long.
 547  */
 548 void
 549 thread_poll_yield(thread_t self)
 550 {
 551         assert(self == current_thread());
 552         assert((self->sched_flags & TH_SFLAG_DEPRESS) == 0);
 553
 554         if (self->sched_mode != TH_MODE_FIXED) {
 555                 return;
 556         }
 557
 558         spl_t s = splsched();
 559
 560         uint64_t abstime = mach_absolute_time();
 561         uint64_t total_computation = abstime -
 562             self->computation_epoch + self->computation_metered;
 563
 564         if (total_computation >= max_poll_computation) {
 565                 thread_lock(self);
 566
 567                 self->computation_epoch   = abstime;
 568                 self->computation_metered = 0;
 569
 570                 uint64_t yield_expiration = abstime +
 571                     (total_computation >> sched_poll_yield_shift);
 572
 573                 if (!timer_call_enter(&self->depress_timer, yield_expiration,
 574                     TIMER_CALL_USER_CRITICAL)) {
 575                         self->depress_timer_active++;
 576                 }
 577
 578                 self->sched_flags |= TH_SFLAG_POLLDEPRESS;
 579                 thread_recompute_sched_pri(self, SETPRI_DEFAULT);
 580
 581                 thread_unlock(self);
 582         }
 583         splx(s);
 584 }
 585
 586 /*
 587  * Kernel-internal interface to yield for a specified period
 588  *
 589  * WARNING: Will still yield to priority 0 even if the thread is holding a contended lock!
 590  */
 591 void
 592 thread_yield_internal(mach_msg_timeout_t ms)
 593 {
 594         thread_t self = current_thread();
 595
 596         assert((self->sched_flags & TH_SFLAG_DEPRESSED_MASK) != TH_SFLAG_DEPRESSED_MASK);
 597
 598         processor_t     myprocessor;
 599
 600         disable_preemption();
 601         myprocessor = current_processor();
 602         if (!SCHED(thread_should_yield)(myprocessor, self)) {
 603                 mp_enable_preemption();
 604
 605                 return;
 606         }
 607         enable_preemption();
 608
 609         thread_depress_ms(ms);
 610
 611         thread_block_reason(THREAD_CONTINUE_NULL, NULL, AST_YIELD);
 612
 613         thread_depress_abort(self);
 614 }
 615
 616 /*
 617  * This yields to a possible non-urgent preemption pending on the current processor.
 618  *
 619  * This is useful when doing a long computation in the kernel without returning to userspace.
 620  *
 621  * As opposed to other yielding mechanisms, this does not drop the priority of the current thread.
 622  */
 623 void
 624 thread_yield_to_preemption()
 625 {
 626         /*
 627          * ast_pending() should ideally be called with interrupts disabled, but
 628          * the check here is fine because csw_check() will do the right thing.
 629          */
 630         ast_t *pending_ast = ast_pending();
 631         ast_t ast = AST_NONE;
 632         processor_t p;
 633
 634         if (*pending_ast & AST_PREEMPT) {
 635                 thread_t self = current_thread();
 636
 637                 spl_t s = splsched();
 638
 639                 p = current_processor();
 640                 thread_lock(self);
 641                 ast = csw_check(self, p, AST_YIELD);
 642                 ast_on(ast);
 643                 thread_unlock(self);
 644
 645                 if (ast != AST_NONE) {
 646                         (void)thread_block_reason(THREAD_CONTINUE_NULL, NULL, ast);
 647                 }
 648
 649                 splx(s);
 650         }
 651 }