/*
 * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */

#include <mach/boolean.h>
#include <mach/thread_switch.h>
#include <ipc/ipc_port.h>
#include <ipc/ipc_space.h>
#include <kern/counters.h>
#include <kern/ipc_kobject.h>
#include <kern/processor.h>
#include <kern/sched.h>
#include <kern/sched_prim.h>
#include <kern/spl.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/policy_internal.h>

#include <mach/policy.h>

#include <kern/syscall_subr.h>
#include <mach/mach_host_server.h>
#include <mach/mach_syscalls.h>
#include <sys/kdebug.h>
#include <kern/ast.h>

#ifdef MACH_BSD
extern void workqueue_thread_yielded(void);
extern sched_call_t workqueue_get_sched_callback(void);
#endif /* MACH_BSD */

extern wait_result_t thread_handoff_reason(thread_t thread, ast_t reason);

/* Called from commpage to take a delayed preemption when exiting
 * the "Preemption Free Zone" (PFZ).
 */
kern_return_t
pfz_exit(
	__unused struct pfz_exit_args *args)
{
	/* For now, nothing special to do.  We'll pick up the ASTs on kernel exit. */

	return (KERN_SUCCESS);
}
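
/*
 * Background sketch: the PFZ is the short stretch of commpage code (for
 * example, the commpage atomic-FIFO helpers) during which the kernel holds
 * off preemption.  When that code notices a preemption was deferred, it
 * traps into pfz_exit() on its way out; the deferred AST is then taken on
 * the normal kernel-exit path, which is why the handler body is empty.
 */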

/*
 * swtch and swtch_pri both attempt to context switch (logic in
 * thread_block no-ops the context switch if nothing would happen).
 * A boolean is returned that indicates whether there is anything
 * else runnable.  That's no excuse to spin, though.
 */

static void
swtch_continue(void)
{
	processor_t	myprocessor;
	boolean_t	result;

	disable_preemption();
	myprocessor = current_processor();
	result = !SCHED(processor_queue_empty)(myprocessor) || rt_runq.count > 0;
	enable_preemption();

	thread_syscall_return(result);
	/*NOTREACHED*/
}

boolean_t
swtch(
	__unused struct swtch_args *args)
{
	processor_t	myprocessor;
	boolean_t	result;

	disable_preemption();
	myprocessor = current_processor();
	if (SCHED(processor_queue_empty)(myprocessor) && rt_runq.count == 0) {
		mp_enable_preemption();

		return (FALSE);
	}
	enable_preemption();

	counter(c_swtch_block++);

	thread_block_reason((thread_continue_t)swtch_continue, NULL, AST_YIELD);

	disable_preemption();
	myprocessor = current_processor();
	result = !SCHED(processor_queue_empty)(myprocessor) || rt_runq.count > 0;
	enable_preemption();

	return (result);
}
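
/*
 * Usage sketch (illustrative only): from user space the trap is normally
 * reached through the wrapper declared in <mach/mach_traps.h>; the
 * try_lock()/backoff() names below are placeholders:
 *
 *	extern boolean_t swtch(void);
 *
 *	while (!try_lock(&lock)) {
 *		if (!swtch())
 *			backoff();    nothing else was runnable, so back
 *			              off rather than spinning
 *	}
 */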

static void
swtch_pri_continue(void)
{
	processor_t	myprocessor;
	boolean_t	result;

	thread_depress_abort_internal(current_thread());

	disable_preemption();
	myprocessor = current_processor();
	result = !SCHED(processor_queue_empty)(myprocessor) || rt_runq.count > 0;
	mp_enable_preemption();

	thread_syscall_return(result);
	/*NOTREACHED*/
}

boolean_t
swtch_pri(
	__unused struct swtch_pri_args *args)
{
	processor_t	myprocessor;
	boolean_t	result;

	disable_preemption();
	myprocessor = current_processor();
	if (SCHED(processor_queue_empty)(myprocessor) && rt_runq.count == 0) {
		mp_enable_preemption();

		return (FALSE);
	}
	enable_preemption();

	counter(c_swtch_pri_block++);

	thread_depress_abstime(thread_depress_time);

	thread_block_reason((thread_continue_t)swtch_pri_continue, NULL, AST_YIELD);

	thread_depress_abort_internal(current_thread());

	disable_preemption();
	myprocessor = current_processor();
	result = !SCHED(processor_queue_empty)(myprocessor) || rt_runq.count > 0;
	enable_preemption();

	return (result);
}

static boolean_t
thread_switch_disable_workqueue_sched_callback(void)
{
	sched_call_t callback = workqueue_get_sched_callback();
	return thread_disable_sched_call(current_thread(), callback) != NULL;
}

static void
thread_switch_enable_workqueue_sched_callback(void)
{
	sched_call_t callback = workqueue_get_sched_callback();
	thread_reenable_sched_call(current_thread(), callback);
}

static void
thread_switch_continue(void)
{
	thread_t self = current_thread();
	int option = self->saved.swtch.option;
	boolean_t reenable_workq_callback = self->saved.swtch.reenable_workq_callback;

	if (option == SWITCH_OPTION_DEPRESS || option == SWITCH_OPTION_OSLOCK_DEPRESS)
		thread_depress_abort_internal(self);

	if (reenable_workq_callback)
		thread_switch_enable_workqueue_sched_callback();

	thread_syscall_return(KERN_SUCCESS);
	/*NOTREACHED*/
}

/*
 * thread_switch:
 *
 *	Context switch.  User may supply thread hint.
 */
kern_return_t
thread_switch(
	struct thread_switch_args *args)
{
	thread_t		thread = THREAD_NULL;
	thread_t		self = current_thread();
	mach_port_name_t	thread_name = args->thread_name;
	int			option = args->option;
	mach_msg_timeout_t	option_time = args->option_time;
	uint32_t		scale_factor = NSEC_PER_MSEC;
	boolean_t		reenable_workq_callback = FALSE;
	boolean_t		depress_option = FALSE;
	boolean_t		wait_option = FALSE;

	/*
	 *	Validate and process option.
	 */
	switch (option) {

	case SWITCH_OPTION_NONE:
		workqueue_thread_yielded();
		break;
	case SWITCH_OPTION_WAIT:
		wait_option = TRUE;
		workqueue_thread_yielded();
		break;
	case SWITCH_OPTION_DEPRESS:
		depress_option = TRUE;
		workqueue_thread_yielded();
		break;
	case SWITCH_OPTION_DISPATCH_CONTENTION:
		scale_factor = NSEC_PER_USEC;
		wait_option = TRUE;
		if (thread_switch_disable_workqueue_sched_callback())
			reenable_workq_callback = TRUE;
		break;
	case SWITCH_OPTION_OSLOCK_DEPRESS:
		depress_option = TRUE;
		if (thread_switch_disable_workqueue_sched_callback())
			reenable_workq_callback = TRUE;
		break;
	case SWITCH_OPTION_OSLOCK_WAIT:
		wait_option = TRUE;
		if (thread_switch_disable_workqueue_sched_callback())
			reenable_workq_callback = TRUE;
		break;
	default:
		return (KERN_INVALID_ARGUMENT);
	}

	/*
	 * Translate the port name if supplied.
	 */
	if (thread_name != MACH_PORT_NULL) {
		ipc_port_t port;

		if (ipc_port_translate_send(self->task->itk_space,
		                            thread_name, &port) == KERN_SUCCESS) {
			ip_reference(port);
			ip_unlock(port);

			thread = convert_port_to_thread(port);
			ip_release(port);

			if (thread == self) {
				thread_deallocate(thread);
				thread = THREAD_NULL;
			}
		}
	}

	if (option == SWITCH_OPTION_OSLOCK_DEPRESS || option == SWITCH_OPTION_OSLOCK_WAIT) {
		if (thread != THREAD_NULL) {
			if (thread->task != self->task) {
				/*
				 * OSLock boosting only applies to other threads
				 * in your same task (even if you have a port for
				 * a thread in another task)
				 */
				thread_deallocate(thread);
				thread = THREAD_NULL;
			} else {
				/*
				 * Attempt to kick the lock owner up to our same IO throttling tier.
				 * If the thread is currently blocked in throttle_lowpri_io(),
				 * it will immediately break out.
				 *
				 * TODO: SFI break out?
				 */
				int new_policy = proc_get_effective_thread_policy(self, TASK_POLICY_IO);

				set_thread_iotier_override(thread, new_policy);
			}
		}
	}

	/*
	 * Try to handoff if supplied.
	 */
	if (thread != THREAD_NULL) {
		spl_t s = splsched();

		/* This may return a different thread if the target is pushing on something */
		thread_t pulled_thread = thread_run_queue_remove_for_handoff(thread);

		KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SWITCH) | DBG_FUNC_NONE,
		                      thread_tid(thread), thread->state,
		                      pulled_thread ? TRUE : FALSE, 0, 0);

		if (pulled_thread != THREAD_NULL) {
			/* We can't be dropping the last ref here */
			thread_deallocate_safe(thread);

			if (wait_option)
				assert_wait_timeout((event_t)assert_wait_timeout, THREAD_ABORTSAFE,
				                    option_time, scale_factor);
			else if (depress_option)
				thread_depress_ms(option_time);

			self->saved.swtch.option = option;
			self->saved.swtch.reenable_workq_callback = reenable_workq_callback;

			thread_run(self, (thread_continue_t)thread_switch_continue, NULL, pulled_thread);
			/* NOTREACHED */
			panic("returned from thread_run!");
		}

		splx(s);

		thread_deallocate(thread);
	}

	if (wait_option)
		assert_wait_timeout((event_t)assert_wait_timeout, THREAD_ABORTSAFE, option_time, scale_factor);
	else if (depress_option)
		thread_depress_ms(option_time);

	self->saved.swtch.option = option;
	self->saved.swtch.reenable_workq_callback = reenable_workq_callback;

	thread_block_reason((thread_continue_t)thread_switch_continue, NULL, AST_YIELD);

	if (depress_option)
		thread_depress_abort_internal(self);

	if (reenable_workq_callback)
		thread_switch_enable_workqueue_sched_callback();

	return (KERN_SUCCESS);
}
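
/*
 * Usage sketch (illustrative; owner_port stands for a send right to the
 * thread believed to hold a contended resource):
 *
 *	#include <mach/mach.h>
 *	#include <mach/thread_switch.h>
 *
 *	kern_return_t kr = thread_switch(owner_port, SWITCH_OPTION_DEPRESS, 1);
 *
 * With SWITCH_OPTION_DEPRESS the handoff is attempted if the hinted thread
 * is runnable and, either way, the caller's priority is depressed for the
 * given option_time, interpreted in milliseconds (see the scale_factor
 * handling above; SWITCH_OPTION_DISPATCH_CONTENTION uses microseconds).
 */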

/* Returns a +1 thread reference */
thread_t
port_name_to_thread_for_ulock(mach_port_name_t thread_name)
{
	thread_t thread = THREAD_NULL;
	thread_t self = current_thread();

	/*
	 * Translate the port name if supplied.
	 */
	if (thread_name != MACH_PORT_NULL) {
		ipc_port_t port;

		if (ipc_port_translate_send(self->task->itk_space,
		                            thread_name, &port) == KERN_SUCCESS) {
			ip_reference(port);
			ip_unlock(port);

			thread = convert_port_to_thread(port);
			ip_release(port);

			if (thread == THREAD_NULL) {
				return thread;
			}

			if ((thread == self) || (thread->task != self->task)) {
				thread_deallocate(thread);
				thread = THREAD_NULL;
			}
		}
	}

	return thread;
}

/* This function is called after an assert_wait(), therefore it must not
 * cause another wait until after the thread_run() or thread_block() call.
 *
 * Consumes a ref on thread.
 */
wait_result_t
thread_handoff(thread_t thread)
{
	thread_t deallocate_thread = THREAD_NULL;
	thread_t self = current_thread();

	/*
	 * Try to handoff if supplied.
	 */
	if (thread != THREAD_NULL) {
		spl_t s = splsched();

		thread_t pulled_thread = thread_run_queue_remove_for_handoff(thread);

		KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SWITCH) | DBG_FUNC_NONE,
		                      thread_tid(thread), thread->state,
		                      pulled_thread ? TRUE : FALSE, 0, 0);

		if (pulled_thread != THREAD_NULL) {
			/* We can't be dropping the last ref here */
			thread_deallocate_safe(thread);

			int result = thread_run(self, THREAD_CONTINUE_NULL, NULL, pulled_thread);

			splx(s);
			return result;
		}

		splx(s);

		deallocate_thread = thread;
		thread = THREAD_NULL;
	}

	int result = thread_block(THREAD_CONTINUE_NULL);
	if (deallocate_thread != THREAD_NULL) {
		thread_deallocate(deallocate_thread);
	}

	return result;
}
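
/*
 * Typical call pattern (sketch; wait_event and owner_name are placeholder
 * names for whatever the caller is synchronizing on):
 *
 *	thread_t owner = port_name_to_thread_for_ulock(owner_name);
 *	assert_wait(wait_event, THREAD_ABORTSAFE);
 *	wait_result_t wr = thread_handoff(owner);
 *
 * The +1 reference returned by port_name_to_thread_for_ulock() is consumed
 * by thread_handoff(), and nothing between the assert_wait() and the
 * handoff may block.
 */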

/*
 * Depress thread's priority to lowest possible for the specified interval,
 * with a value of zero resulting in no timeout being scheduled.
 */
void
thread_depress_abstime(
	uint64_t interval)
{
	thread_t self = current_thread();
	uint64_t deadline;
	spl_t s;

	s = splsched();
	thread_lock(self);
	if (!(self->sched_flags & TH_SFLAG_DEPRESSED_MASK)) {
		processor_t myprocessor = self->last_processor;

		self->sched_pri = DEPRESSPRI;

		KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY),
		                      (uintptr_t)thread_tid(self),
		                      self->base_pri,
		                      self->sched_pri,
		                      0, /* eventually, 'reason' */
		                      0);

		myprocessor->current_pri = self->sched_pri;
		self->sched_flags |= TH_SFLAG_DEPRESS;

		if (interval != 0) {
			clock_absolutetime_interval_to_deadline(interval, &deadline);
			if (!timer_call_enter(&self->depress_timer, deadline, TIMER_CALL_USER_CRITICAL))
				self->depress_timer_active++;
		}
	}
	thread_unlock(self);
	splx(s);
}

void
thread_depress_ms(
	mach_msg_timeout_t interval)
{
	uint64_t abstime;

	clock_interval_to_absolutetime_interval(
		interval, NSEC_PER_MSEC, &abstime);
	thread_depress_abstime(abstime);
}

/*
 * Priority depression expiration.
 */
void
thread_depress_expire(
	void *p0,
	__unused void *p1)
{
	thread_t thread = p0;
	spl_t s;

	s = splsched();
	thread_lock(thread);
	if (--thread->depress_timer_active == 0) {
		thread->sched_flags &= ~TH_SFLAG_DEPRESSED_MASK;
		thread_recompute_sched_pri(thread, FALSE);
	}
	thread_unlock(thread);
	splx(s);
}

/*
 * Prematurely abort priority depression if there is one.
 */
kern_return_t
thread_depress_abort_internal(
	thread_t thread)
{
	kern_return_t result = KERN_NOT_DEPRESSED;
	spl_t s;

	s = splsched();
	thread_lock(thread);
	if (!(thread->sched_flags & TH_SFLAG_POLLDEPRESS)) {
		if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
			thread->sched_flags &= ~TH_SFLAG_DEPRESSED_MASK;
			thread_recompute_sched_pri(thread, FALSE);
			result = KERN_SUCCESS;
		}

		if (timer_call_cancel(&thread->depress_timer))
			thread->depress_timer_active--;
	}
	thread_unlock(thread);
	splx(s);

	return (result);
}

void
thread_poll_yield(
	thread_t self)
{
	spl_t s;

	assert(self == current_thread());

	s = splsched();
	if (self->sched_mode == TH_MODE_FIXED) {
		uint64_t total_computation, abstime;

		abstime = mach_absolute_time();
		total_computation = abstime - self->computation_epoch;
		total_computation += self->computation_metered;
		if (total_computation >= max_poll_computation) {
			processor_t myprocessor = current_processor();
			ast_t preempt;

			thread_lock(self);
			if (!(self->sched_flags & TH_SFLAG_DEPRESSED_MASK)) {
				self->sched_pri = DEPRESSPRI;

				KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY),
				                      (uintptr_t)thread_tid(self),
				                      self->base_pri,
				                      self->sched_pri,
				                      0, /* eventually, 'reason' */
				                      0);

				myprocessor->current_pri = self->sched_pri;
			}
			self->computation_epoch = abstime;
			self->computation_metered = 0;
			self->sched_flags |= TH_SFLAG_POLLDEPRESS;

			abstime += (total_computation >> sched_poll_yield_shift);
			if (!timer_call_enter(&self->depress_timer, abstime, TIMER_CALL_USER_CRITICAL))
				self->depress_timer_active++;

			if ((preempt = csw_check(myprocessor, AST_NONE)) != AST_NONE)
				ast_on(preempt);

			thread_unlock(self);
		}
	}
	splx(s);
}

void
thread_yield_internal(
	mach_msg_timeout_t ms)
{
	processor_t myprocessor;

	disable_preemption();
	myprocessor = current_processor();
	if (SCHED(processor_queue_empty)(myprocessor) && rt_runq.count == 0) {
		mp_enable_preemption();

		return;
	}
	enable_preemption();

	thread_depress_ms(ms);

	thread_block_reason(THREAD_CONTINUE_NULL, NULL, AST_YIELD);

	thread_depress_abort_internal(current_thread());
}

/*
 * This yields to a possible non-urgent preemption pending on the current processor.
 *
 * This is useful when doing a long computation in the kernel without returning to userspace.
 *
 * As opposed to other yielding mechanisms, this does not drop the priority of the current thread.
 */
void
thread_yield_to_preemption()
{
	/*
	 * ast_pending() should ideally be called with interrupts disabled, but
	 * the check here is fine because csw_check() will do the right thing.
	 */
	ast_t *pending_ast = ast_pending();
	ast_t ast = AST_NONE;
	processor_t p;

	if (*pending_ast & AST_PREEMPT) {
		thread_t self = current_thread();

		spl_t s = splsched();

		p = current_processor();
		thread_lock(self);
		ast = csw_check(p, AST_YIELD);
		ast_on(ast);
		thread_unlock(self);

		if (ast != AST_NONE) {
			(void)thread_block_reason(THREAD_CONTINUE_NULL, NULL, ast);
		}

		splx(s);
	}
}
679