/*
 * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */

#include <mach/boolean.h>
#include <mach/thread_switch.h>
#include <ipc/ipc_port.h>
#include <ipc/ipc_space.h>
#include <kern/counters.h>
#include <kern/ipc_kobject.h>
#include <kern/processor.h>
#include <kern/sched.h>
#include <kern/sched_prim.h>
#include <kern/spl.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/policy_internal.h>

#include <mach/policy.h>

#include <kern/syscall_subr.h>
#include <mach/mach_host_server.h>
#include <mach/mach_syscalls.h>
#include <sys/kdebug.h>

#ifdef MACH_BSD
extern void workqueue_thread_yielded(void);
extern sched_call_t workqueue_get_sched_callback(void);
#endif /* MACH_BSD */

/* Called from commpage to take a delayed preemption when exiting
 * the "Preemption Free Zone" (PFZ).
 */
kern_return_t
pfz_exit(
    __unused struct pfz_exit_args *args)
{
    /* For now, nothing special to do.  We'll pick up the ASTs on kernel exit. */

    return (KERN_SUCCESS);
}

/*
 * swtch and swtch_pri both attempt to context switch (logic in
 * thread_block no-ops the context switch if nothing would happen).
 * A boolean is returned that indicates whether there is anything
 * else runnable.  That's no excuse to spin, though.
 */
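
/*
 * Illustrative caller-side sketch (hypothetical, not part of this file):
 * the corresponding user-level traps are typically used from a spin/backoff
 * loop, e.g.
 *
 *     while (!try_acquire(&lock)) {          // hypothetical lock primitive
 *         if (!swtch_pri(0))                 // pri argument is unused here
 *             back_off_longer();             // hypothetical helper
 *     }
 */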

static void
swtch_continue(void)
{
    processor_t myprocessor;
    boolean_t   result;

    disable_preemption();
    myprocessor = current_processor();
    result = !SCHED(processor_queue_empty)(myprocessor) || rt_runq.count > 0;
    enable_preemption();

    thread_syscall_return(result);
    /*NOTREACHED*/
}

boolean_t
swtch(
    __unused struct swtch_args *args)
{
    processor_t myprocessor;
    boolean_t   result;

    disable_preemption();
    myprocessor = current_processor();
    if (SCHED(processor_queue_empty)(myprocessor) && rt_runq.count == 0) {
        mp_enable_preemption();

        return (FALSE);
    }
    enable_preemption();

    counter(c_swtch_block++);

    thread_block_reason((thread_continue_t)swtch_continue, NULL, AST_YIELD);

    disable_preemption();
    myprocessor = current_processor();
    result = !SCHED(processor_queue_empty)(myprocessor) || rt_runq.count > 0;
    enable_preemption();

    return (result);
}

static void
swtch_pri_continue(void)
{
    processor_t myprocessor;
    boolean_t   result;

    thread_depress_abort_internal(current_thread());

    disable_preemption();
    myprocessor = current_processor();
    result = !SCHED(processor_queue_empty)(myprocessor) || rt_runq.count > 0;
    mp_enable_preemption();

    thread_syscall_return(result);
    /*NOTREACHED*/
}

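/*
 * swtch_pri: like swtch, but first depresses the caller's scheduling priority
 * for the standard depression interval (thread_depress_time) before yielding;
 * the depression is aborted when the thread resumes.
 */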
boolean_t
swtch_pri(
    __unused struct swtch_pri_args *args)
{
    processor_t myprocessor;
    boolean_t   result;

    disable_preemption();
    myprocessor = current_processor();
    if (SCHED(processor_queue_empty)(myprocessor) && rt_runq.count == 0) {
        mp_enable_preemption();

        return (FALSE);
    }
    enable_preemption();

    counter(c_swtch_pri_block++);

    thread_depress_abstime(thread_depress_time);

    thread_block_reason((thread_continue_t)swtch_pri_continue, NULL, AST_YIELD);

    thread_depress_abort_internal(current_thread());

    disable_preemption();
    myprocessor = current_processor();
    result = !SCHED(processor_queue_empty)(myprocessor) || rt_runq.count > 0;
    enable_preemption();

    return (result);
}

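/*
 * The workqueue scheduler callback notifies the BSD workqueue subsystem when
 * one of its threads blocks or unblocks.  It is disabled across a voluntary
 * thread_switch() so that the deliberate yield is not treated as an ordinary
 * blocking event (which could otherwise prompt the workqueue to bring up an
 * additional thread), and re-enabled once the switch completes.
 */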
static boolean_t
thread_switch_disable_workqueue_sched_callback(void)
{
    sched_call_t callback = workqueue_get_sched_callback();
    return thread_disable_sched_call(current_thread(), callback) != NULL;
}

static void
thread_switch_enable_workqueue_sched_callback(void)
{
    sched_call_t callback = workqueue_get_sched_callback();
    thread_reenable_sched_call(current_thread(), callback);
}

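/*
 * Continuation resumed after a thread blocks in thread_switch(): it undoes
 * any priority depression requested by the switch option and re-enables the
 * workqueue scheduler callback before returning to user mode.
 */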
static void
thread_switch_continue(void)
{
    thread_t self = current_thread();
    int option = self->saved.swtch.option;
    boolean_t reenable_workq_callback = self->saved.swtch.reenable_workq_callback;

    if (option == SWITCH_OPTION_DEPRESS || option == SWITCH_OPTION_OSLOCK_DEPRESS)
        thread_depress_abort_internal(self);

    if (reenable_workq_callback)
        thread_switch_enable_workqueue_sched_callback();

    thread_syscall_return(KERN_SUCCESS);
    /*NOTREACHED*/
}

/*
 * thread_switch:
 *
 * Context switch.  User may supply thread hint.
 */
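
/*
 * Illustrative caller-side sketch (hypothetical): a user thread that knows
 * which thread holds a contended resource can direct the switch at it via
 * the corresponding user-level trap, e.g.
 *
 *     kern_return_t kr = thread_switch(owner_port,            // hypothetical port name
 *                                      SWITCH_OPTION_DEPRESS,
 *                                      1);                     // 1 ms
 *
 * SWITCH_OPTION_DISPATCH_CONTENTION interprets option_time in microseconds
 * (see scale_factor below) and is intended for the dispatch/pthread workqueue
 * contention path.
 */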
kern_return_t
thread_switch(
    struct thread_switch_args *args)
{
    thread_t thread = THREAD_NULL;
    thread_t self = current_thread();
    mach_port_name_t thread_name = args->thread_name;
    int option = args->option;
    mach_msg_timeout_t option_time = args->option_time;
    uint32_t scale_factor = NSEC_PER_MSEC;
    boolean_t reenable_workq_callback = FALSE;
    boolean_t depress_option = FALSE;
    boolean_t wait_option = FALSE;

    /*
     * Validate and process option.
     */
    switch (option) {
    case SWITCH_OPTION_NONE:
        workqueue_thread_yielded();
        break;
    case SWITCH_OPTION_WAIT:
        wait_option = TRUE;
        workqueue_thread_yielded();
        break;
    case SWITCH_OPTION_DEPRESS:
        depress_option = TRUE;
        workqueue_thread_yielded();
        break;
    case SWITCH_OPTION_DISPATCH_CONTENTION:
        scale_factor = NSEC_PER_USEC;
        wait_option = TRUE;
        if (thread_switch_disable_workqueue_sched_callback())
            reenable_workq_callback = TRUE;
        break;
    case SWITCH_OPTION_OSLOCK_DEPRESS:
        depress_option = TRUE;
        if (thread_switch_disable_workqueue_sched_callback())
            reenable_workq_callback = TRUE;
        break;
    case SWITCH_OPTION_OSLOCK_WAIT:
        wait_option = TRUE;
        if (thread_switch_disable_workqueue_sched_callback())
            reenable_workq_callback = TRUE;
        break;
    default:
        return (KERN_INVALID_ARGUMENT);
    }

    /*
     * Translate the port name if supplied.
     */
    if (thread_name != MACH_PORT_NULL) {
        ipc_port_t port;

        if (ipc_port_translate_send(self->task->itk_space,
                                    thread_name, &port) == KERN_SUCCESS) {
            ip_reference(port);
            ip_unlock(port);

            thread = convert_port_to_thread(port);
            ip_release(port);

            if (thread == self) {
                thread_deallocate(thread);
                thread = THREAD_NULL;
            }
        }
    }

    if (option == SWITCH_OPTION_OSLOCK_DEPRESS || option == SWITCH_OPTION_OSLOCK_WAIT) {
        if (thread != THREAD_NULL) {
            if (thread->task != self->task) {
                /*
                 * OSLock boosting only applies to other threads
                 * in your same task (even if you have a port for
                 * a thread in another task)
                 */
                thread_deallocate(thread);
                thread = THREAD_NULL;
            } else {
                /*
                 * Attempt to kick the lock owner up to our same IO throttling tier.
                 * If the thread is currently blocked in throttle_lowpri_io(),
                 * it will immediately break out.
                 *
                 * TODO: SFI break out?
                 */
                int new_policy = proc_get_effective_thread_policy(self, TASK_POLICY_IO);

                set_thread_iotier_override(thread, new_policy);
            }
        }
    }

    /*
     * Try to handoff if supplied.
     */
    if (thread != THREAD_NULL) {
        spl_t s = splsched();

        /* This may return a different thread if the target is pushing on something */
        thread_t pulled_thread = thread_run_queue_remove_for_handoff(thread);

        KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SWITCH) | DBG_FUNC_NONE,
                              thread_tid(thread), thread->state,
                              pulled_thread ? TRUE : FALSE, 0, 0);

        if (pulled_thread != THREAD_NULL) {
            /* We can't be dropping the last ref here */
            thread_deallocate_safe(thread);

            if (wait_option)
                assert_wait_timeout((event_t)assert_wait_timeout, THREAD_ABORTSAFE,
                                    option_time, scale_factor);
            else if (depress_option)
                thread_depress_ms(option_time);

            self->saved.swtch.option = option;
            self->saved.swtch.reenable_workq_callback = reenable_workq_callback;

            thread_run(self, (thread_continue_t)thread_switch_continue, NULL, pulled_thread);
            /* NOTREACHED */
            panic("returned from thread_run!");
        }

        splx(s);

        thread_deallocate(thread);
    }

    if (wait_option)
        assert_wait_timeout((event_t)assert_wait_timeout, THREAD_ABORTSAFE, option_time, scale_factor);
    else if (depress_option)
        thread_depress_ms(option_time);

    self->saved.swtch.option = option;
    self->saved.swtch.reenable_workq_callback = reenable_workq_callback;

    thread_block_reason((thread_continue_t)thread_switch_continue, NULL, AST_YIELD);

    if (depress_option)
        thread_depress_abort_internal(self);

    if (reenable_workq_callback)
        thread_switch_enable_workqueue_sched_callback();

    return (KERN_SUCCESS);
}

/*
 * Returns a +1 thread reference for the named thread, or THREAD_NULL if the
 * name does not denote a thread in the caller's task (the caller itself is
 * also rejected).
 */
thread_t
port_name_to_thread_for_ulock(mach_port_name_t thread_name)
{
    thread_t thread = THREAD_NULL;
    thread_t self = current_thread();

    /*
     * Translate the port name if supplied.
     */
    if (thread_name != MACH_PORT_NULL) {
        ipc_port_t port;

        if (ipc_port_translate_send(self->task->itk_space,
                                    thread_name, &port) == KERN_SUCCESS) {
            ip_reference(port);
            ip_unlock(port);

            thread = convert_port_to_thread(port);
            ip_release(port);

            if (thread == THREAD_NULL) {
                return thread;
            }

            if ((thread == self) || (thread->task != self->task)) {
                thread_deallocate(thread);
                thread = THREAD_NULL;
            }
        }
    }

    return thread;
}

/* This function is called after an assert_wait(), therefore it must not
 * cause another wait until after the thread_run() or thread_block().
 *
 * Consumes a ref on thread.
 */
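
/*
 * Typical caller pattern (illustrative sketch; event and thread names are
 * hypothetical):
 *
 *     assert_wait(&wait_event, THREAD_ABORTSAFE);
 *     wait_result_t wr = thread_handoff(owner_thread);   // consumes owner ref
 *
 * used, for example, by the ulock wait path to hand off directly to a lock
 * owner.
 */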
wait_result_t
thread_handoff(thread_t thread)
{
    thread_t deallocate_thread = THREAD_NULL;
    thread_t self = current_thread();

    /*
     * Try to handoff if supplied.
     */
    if (thread != THREAD_NULL) {
        spl_t s = splsched();

        thread_t pulled_thread = thread_run_queue_remove_for_handoff(thread);

        KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SWITCH) | DBG_FUNC_NONE,
                              thread_tid(thread), thread->state,
                              pulled_thread ? TRUE : FALSE, 0, 0);

        if (pulled_thread != THREAD_NULL) {
            /* We can't be dropping the last ref here */
            thread_deallocate_safe(thread);

            int result = thread_run(self, THREAD_CONTINUE_NULL, NULL, pulled_thread);

            splx(s);
            return result;
        }

        splx(s);

        deallocate_thread = thread;
        thread = THREAD_NULL;
    }

    int result = thread_block(THREAD_CONTINUE_NULL);
    if (deallocate_thread != THREAD_NULL) {
        thread_deallocate(deallocate_thread);
    }

    return result;
}

/*
 * Depress thread's priority to lowest possible for the specified interval,
 * with a value of zero resulting in no timeout being scheduled.
 */
void
thread_depress_abstime(
    uint64_t interval)
{
    thread_t self = current_thread();
    uint64_t deadline;
    spl_t s;

    s = splsched();
    thread_lock(self);
    if (!(self->sched_flags & TH_SFLAG_DEPRESSED_MASK)) {
        processor_t myprocessor = self->last_processor;

        self->sched_pri = DEPRESSPRI;

        KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY),
                              (uintptr_t)thread_tid(self),
                              self->base_pri,
                              self->sched_pri,
                              0, /* eventually, 'reason' */
                              0);

        myprocessor->current_pri = self->sched_pri;
        self->sched_flags |= TH_SFLAG_DEPRESS;

        if (interval != 0) {
            clock_absolutetime_interval_to_deadline(interval, &deadline);
            if (!timer_call_enter(&self->depress_timer, deadline, TIMER_CALL_USER_CRITICAL))
                self->depress_timer_active++;
        }
    }
    thread_unlock(self);
    splx(s);
}

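/*
 * Depress the current thread's priority for an interval expressed in
 * milliseconds (converted to absolute time before being applied).
 */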
void
thread_depress_ms(
    mach_msg_timeout_t interval)
{
    uint64_t abstime;

    clock_interval_to_absolutetime_interval(
        interval, NSEC_PER_MSEC, &abstime);
    thread_depress_abstime(abstime);
}

/*
 * Priority depression expiration.
 */
void
thread_depress_expire(
    void *p0,
    __unused void *p1)
{
    thread_t thread = p0;
    spl_t s;

    s = splsched();
    thread_lock(thread);
    if (--thread->depress_timer_active == 0) {
        thread->sched_flags &= ~TH_SFLAG_DEPRESSED_MASK;
        thread_recompute_sched_pri(thread, FALSE);
    }
    thread_unlock(thread);
    splx(s);
}

/*
 * Prematurely abort priority depression if there is one.
 */
kern_return_t
thread_depress_abort_internal(
    thread_t thread)
{
    kern_return_t result = KERN_NOT_DEPRESSED;
    spl_t s;

    s = splsched();
    thread_lock(thread);
    if (!(thread->sched_flags & TH_SFLAG_POLLDEPRESS)) {
        if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
            thread->sched_flags &= ~TH_SFLAG_DEPRESSED_MASK;
            thread_recompute_sched_pri(thread, FALSE);
            result = KERN_SUCCESS;
        }

        if (timer_call_cancel(&thread->depress_timer))
            thread->depress_timer_active--;
    }
    thread_unlock(thread);
    splx(s);

    return (result);
}

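/*
 * thread_poll_yield:
 *
 * If a fixed-priority thread has accumulated enough uninterrupted computation
 * (i.e. it appears to be polling), depress its priority (TH_SFLAG_POLLDEPRESS)
 * for a period proportional to that computation so other threads can run.
 */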
void
thread_poll_yield(
    thread_t self)
{
    spl_t s;

    assert(self == current_thread());

    s = splsched();
    if (self->sched_mode == TH_MODE_FIXED) {
        uint64_t total_computation, abstime;

        abstime = mach_absolute_time();
        total_computation = abstime - self->computation_epoch;
        total_computation += self->computation_metered;
        if (total_computation >= max_poll_computation) {
            processor_t myprocessor = current_processor();
            ast_t preempt;

            thread_lock(self);
            if (!(self->sched_flags & TH_SFLAG_DEPRESSED_MASK)) {
                self->sched_pri = DEPRESSPRI;

                KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY),
                                      (uintptr_t)thread_tid(self),
                                      self->base_pri,
                                      self->sched_pri,
                                      0, /* eventually, 'reason' */
                                      0);

                myprocessor->current_pri = self->sched_pri;
            }
            self->computation_epoch = abstime;
            self->computation_metered = 0;
            self->sched_flags |= TH_SFLAG_POLLDEPRESS;

            abstime += (total_computation >> sched_poll_yield_shift);
            if (!timer_call_enter(&self->depress_timer, abstime, TIMER_CALL_USER_CRITICAL))
                self->depress_timer_active++;

            if ((preempt = csw_check(myprocessor, AST_NONE)) != AST_NONE)
                ast_on(preempt);

            thread_unlock(self);
        }
    }
    splx(s);
}

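/*
 * Yield the current thread for up to 'ms' milliseconds if there is other work
 * queued on this processor: depress priority, block, then abort the depression
 * on resumption.  Returns immediately if nothing else is runnable.
 */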
void
thread_yield_internal(
    mach_msg_timeout_t ms)
{
    processor_t myprocessor;

    disable_preemption();
    myprocessor = current_processor();
    if (SCHED(processor_queue_empty)(myprocessor) && rt_runq.count == 0) {
        mp_enable_preemption();

        return;
    }
    enable_preemption();

    thread_depress_ms(ms);

    thread_block_reason(THREAD_CONTINUE_NULL, NULL, AST_YIELD);

    thread_depress_abort_internal(current_thread());
}

/*
 * This yields to a possible non-urgent preemption pending on the current processor.
 *
 * This is useful when doing a long computation in the kernel without returning to userspace.
 *
 * As opposed to other yielding mechanisms, this does not drop the priority of the current thread.
 */
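
/*
 * Illustrative in-kernel usage sketch (hypothetical loop and helper names):
 *
 *     for (size_t i = 0; i < nentries; i++) {
 *         process_entry(&entries[i]);          // hypothetical work item
 *         if ((i % 1024) == 0)
 *             thread_yield_to_preemption();    // let a pending preemption run
 *     }
 */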
void
thread_yield_to_preemption()
{
    /*
     * ast_pending() should ideally be called with interrupts disabled, but
     * the check here is fine because csw_check() will do the right thing.
     */
    ast_t *pending_ast = ast_pending();
    ast_t ast = AST_NONE;
    processor_t p;

    if (*pending_ast & AST_PREEMPT) {
        thread_t self = current_thread();

        spl_t s = splsched();

        p = current_processor();
        thread_lock(self);
        ast = csw_check(p, AST_YIELD);
        ast_on(ast);
        thread_unlock(self);

        if (ast != AST_NONE) {
            (void)thread_block_reason(THREAD_CONTINUE_NULL, NULL, ast);
        }

        splx(s);
    }
}