apple/xnu.git: osfmk/kern/locks.c (blob 5141535e3eeb9138e404b396c336ce57faa8c541)
1 /*
2 * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56
57 #define ATOMIC_PRIVATE 1
58 #define LOCK_PRIVATE 1
59
60 #include <mach_ldebug.h>
61 #include <debug.h>
62
63 #include <mach/kern_return.h>
64 #include <mach/mach_host_server.h>
65 #include <mach_debug/lockgroup_info.h>
66
67 #include <kern/locks.h>
68 #include <kern/misc_protos.h>
69 #include <kern/kalloc.h>
70 #include <kern/thread.h>
71 #include <kern/processor.h>
72 #include <kern/sched_prim.h>
73 #include <kern/debug.h>
74 #include <machine/atomic.h>
75 #include <machine/machine_cpu.h>
76 #include <string.h>
77
78
79 #include <sys/kdebug.h>
80
81 #if CONFIG_DTRACE
82 /*
83 * We need only enough declarations from the BSD-side to be able to
84 * test if our probe is active, and to call __dtrace_probe(). Setting
85 * NEED_DTRACE_DEFS gets a local copy of those definitions pulled in.
86 */
87 #define NEED_DTRACE_DEFS
88 #include <../bsd/sys/lockstat.h>
89 #endif
90
91 #define LCK_MTX_SLEEP_CODE 0
92 #define LCK_MTX_SLEEP_DEADLINE_CODE 1
93 #define LCK_MTX_LCK_WAIT_CODE 2
94 #define LCK_MTX_UNLCK_WAKEUP_CODE 3
95
96 #if MACH_LDEBUG
97 #define ALIGN_TEST(p,t) do{if((uintptr_t)p&(sizeof(t)-1)) __builtin_trap();}while(0)
98 #else
99 #define ALIGN_TEST(p,t) do{}while(0)
100 #endif
101
102 /* Silence the volatile to _Atomic cast warning */
103 #define ATOMIC_CAST(t,p) ((_Atomic t*)(uintptr_t)(p))
104
105 /* Enforce program order of loads and stores. */
106 #define ordered_load(target, type) \
107 __c11_atomic_load((_Atomic type *)(target), memory_order_relaxed)
108 #define ordered_store(target, type, value) \
109 __c11_atomic_store((_Atomic type *)(target), value, memory_order_relaxed)
110
111 #define ordered_load_hw(lock) ordered_load(&(lock)->lock_data, uintptr_t)
112 #define ordered_store_hw(lock, value) ordered_store(&(lock)->lock_data, uintptr_t, (value))
113
114 #define NOINLINE __attribute__((noinline))
115
116
117 static queue_head_t lck_grp_queue;
118 static unsigned int lck_grp_cnt;
119
120 decl_lck_mtx_data(static,lck_grp_lock)
121 static lck_mtx_ext_t lck_grp_lock_ext;
122
123 lck_grp_attr_t LockDefaultGroupAttr;
124 lck_grp_t LockCompatGroup;
125 lck_attr_t LockDefaultLckAttr;
126
127 /*
128 * Routine: lck_mod_init
129 */
130
131 void
132 lck_mod_init(
133 void)
134 {
135 /*
136 	 * Obtain "lcks" options: this currently controls lock statistics
137 */
138 if (!PE_parse_boot_argn("lcks", &LcksOpts, sizeof (LcksOpts)))
139 LcksOpts = 0;
140
141 queue_init(&lck_grp_queue);
142
143 /*
144 * Need to bootstrap the LockCompatGroup instead of calling lck_grp_init() here. This avoids
145 * grabbing the lck_grp_lock before it is initialized.
146 */
147
148 bzero(&LockCompatGroup, sizeof(lck_grp_t));
149 (void) strncpy(LockCompatGroup.lck_grp_name, "Compatibility APIs", LCK_GRP_MAX_NAME);
150
151 if (LcksOpts & enaLkStat)
152 LockCompatGroup.lck_grp_attr = LCK_GRP_ATTR_STAT;
153 else
154 LockCompatGroup.lck_grp_attr = LCK_ATTR_NONE;
155
156 LockCompatGroup.lck_grp_refcnt = 1;
157
158 enqueue_tail(&lck_grp_queue, (queue_entry_t)&LockCompatGroup);
159 lck_grp_cnt = 1;
160
161 lck_grp_attr_setdefault(&LockDefaultGroupAttr);
162 lck_attr_setdefault(&LockDefaultLckAttr);
163
164 lck_mtx_init_ext(&lck_grp_lock, &lck_grp_lock_ext, &LockCompatGroup, &LockDefaultLckAttr);
165
166 }
167
168 /*
169 * Routine: lck_grp_attr_alloc_init
170 */
171
172 lck_grp_attr_t *
173 lck_grp_attr_alloc_init(
174 void)
175 {
176 lck_grp_attr_t *attr;
177
178 if ((attr = (lck_grp_attr_t *)kalloc(sizeof(lck_grp_attr_t))) != 0)
179 lck_grp_attr_setdefault(attr);
180
181 return(attr);
182 }
183
184
185 /*
186 * Routine: lck_grp_attr_setdefault
187 */
188
189 void
190 lck_grp_attr_setdefault(
191 lck_grp_attr_t *attr)
192 {
193 if (LcksOpts & enaLkStat)
194 attr->grp_attr_val = LCK_GRP_ATTR_STAT;
195 else
196 attr->grp_attr_val = 0;
197 }
198
199
200 /*
201 * Routine: lck_grp_attr_setstat
202 */
203
204 void
205 lck_grp_attr_setstat(
206 lck_grp_attr_t *attr)
207 {
208 (void)hw_atomic_or(&attr->grp_attr_val, LCK_GRP_ATTR_STAT);
209 }
210
211
212 /*
213 * Routine: lck_grp_attr_free
214 */
215
216 void
217 lck_grp_attr_free(
218 lck_grp_attr_t *attr)
219 {
220 kfree(attr, sizeof(lck_grp_attr_t));
221 }
222
223
224 /*
225 * Routine: lck_grp_alloc_init
226 */
227
228 lck_grp_t *
229 lck_grp_alloc_init(
230 const char* grp_name,
231 lck_grp_attr_t *attr)
232 {
233 lck_grp_t *grp;
234
235 if ((grp = (lck_grp_t *)kalloc(sizeof(lck_grp_t))) != 0)
236 lck_grp_init(grp, grp_name, attr);
237
238 return(grp);
239 }
240
241 /*
242 * Routine: lck_grp_init
243 */
244
245 void
246 lck_grp_init(lck_grp_t * grp, const char * grp_name, lck_grp_attr_t * attr)
247 {
248 /* make sure locking infrastructure has been initialized */
249 assert(lck_grp_cnt > 0);
250
251 bzero((void *)grp, sizeof(lck_grp_t));
252
253 (void)strlcpy(grp->lck_grp_name, grp_name, LCK_GRP_MAX_NAME);
254
255 if (attr != LCK_GRP_ATTR_NULL)
256 grp->lck_grp_attr = attr->grp_attr_val;
257 else if (LcksOpts & enaLkStat)
258 grp->lck_grp_attr = LCK_GRP_ATTR_STAT;
259 else
260 grp->lck_grp_attr = LCK_ATTR_NONE;
261
262 grp->lck_grp_refcnt = 1;
263
264 lck_mtx_lock(&lck_grp_lock);
265 enqueue_tail(&lck_grp_queue, (queue_entry_t)grp);
266 lck_grp_cnt++;
267 lck_mtx_unlock(&lck_grp_lock);
268 }
269
270 /*
271 * Routine: lck_grp_free
272 */
273
274 void
275 lck_grp_free(
276 lck_grp_t *grp)
277 {
278 lck_mtx_lock(&lck_grp_lock);
279 lck_grp_cnt--;
280 (void)remque((queue_entry_t)grp);
281 lck_mtx_unlock(&lck_grp_lock);
282 lck_grp_deallocate(grp);
283 }
284
285
286 /*
287 * Routine: lck_grp_reference
288 */
289
290 void
291 lck_grp_reference(
292 lck_grp_t *grp)
293 {
294 (void)hw_atomic_add(&grp->lck_grp_refcnt, 1);
295 }
296
297
298 /*
299 * Routine: lck_grp_deallocate
300 */
301
302 void
303 lck_grp_deallocate(
304 lck_grp_t *grp)
305 {
306 if (hw_atomic_sub(&grp->lck_grp_refcnt, 1) == 0)
307 kfree(grp, sizeof(lck_grp_t));
308 }
309
310 /*
311 * Routine: lck_grp_lckcnt_incr
312 */
313
314 void
315 lck_grp_lckcnt_incr(
316 lck_grp_t *grp,
317 lck_type_t lck_type)
318 {
319 unsigned int *lckcnt;
320
321 switch (lck_type) {
322 case LCK_TYPE_SPIN:
323 lckcnt = &grp->lck_grp_spincnt;
324 break;
325 case LCK_TYPE_MTX:
326 lckcnt = &grp->lck_grp_mtxcnt;
327 break;
328 case LCK_TYPE_RW:
329 lckcnt = &grp->lck_grp_rwcnt;
330 break;
331 default:
332 return panic("lck_grp_lckcnt_incr(): invalid lock type: %d\n", lck_type);
333 }
334
335 (void)hw_atomic_add(lckcnt, 1);
336 }
337
338 /*
339 * Routine: lck_grp_lckcnt_decr
340 */
341
342 void
343 lck_grp_lckcnt_decr(
344 lck_grp_t *grp,
345 lck_type_t lck_type)
346 {
347 unsigned int *lckcnt;
348 int updated;
349
350 switch (lck_type) {
351 case LCK_TYPE_SPIN:
352 lckcnt = &grp->lck_grp_spincnt;
353 break;
354 case LCK_TYPE_MTX:
355 lckcnt = &grp->lck_grp_mtxcnt;
356 break;
357 case LCK_TYPE_RW:
358 lckcnt = &grp->lck_grp_rwcnt;
359 break;
360 default:
361 panic("lck_grp_lckcnt_decr(): invalid lock type: %d\n", lck_type);
362 return;
363 }
364
365 updated = (int)hw_atomic_sub(lckcnt, 1);
366 assert(updated >= 0);
367 }
368
369 /*
370 * Routine: lck_attr_alloc_init
371 */
372
373 lck_attr_t *
374 lck_attr_alloc_init(
375 void)
376 {
377 lck_attr_t *attr;
378
379 if ((attr = (lck_attr_t *)kalloc(sizeof(lck_attr_t))) != 0)
380 lck_attr_setdefault(attr);
381
382 return(attr);
383 }
384
385
386 /*
387 * Routine: lck_attr_setdefault
388 */
389
390 void
391 lck_attr_setdefault(
392 lck_attr_t *attr)
393 {
394 #if __i386__ || __x86_64__
395 #if !DEBUG
396 if (LcksOpts & enaLkDeb)
397 attr->lck_attr_val = LCK_ATTR_DEBUG;
398 else
399 attr->lck_attr_val = LCK_ATTR_NONE;
400 #else
401 attr->lck_attr_val = LCK_ATTR_DEBUG;
402 #endif /* !DEBUG */
403 #else
404 #error Unknown architecture.
405 #endif /* __i386__ || __x86_64__ */
406 }
407
408
409 /*
410 * Routine: lck_attr_setdebug
411 */
412 void
413 lck_attr_setdebug(
414 lck_attr_t *attr)
415 {
416 (void)hw_atomic_or(&attr->lck_attr_val, LCK_ATTR_DEBUG);
417 }
418
419 /*
420  * Routine:	lck_attr_cleardebug
421 */
422 void
423 lck_attr_cleardebug(
424 lck_attr_t *attr)
425 {
426 (void)hw_atomic_and(&attr->lck_attr_val, ~LCK_ATTR_DEBUG);
427 }
428
429
430 /*
431 * Routine: lck_attr_rw_shared_priority
432 */
433 void
434 lck_attr_rw_shared_priority(
435 lck_attr_t *attr)
436 {
437 (void)hw_atomic_or(&attr->lck_attr_val, LCK_ATTR_RW_SHARED_PRIORITY);
438 }
439
440
441 /*
442 * Routine: lck_attr_free
443 */
444 void
445 lck_attr_free(
446 lck_attr_t *attr)
447 {
448 kfree(attr, sizeof(lck_attr_t));
449 }
450
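/*
 * A minimal usage sketch for the group/attribute KPIs above, assuming a
 * hypothetical client; the group name "com.example.subsystem" and the
 * example_* identifiers are placeholders, and the block is compiled out.
 */
#if 0	/* usage sketch only, not compiled */
static lck_grp_attr_t	*example_grp_attr;
static lck_grp_t	*example_grp;
static lck_attr_t	*example_attr;
static lck_mtx_t	*example_mtx;

static void
example_locks_init(void)
{
	example_grp_attr = lck_grp_attr_alloc_init();
	lck_grp_attr_setstat(example_grp_attr);		/* opt this group into statistics */

	example_grp = lck_grp_alloc_init("com.example.subsystem", example_grp_attr);

	example_attr = lck_attr_alloc_init();		/* defaults follow LcksOpts/DEBUG */
	example_mtx = lck_mtx_alloc_init(example_grp, example_attr);
}
#endif
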
451 /*
452 * Routine: hw_lock_init
453 *
454 * Initialize a hardware lock.
455 */
456 void
457 hw_lock_init(hw_lock_t lock)
458 {
459 ordered_store_hw(lock, 0);
460 }
461
462 /*
463 * Routine: hw_lock_lock_contended
464 *
465 * Spin until lock is acquired or timeout expires.
466 * timeout is in mach_absolute_time ticks.
467 * MACH_RT: called with preemption disabled.
468 */
469
470 #if __SMP__
471 static unsigned int NOINLINE
472 hw_lock_lock_contended(hw_lock_t lock, uintptr_t data, uint64_t timeout, boolean_t do_panic)
473 {
474 uint64_t end = 0;
475 uintptr_t holder = lock->lock_data;
476 int i;
477
478 if (timeout == 0)
479 timeout = LOCK_PANIC_TIMEOUT;
480
481 for ( ; ; ) {
482 for (i = 0; i < LOCK_SNOOP_SPINS; i++) {
483 boolean_t wait = FALSE;
484
485 cpu_pause();
486 #if (!__ARM_ENABLE_WFE_) || (LOCK_PRETEST)
487 holder = ordered_load_hw(lock);
488 if (holder != 0)
489 continue;
490 #endif
491 #if __ARM_ENABLE_WFE_
492 wait = TRUE; // Wait for event
493 #endif
494 if (atomic_compare_exchange(&lock->lock_data, 0, data,
495 memory_order_acquire_smp, wait))
496 return 1;
497 }
498 if (end == 0)
499 end = ml_get_timebase() + timeout;
500 else if (ml_get_timebase() >= end)
501 break;
502 }
503 if (do_panic) {
504 // Capture the actual time spent blocked, which may be higher than the timeout
505 // if a misbehaving interrupt stole this thread's CPU time.
506 panic("Spinlock timeout after %llu ticks, %p = %lx",
507 (ml_get_timebase() - end + timeout), lock, holder);
508 }
509 return 0;
510 }
511 #endif // __SMP__
512
513 /*
514 * Routine: hw_lock_lock
515 *
516 * Acquire lock, spinning until it becomes available.
517  *	MACH_RT: also returns with preemption disabled.
518 */
519 void
520 hw_lock_lock(hw_lock_t lock)
521 {
522 thread_t thread;
523 uintptr_t state;
524
525 thread = current_thread();
526 disable_preemption_for_thread(thread);
527 state = LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK;
528 #if __SMP__
529 #if LOCK_PRETEST
530 if (ordered_load_hw(lock))
531 goto contended;
532 #endif // LOCK_PRETEST
533 if (atomic_compare_exchange(&lock->lock_data, 0, state,
534 memory_order_acquire_smp, TRUE))
535 return;
536 #if LOCK_PRETEST
537 contended:
538 #endif // LOCK_PRETEST
539 hw_lock_lock_contended(lock, state, 0, TRUE);
540 #else // __SMP__
541 if (lock->lock_data)
542 panic("Spinlock held %p", lock);
543 lock->lock_data = state;
544 #endif // __SMP__
545 return;
546 }
547
548 /*
549 * Routine: hw_lock_to
550 *
551 * Acquire lock, spinning until it becomes available or timeout.
552 * timeout is in mach_absolute_time ticks.
553  *	MACH_RT: also returns with preemption disabled.
554 */
555 unsigned int
556 hw_lock_to(hw_lock_t lock, uint64_t timeout)
557 {
558 thread_t thread;
559 uintptr_t state;
560
561 thread = current_thread();
562 disable_preemption_for_thread(thread);
563 state = LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK;
564 #if __SMP__
565 #if LOCK_PRETEST
566 if (ordered_load_hw(lock))
567 goto contended;
568 #endif // LOCK_PRETEST
569 if (atomic_compare_exchange(&lock->lock_data, 0, state,
570 memory_order_acquire_smp, TRUE))
571 return 1;
572 #if LOCK_PRETEST
573 contended:
574 #endif // LOCK_PRETEST
575 return hw_lock_lock_contended(lock, state, timeout, FALSE);
576 #else // __SMP__
577 (void)timeout;
578 if (ordered_load_hw(lock) == 0) {
579 ordered_store_hw(lock, state);
580 return 1;
581 }
582 return 0;
583 #endif // __SMP__
584 }
585
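/*
 * A small sketch of a bounded acquisition, assuming a hypothetical caller:
 * hw_lock_to() takes its timeout in mach_absolute_time ticks, so wall-clock
 * bounds are first converted with nanoseconds_to_absolutetime().
 */
#if 0	/* usage sketch only, not compiled */
static unsigned int
example_hw_lock_with_timeout(hw_lock_t lock)
{
	uint64_t ticks;

	nanoseconds_to_absolutetime(NSEC_PER_SEC / 100, &ticks);	/* ~10ms bound */
	return hw_lock_to(lock, ticks);		/* 1 on success, 0 on timeout */
}
#endif
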
586 /*
587 * Routine: hw_lock_try
588 * MACH_RT: returns with preemption disabled on success.
589 */
590 unsigned int
591 hw_lock_try(hw_lock_t lock)
592 {
593 thread_t thread = current_thread();
594 int success = 0;
595 #if LOCK_TRY_DISABLE_INT
596 long intmask;
597
598 intmask = disable_interrupts();
599 #else
600 disable_preemption_for_thread(thread);
601 #endif // LOCK_TRY_DISABLE_INT
602
603 #if __SMP__
604 #if LOCK_PRETEST
605 if (ordered_load_hw(lock))
606 goto failed;
607 #endif // LOCK_PRETEST
608 success = atomic_compare_exchange(&lock->lock_data, 0, LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK,
609 memory_order_acquire_smp, FALSE);
610 #else
611 if (lock->lock_data == 0) {
612 lock->lock_data = LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK;
613 success = 1;
614 }
615 #endif // __SMP__
616
617 #if LOCK_TRY_DISABLE_INT
618 if (success)
619 disable_preemption_for_thread(thread);
620 #if LOCK_PRETEST
621 failed:
622 #endif // LOCK_PRETEST
623 restore_interrupts(intmask);
624 #else
625 #if LOCK_PRETEST
626 failed:
627 #endif // LOCK_PRETEST
628 if (!success)
629 enable_preemption();
630 #endif // LOCK_TRY_DISABLE_INT
631 return success;
632 }
633
634 /*
635 * Routine: hw_lock_unlock
636 *
637 * Unconditionally release lock.
638 * MACH_RT: release preemption level.
639 */
640 void
641 hw_lock_unlock(hw_lock_t lock)
642 {
643 __c11_atomic_store((_Atomic uintptr_t *)&lock->lock_data, 0, memory_order_release_smp);
644 enable_preemption();
645 }
646
647 /*
648  *	Routine: hw_lock_held
649 * MACH_RT: doesn't change preemption state.
650 * N.B. Racy, of course.
651 */
652 unsigned int
653 hw_lock_held(hw_lock_t lock)
654 {
655 return (ordered_load_hw(lock) != 0);
656 }
657
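/*
 * A usage sketch for the hw_lock primitives above, assuming a hypothetical
 * example_hw_lck: hw_lock_lock() returns with preemption disabled and
 * hw_lock_unlock() re-enables it, so the critical section must be short and
 * must not block.
 */
#if 0	/* usage sketch only, not compiled */
static hw_lock_data_t	example_hw_lck;

static void
example_hw_lock_usage(void)
{
	hw_lock_init(&example_hw_lck);

	hw_lock_lock(&example_hw_lck);		/* spins; preemption disabled on return */
	/* ... short, non-blocking critical section ... */
	hw_lock_unlock(&example_hw_lck);	/* releases lock, re-enables preemption */

	if (hw_lock_try(&example_hw_lck)) {	/* non-spinning attempt */
		hw_lock_unlock(&example_hw_lck);
	}
}
#endif
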
658 /*
659 * Routine: lck_spin_sleep
660 */
661 wait_result_t
662 lck_spin_sleep(
663 lck_spin_t *lck,
664 lck_sleep_action_t lck_sleep_action,
665 event_t event,
666 wait_interrupt_t interruptible)
667 {
668 wait_result_t res;
669
670 if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0)
671 panic("Invalid lock sleep action %x\n", lck_sleep_action);
672
673 res = assert_wait(event, interruptible);
674 if (res == THREAD_WAITING) {
675 lck_spin_unlock(lck);
676 res = thread_block(THREAD_CONTINUE_NULL);
677 if (!(lck_sleep_action & LCK_SLEEP_UNLOCK))
678 lck_spin_lock(lck);
679 }
680 else
681 if (lck_sleep_action & LCK_SLEEP_UNLOCK)
682 lck_spin_unlock(lck);
683
684 return res;
685 }
686
687
688 /*
689 * Routine: lck_spin_sleep_deadline
690 */
691 wait_result_t
692 lck_spin_sleep_deadline(
693 lck_spin_t *lck,
694 lck_sleep_action_t lck_sleep_action,
695 event_t event,
696 wait_interrupt_t interruptible,
697 uint64_t deadline)
698 {
699 wait_result_t res;
700
701 if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0)
702 panic("Invalid lock sleep action %x\n", lck_sleep_action);
703
704 res = assert_wait_deadline(event, interruptible, deadline);
705 if (res == THREAD_WAITING) {
706 lck_spin_unlock(lck);
707 res = thread_block(THREAD_CONTINUE_NULL);
708 if (!(lck_sleep_action & LCK_SLEEP_UNLOCK))
709 lck_spin_lock(lck);
710 }
711 else
712 if (lck_sleep_action & LCK_SLEEP_UNLOCK)
713 lck_spin_unlock(lck);
714
715 return res;
716 }
717
718
719 /*
720 * Routine: lck_mtx_clear_promoted
721 *
722 * Handle clearing of TH_SFLAG_PROMOTED,
723 * adjusting thread priority as needed.
724 *
725 * Called with thread lock held
726 */
727 static void
728 lck_mtx_clear_promoted (
729 thread_t thread,
730 __kdebug_only uintptr_t trace_lck)
731 {
732 thread->sched_flags &= ~TH_SFLAG_PROMOTED;
733
734 if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) {
735 /* Thread still has a RW lock promotion */
736 } else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
737 KERNEL_DEBUG_CONSTANT(
738 MACHDBG_CODE(DBG_MACH_SCHED,MACH_DEMOTE) | DBG_FUNC_NONE,
739 thread->sched_pri, DEPRESSPRI, 0, trace_lck, 0);
740 set_sched_pri(thread, DEPRESSPRI);
741 } else {
742 if (thread->base_pri < thread->sched_pri) {
743 KERNEL_DEBUG_CONSTANT(
744 MACHDBG_CODE(DBG_MACH_SCHED,MACH_DEMOTE) | DBG_FUNC_NONE,
745 thread->sched_pri, thread->base_pri, 0, trace_lck, 0);
746 }
747 thread_recompute_sched_pri(thread, FALSE);
748 }
749 }
750
751
752 /*
753 * Routine: lck_mtx_sleep
754 */
755 wait_result_t
756 lck_mtx_sleep(
757 lck_mtx_t *lck,
758 lck_sleep_action_t lck_sleep_action,
759 event_t event,
760 wait_interrupt_t interruptible)
761 {
762 wait_result_t res;
763 thread_t thread = current_thread();
764
765 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_SLEEP_CODE) | DBG_FUNC_START,
766 VM_KERNEL_UNSLIDE_OR_PERM(lck), (int)lck_sleep_action, VM_KERNEL_UNSLIDE_OR_PERM(event), (int)interruptible, 0);
767
768 if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0)
769 panic("Invalid lock sleep action %x\n", lck_sleep_action);
770
771 if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
772 /*
773 * We overload the RW lock promotion to give us a priority ceiling
774 * during the time that this thread is asleep, so that when it
775 * is re-awakened (and not yet contending on the mutex), it is
776 * runnable at a reasonably high priority.
777 */
778 thread->rwlock_count++;
779 }
780
781 res = assert_wait(event, interruptible);
782 if (res == THREAD_WAITING) {
783 lck_mtx_unlock(lck);
784 res = thread_block(THREAD_CONTINUE_NULL);
785 if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) {
786 if ((lck_sleep_action & LCK_SLEEP_SPIN))
787 lck_mtx_lock_spin(lck);
788 else
789 lck_mtx_lock(lck);
790 }
791 }
792 else
793 if (lck_sleep_action & LCK_SLEEP_UNLOCK)
794 lck_mtx_unlock(lck);
795
796 if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
797 if ((thread->rwlock_count-- == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
798 /* sched_flags checked without lock, but will be rechecked while clearing */
799 lck_rw_clear_promotion(thread);
800 }
801 }
802
803 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_SLEEP_CODE) | DBG_FUNC_END, (int)res, 0, 0, 0, 0);
804
805 return res;
806 }
807
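/*
 * A sketch of the usual lck_mtx_sleep() pattern, assuming hypothetical
 * example_mtx/example_ready state: with LCK_SLEEP_DEFAULT the mutex is
 * dropped while asleep and re-acquired before lck_mtx_sleep() returns.
 */
#if 0	/* usage sketch only, not compiled */
static lck_mtx_t	example_mtx;
static boolean_t	example_ready;

static void
example_wait_for_ready(void)
{
	lck_mtx_lock(&example_mtx);
	while (!example_ready) {
		(void) lck_mtx_sleep(&example_mtx, LCK_SLEEP_DEFAULT,
			(event_t)&example_ready, THREAD_UNINT);
	}
	lck_mtx_unlock(&example_mtx);
}

static void
example_mark_ready(void)
{
	lck_mtx_lock(&example_mtx);
	example_ready = TRUE;
	thread_wakeup((event_t)&example_ready);
	lck_mtx_unlock(&example_mtx);
}
#endif
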
808
809 /*
810 * Routine: lck_mtx_sleep_deadline
811 */
812 wait_result_t
813 lck_mtx_sleep_deadline(
814 lck_mtx_t *lck,
815 lck_sleep_action_t lck_sleep_action,
816 event_t event,
817 wait_interrupt_t interruptible,
818 uint64_t deadline)
819 {
820 wait_result_t res;
821 thread_t thread = current_thread();
822
823 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_SLEEP_DEADLINE_CODE) | DBG_FUNC_START,
824 VM_KERNEL_UNSLIDE_OR_PERM(lck), (int)lck_sleep_action, VM_KERNEL_UNSLIDE_OR_PERM(event), (int)interruptible, 0);
825
826 if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0)
827 panic("Invalid lock sleep action %x\n", lck_sleep_action);
828
829 if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
830 /*
831 * See lck_mtx_sleep().
832 */
833 thread->rwlock_count++;
834 }
835
836 res = assert_wait_deadline(event, interruptible, deadline);
837 if (res == THREAD_WAITING) {
838 lck_mtx_unlock(lck);
839 res = thread_block(THREAD_CONTINUE_NULL);
840 if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) {
841 if ((lck_sleep_action & LCK_SLEEP_SPIN))
842 lck_mtx_lock_spin(lck);
843 else
844 lck_mtx_lock(lck);
845 }
846 }
847 else
848 if (lck_sleep_action & LCK_SLEEP_UNLOCK)
849 lck_mtx_unlock(lck);
850
851 if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
852 if ((thread->rwlock_count-- == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
853 /* sched_flags checked without lock, but will be rechecked while clearing */
854 lck_rw_clear_promotion(thread);
855 }
856 }
857
858 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_SLEEP_DEADLINE_CODE) | DBG_FUNC_END, (int)res, 0, 0, 0, 0);
859
860 return res;
861 }
862
863 /*
864 * Routine: lck_mtx_lock_wait
865 *
866 * Invoked in order to wait on contention.
867 *
868 * Called with the interlock locked and
869 * returns it unlocked.
870 */
871 void
872 lck_mtx_lock_wait (
873 lck_mtx_t *lck,
874 thread_t holder)
875 {
876 thread_t self = current_thread();
877 lck_mtx_t *mutex;
878 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
879 __kdebug_only uintptr_t trace_holder = VM_KERNEL_UNSLIDE_OR_PERM(holder);
880 integer_t priority;
881 spl_t s = splsched();
882 #if CONFIG_DTRACE
883 uint64_t sleep_start = 0;
884
885 if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) {
886 sleep_start = mach_absolute_time();
887 }
888 #endif
889
890 if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT)
891 mutex = lck;
892 else
893 mutex = &lck->lck_mtx_ptr->lck_mtx;
894
895 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START, trace_lck, trace_holder, 0, 0, 0);
896
897 priority = self->sched_pri;
898 if (priority < self->base_pri)
899 priority = self->base_pri;
900 if (priority < BASEPRI_DEFAULT)
901 priority = BASEPRI_DEFAULT;
902
903 /* Do not promote past promotion ceiling */
904 priority = MIN(priority, MAXPRI_PROMOTE);
905
906 thread_lock(holder);
907 if (mutex->lck_mtx_pri == 0) {
908 holder->promotions++;
909 holder->sched_flags |= TH_SFLAG_PROMOTED;
910 }
911
912 if (mutex->lck_mtx_pri < priority && holder->sched_pri < priority) {
913 KERNEL_DEBUG_CONSTANT(
914 MACHDBG_CODE(DBG_MACH_SCHED,MACH_PROMOTE) | DBG_FUNC_NONE,
915 holder->sched_pri, priority, trace_holder, trace_lck, 0);
916 set_sched_pri(holder, priority);
917 }
918 thread_unlock(holder);
919 splx(s);
920
921 if (mutex->lck_mtx_pri < priority)
922 mutex->lck_mtx_pri = priority;
923 if (self->pending_promoter[self->pending_promoter_index] == NULL) {
924 self->pending_promoter[self->pending_promoter_index] = mutex;
925 mutex->lck_mtx_waiters++;
926 }
927 else
928 if (self->pending_promoter[self->pending_promoter_index] != mutex) {
929 self->pending_promoter[++self->pending_promoter_index] = mutex;
930 mutex->lck_mtx_waiters++;
931 }
932
933 assert_wait(LCK_MTX_EVENT(mutex), THREAD_UNINT);
934 lck_mtx_ilk_unlock(mutex);
935
936 thread_block(THREAD_CONTINUE_NULL);
937
938 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0);
939 #if CONFIG_DTRACE
940 /*
941 * Record the Dtrace lockstat probe for blocking, block time
942 * measured from when we were entered.
943 */
944 if (sleep_start) {
945 if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT) {
946 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, lck,
947 mach_absolute_time() - sleep_start);
948 } else {
949 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, lck,
950 mach_absolute_time() - sleep_start);
951 }
952 }
953 #endif
954 }
955
956 /*
957 * Routine: lck_mtx_lock_acquire
958 *
959 * Invoked on acquiring the mutex when there is
960 * contention.
961 *
962 * Returns the current number of waiters.
963 *
964 * Called with the interlock locked.
965 */
966 int
967 lck_mtx_lock_acquire(
968 lck_mtx_t *lck)
969 {
970 thread_t thread = current_thread();
971 lck_mtx_t *mutex;
972 integer_t priority;
973 spl_t s;
974 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
975
976 if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT)
977 mutex = lck;
978 else
979 mutex = &lck->lck_mtx_ptr->lck_mtx;
980
981 if (thread->pending_promoter[thread->pending_promoter_index] == mutex) {
982 thread->pending_promoter[thread->pending_promoter_index] = NULL;
983 if (thread->pending_promoter_index > 0)
984 thread->pending_promoter_index--;
985 mutex->lck_mtx_waiters--;
986 }
987
988 if (mutex->lck_mtx_waiters)
989 priority = mutex->lck_mtx_pri;
990 else {
991 mutex->lck_mtx_pri = 0;
992 priority = 0;
993 }
994
995 if (priority || thread->was_promoted_on_wakeup) {
996 s = splsched();
997 thread_lock(thread);
998
999 if (priority) {
1000 thread->promotions++;
1001 thread->sched_flags |= TH_SFLAG_PROMOTED;
1002 if (thread->sched_pri < priority) {
1003 KERNEL_DEBUG_CONSTANT(
1004 MACHDBG_CODE(DBG_MACH_SCHED,MACH_PROMOTE) | DBG_FUNC_NONE,
1005 thread->sched_pri, priority, 0, trace_lck, 0);
1006 /* Do not promote past promotion ceiling */
1007 assert(priority <= MAXPRI_PROMOTE);
1008 set_sched_pri(thread, priority);
1009 }
1010 }
1011 if (thread->was_promoted_on_wakeup) {
1012 thread->was_promoted_on_wakeup = 0;
1013 if (thread->promotions == 0)
1014 lck_mtx_clear_promoted(thread, trace_lck);
1015 }
1016
1017 thread_unlock(thread);
1018 splx(s);
1019 }
1020
1021 #if CONFIG_DTRACE
1022 if (lockstat_probemap[LS_LCK_MTX_LOCK_ACQUIRE] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_ACQUIRE]) {
1023 if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT) {
1024 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_ACQUIRE, lck, 0);
1025 } else {
1026 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_ACQUIRE, lck, 0);
1027 }
1028 }
1029 #endif
1030 return (mutex->lck_mtx_waiters);
1031 }
1032
1033 /*
1034 * Routine: lck_mtx_unlock_wakeup
1035 *
1036 * Invoked on unlock when there is contention.
1037 *
1038 * Called with the interlock locked.
1039 */
1040 void
1041 lck_mtx_unlock_wakeup (
1042 lck_mtx_t *lck,
1043 thread_t holder)
1044 {
1045 thread_t thread = current_thread();
1046 lck_mtx_t *mutex;
1047 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1048
1049 if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT)
1050 mutex = lck;
1051 else
1052 mutex = &lck->lck_mtx_ptr->lck_mtx;
1053
1054 if (thread != holder)
1055 panic("lck_mtx_unlock_wakeup: mutex %p holder %p\n", mutex, holder);
1056
1057 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_START, trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(holder), 0, 0, 0);
1058
1059 assert(mutex->lck_mtx_waiters > 0);
1060 if (mutex->lck_mtx_waiters > 1)
1061 thread_wakeup_one_with_pri(LCK_MTX_EVENT(lck), lck->lck_mtx_pri);
1062 else
1063 thread_wakeup_one(LCK_MTX_EVENT(lck));
1064
1065 if (thread->promotions > 0) {
1066 spl_t s = splsched();
1067
1068 thread_lock(thread);
1069 if (--thread->promotions == 0 && (thread->sched_flags & TH_SFLAG_PROMOTED))
1070 lck_mtx_clear_promoted(thread, trace_lck);
1071 thread_unlock(thread);
1072 splx(s);
1073 }
1074
1075 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0);
1076 }
1077
1078 void
1079 lck_mtx_unlockspin_wakeup (
1080 lck_mtx_t *lck)
1081 {
1082 assert(lck->lck_mtx_waiters > 0);
1083 thread_wakeup_one(LCK_MTX_EVENT(lck));
1084
1085 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_NONE, VM_KERNEL_UNSLIDE_OR_PERM(lck), 0, 0, 1, 0);
1086 #if CONFIG_DTRACE
1087 /*
1088 * When there are waiters, we skip the hot-patch spot in the
1089 * fastpath, so we record it here.
1090 */
1091 LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE, lck, 0);
1092 #endif
1093 }
1094
1095
1096 /*
1097 * Routine: mutex_pause
1098 *
1099 * Called by former callers of simple_lock_pause().
1100 */
1101 #define MAX_COLLISION_COUNTS 32
1102 #define MAX_COLLISION 8
1103
1104 unsigned int max_collision_count[MAX_COLLISION_COUNTS];
1105
1106 uint32_t collision_backoffs[MAX_COLLISION] = {
1107 10, 50, 100, 200, 400, 600, 800, 1000
1108 };
1109
1110
1111 void
1112 mutex_pause(uint32_t collisions)
1113 {
1114 wait_result_t wait_result;
1115 uint32_t back_off;
1116
1117 if (collisions >= MAX_COLLISION_COUNTS)
1118 collisions = MAX_COLLISION_COUNTS - 1;
1119 max_collision_count[collisions]++;
1120
1121 if (collisions >= MAX_COLLISION)
1122 collisions = MAX_COLLISION - 1;
1123 back_off = collision_backoffs[collisions];
1124
1125 wait_result = assert_wait_timeout((event_t)mutex_pause, THREAD_UNINT, back_off, NSEC_PER_USEC);
1126 assert(wait_result == THREAD_WAITING);
1127
1128 wait_result = thread_block(THREAD_CONTINUE_NULL);
1129 assert(wait_result == THREAD_TIMED_OUT);
1130 }
1131
1132
1133 unsigned int mutex_yield_wait = 0;
1134 unsigned int mutex_yield_no_wait = 0;
1135
1136 void
1137 lck_mtx_yield(
1138 lck_mtx_t *lck)
1139 {
1140 int waiters;
1141
1142 #if DEBUG
1143 lck_mtx_assert(lck, LCK_MTX_ASSERT_OWNED);
1144 #endif /* DEBUG */
1145
1146 if (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT)
1147 waiters = lck->lck_mtx_ptr->lck_mtx.lck_mtx_waiters;
1148 else
1149 waiters = lck->lck_mtx_waiters;
1150
1151 if ( !waiters) {
1152 mutex_yield_no_wait++;
1153 } else {
1154 mutex_yield_wait++;
1155 lck_mtx_unlock(lck);
1156 mutex_pause(0);
1157 lck_mtx_lock(lck);
1158 }
1159 }
1160
1161
1162 /*
1163 * Routine: lck_rw_sleep
1164 */
1165 wait_result_t
1166 lck_rw_sleep(
1167 lck_rw_t *lck,
1168 lck_sleep_action_t lck_sleep_action,
1169 event_t event,
1170 wait_interrupt_t interruptible)
1171 {
1172 wait_result_t res;
1173 lck_rw_type_t lck_rw_type;
1174 thread_t thread = current_thread();
1175
1176 if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0)
1177 panic("Invalid lock sleep action %x\n", lck_sleep_action);
1178
1179 if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
1180 /*
1181 * Although we are dropping the RW lock, the intent in most cases
1182 * is that this thread remains as an observer, since it may hold
1183 * some secondary resource, but must yield to avoid deadlock. In
1184 * this situation, make sure that the thread is boosted to the
1185 * RW lock ceiling while blocked, so that it can re-acquire the
1186 * RW lock at that priority.
1187 */
1188 thread->rwlock_count++;
1189 }
1190
1191 res = assert_wait(event, interruptible);
1192 if (res == THREAD_WAITING) {
1193 lck_rw_type = lck_rw_done(lck);
1194 res = thread_block(THREAD_CONTINUE_NULL);
1195 if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) {
1196 if (!(lck_sleep_action & (LCK_SLEEP_SHARED|LCK_SLEEP_EXCLUSIVE)))
1197 lck_rw_lock(lck, lck_rw_type);
1198 else if (lck_sleep_action & LCK_SLEEP_EXCLUSIVE)
1199 lck_rw_lock_exclusive(lck);
1200 else
1201 lck_rw_lock_shared(lck);
1202 }
1203 }
1204 else
1205 if (lck_sleep_action & LCK_SLEEP_UNLOCK)
1206 (void)lck_rw_done(lck);
1207
1208 if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
1209 if ((thread->rwlock_count-- == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1210 /* sched_flags checked without lock, but will be rechecked while clearing */
1211
1212 /* Only if the caller wanted the lck_rw_t returned unlocked should we drop to 0 */
1213 assert(lck_sleep_action & LCK_SLEEP_UNLOCK);
1214
1215 lck_rw_clear_promotion(thread);
1216 }
1217 }
1218
1219 return res;
1220 }
1221
1222
1223 /*
1224 * Routine: lck_rw_sleep_deadline
1225 */
1226 wait_result_t
1227 lck_rw_sleep_deadline(
1228 lck_rw_t *lck,
1229 lck_sleep_action_t lck_sleep_action,
1230 event_t event,
1231 wait_interrupt_t interruptible,
1232 uint64_t deadline)
1233 {
1234 wait_result_t res;
1235 lck_rw_type_t lck_rw_type;
1236 thread_t thread = current_thread();
1237
1238 if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0)
1239 panic("Invalid lock sleep action %x\n", lck_sleep_action);
1240
1241 if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
1242 thread->rwlock_count++;
1243 }
1244
1245 res = assert_wait_deadline(event, interruptible, deadline);
1246 if (res == THREAD_WAITING) {
1247 lck_rw_type = lck_rw_done(lck);
1248 res = thread_block(THREAD_CONTINUE_NULL);
1249 if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) {
1250 if (!(lck_sleep_action & (LCK_SLEEP_SHARED|LCK_SLEEP_EXCLUSIVE)))
1251 lck_rw_lock(lck, lck_rw_type);
1252 else if (lck_sleep_action & LCK_SLEEP_EXCLUSIVE)
1253 lck_rw_lock_exclusive(lck);
1254 else
1255 lck_rw_lock_shared(lck);
1256 }
1257 }
1258 else
1259 if (lck_sleep_action & LCK_SLEEP_UNLOCK)
1260 (void)lck_rw_done(lck);
1261
1262 if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
1263 if ((thread->rwlock_count-- == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1264 /* sched_flags checked without lock, but will be rechecked while clearing */
1265
1266 /* Only if the caller wanted the lck_rw_t returned unlocked should we drop to 0 */
1267 assert(lck_sleep_action & LCK_SLEEP_UNLOCK);
1268
1269 lck_rw_clear_promotion(thread);
1270 }
1271 }
1272
1273 return res;
1274 }
1275
1276 /*
1277 * Reader-writer lock promotion
1278 *
1279 * We support a limited form of reader-writer
1280 * lock promotion whose effects are:
1281 *
1282 * * Qualifying threads have decay disabled
1283 * * Scheduler priority is reset to a floor of
1285  *   their statically assigned priority
1285 * or BASEPRI_BACKGROUND
1286 *
1287 * The rationale is that lck_rw_ts do not have
1288 * a single owner, so we cannot apply a directed
1289 * priority boost from all waiting threads
1290 * to all holding threads without maintaining
1291 * lists of all shared owners and all waiting
1292 * threads for every lock.
1293 *
1294 * Instead (and to preserve the uncontended fast-
1295 * path), acquiring (or attempting to acquire)
1297  * a RW lock in shared or exclusive mode increments
1297 * a per-thread counter. Only if that thread stops
1298 * making forward progress (for instance blocking
1299 * on a mutex, or being preempted) do we consult
1300 * the counter and apply the priority floor.
1301 * When the thread becomes runnable again (or in
1302 * the case of preemption it never stopped being
1303 * runnable), it has the priority boost and should
1304 * be in a good position to run on the CPU and
1305 * release all RW locks (at which point the priority
1306 * boost is cleared).
1307 *
1308 * Care must be taken to ensure that priority
1309 * boosts are not retained indefinitely, since unlike
1310 * mutex priority boosts (where the boost is tied
1311 * to the mutex lifecycle), the boost is tied
1312 * to the thread and independent of any particular
1313 * lck_rw_t. Assertions are in place on return
1314 * to userspace so that the boost is not held
1315 * indefinitely.
1316 *
1317 * The routines that increment/decrement the
1318 * per-thread counter should err on the side of
1319 * incrementing any time a preemption is possible
1320 * and the lock would be visible to the rest of the
1321 * system as held (so it should be incremented before
1322 * interlocks are dropped/preemption is enabled, or
1323 * before a CAS is executed to acquire the lock).
1324 *
1325 */
1326
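/*
 * A sketch of the counter protocol just described, assuming a hypothetical
 * read-side path; this bookkeeping lives inside the lock implementation and
 * the sleep routines above, not in callers. The count is raised before the
 * lock becomes visible as held, and the promotion is cleared only when the
 * last RW lock is dropped.
 */
#if 0	/* illustration only, not compiled */
static void
example_rw_acquire_release(lck_rw_t *lck)
{
	thread_t thread = current_thread();

	thread->rwlock_count++;		/* before the lock is visible as held */
	/* ... CAS / interlock path that actually takes the lock ... */

	/* ... lock held; blocking while it is held may apply the floor ... */

	/* ... release path drops the lock, then: ... */
	if ((thread->rwlock_count-- == 1 /* now 0 */) &&
	    (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
		lck_rw_clear_promotion(thread);
	}
	(void)lck;
}
#endif
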
1327 /*
1328 * lck_rw_clear_promotion: Undo priority promotions when the last RW
1329 * lock is released by a thread (if a promotion was active)
1330 */
1331 void lck_rw_clear_promotion(thread_t thread)
1332 {
1333 assert(thread->rwlock_count == 0);
1334
1335 /* Cancel any promotions if the thread had actually blocked while holding a RW lock */
1336 spl_t s = splsched();
1337
1338 thread_lock(thread);
1339
1340 if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) {
1341 thread->sched_flags &= ~TH_SFLAG_RW_PROMOTED;
1342
1343 if (thread->sched_flags & TH_SFLAG_PROMOTED) {
1344 /* Thread still has a mutex promotion */
1345 } else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
1346 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_DEMOTE) | DBG_FUNC_NONE,
1347 (uintptr_t)thread_tid(thread), thread->sched_pri, DEPRESSPRI, 0, 0);
1348
1349 set_sched_pri(thread, DEPRESSPRI);
1350 } else {
1351 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_DEMOTE) | DBG_FUNC_NONE,
1352 (uintptr_t)thread_tid(thread), thread->sched_pri, thread->base_pri, 0, 0);
1353
1354 thread_recompute_sched_pri(thread, FALSE);
1355 }
1356 }
1357
1358 thread_unlock(thread);
1359 splx(s);
1360 }
1361
1362 /*
1363 * Callout from context switch if the thread goes
1364 * off core with a positive rwlock_count
1365 *
1366 * Called at splsched with the thread locked
1367 */
1368 void
1369 lck_rw_set_promotion_locked(thread_t thread)
1370 {
1371 if (LcksOpts & disLkRWPrio)
1372 return;
1373
1374 integer_t priority;
1375
1376 priority = thread->sched_pri;
1377
1378 if (priority < thread->base_pri)
1379 priority = thread->base_pri;
1380 if (priority < BASEPRI_BACKGROUND)
1381 priority = BASEPRI_BACKGROUND;
1382
1383 if ((thread->sched_pri < priority) ||
1384 !(thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1385 KERNEL_DEBUG_CONSTANT(
1386 MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_PROMOTE) | DBG_FUNC_NONE,
1387 (uintptr_t)thread_tid(thread), thread->sched_pri,
1388 thread->base_pri, priority, 0);
1389
1390 thread->sched_flags |= TH_SFLAG_RW_PROMOTED;
1391
1392 if (thread->sched_pri < priority)
1393 set_sched_pri(thread, priority);
1394 }
1395 }
1396
1397 kern_return_t
1398 host_lockgroup_info(
1399 host_t host,
1400 lockgroup_info_array_t *lockgroup_infop,
1401 mach_msg_type_number_t *lockgroup_infoCntp)
1402 {
1403 lockgroup_info_t *lockgroup_info_base;
1404 lockgroup_info_t *lockgroup_info;
1405 vm_offset_t lockgroup_info_addr;
1406 vm_size_t lockgroup_info_size;
1407 vm_size_t lockgroup_info_vmsize;
1408 lck_grp_t *lck_grp;
1409 unsigned int i;
1410 vm_map_copy_t copy;
1411 kern_return_t kr;
1412
1413 if (host == HOST_NULL)
1414 return KERN_INVALID_HOST;
1415
1416 lck_mtx_lock(&lck_grp_lock);
1417
1418 lockgroup_info_size = lck_grp_cnt * sizeof(*lockgroup_info);
1419 lockgroup_info_vmsize = round_page(lockgroup_info_size);
1420 kr = kmem_alloc_pageable(ipc_kernel_map,
1421 &lockgroup_info_addr, lockgroup_info_vmsize, VM_KERN_MEMORY_IPC);
1422 if (kr != KERN_SUCCESS) {
1423 lck_mtx_unlock(&lck_grp_lock);
1424 return(kr);
1425 }
1426
1427 lockgroup_info_base = (lockgroup_info_t *) lockgroup_info_addr;
1428 lck_grp = (lck_grp_t *)queue_first(&lck_grp_queue);
1429 lockgroup_info = lockgroup_info_base;
1430
1431 for (i = 0; i < lck_grp_cnt; i++) {
1432
1433 lockgroup_info->lock_spin_cnt = lck_grp->lck_grp_spincnt;
1434 lockgroup_info->lock_spin_util_cnt = lck_grp->lck_grp_stat.lck_grp_spin_stat.lck_grp_spin_util_cnt;
1435 lockgroup_info->lock_spin_held_cnt = lck_grp->lck_grp_stat.lck_grp_spin_stat.lck_grp_spin_held_cnt;
1436 lockgroup_info->lock_spin_miss_cnt = lck_grp->lck_grp_stat.lck_grp_spin_stat.lck_grp_spin_miss_cnt;
1437 lockgroup_info->lock_spin_held_max = lck_grp->lck_grp_stat.lck_grp_spin_stat.lck_grp_spin_held_max;
1438 lockgroup_info->lock_spin_held_cum = lck_grp->lck_grp_stat.lck_grp_spin_stat.lck_grp_spin_held_cum;
1439
1440 lockgroup_info->lock_mtx_cnt = lck_grp->lck_grp_mtxcnt;
1441 lockgroup_info->lock_mtx_util_cnt = lck_grp->lck_grp_stat.lck_grp_mtx_stat.lck_grp_mtx_util_cnt;
1442 lockgroup_info->lock_mtx_held_cnt = lck_grp->lck_grp_stat.lck_grp_mtx_stat.lck_grp_mtx_held_cnt;
1443 lockgroup_info->lock_mtx_miss_cnt = lck_grp->lck_grp_stat.lck_grp_mtx_stat.lck_grp_mtx_miss_cnt;
1444 lockgroup_info->lock_mtx_wait_cnt = lck_grp->lck_grp_stat.lck_grp_mtx_stat.lck_grp_mtx_wait_cnt;
1445 lockgroup_info->lock_mtx_held_max = lck_grp->lck_grp_stat.lck_grp_mtx_stat.lck_grp_mtx_held_max;
1446 lockgroup_info->lock_mtx_held_cum = lck_grp->lck_grp_stat.lck_grp_mtx_stat.lck_grp_mtx_held_cum;
1447 lockgroup_info->lock_mtx_wait_max = lck_grp->lck_grp_stat.lck_grp_mtx_stat.lck_grp_mtx_wait_max;
1448 lockgroup_info->lock_mtx_wait_cum = lck_grp->lck_grp_stat.lck_grp_mtx_stat.lck_grp_mtx_wait_cum;
1449
1450 lockgroup_info->lock_rw_cnt = lck_grp->lck_grp_rwcnt;
1451 lockgroup_info->lock_rw_util_cnt = lck_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_util_cnt;
1452 lockgroup_info->lock_rw_held_cnt = lck_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_held_cnt;
1453 lockgroup_info->lock_rw_miss_cnt = lck_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_miss_cnt;
1454 lockgroup_info->lock_rw_wait_cnt = lck_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_wait_cnt;
1455 lockgroup_info->lock_rw_held_max = lck_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_held_max;
1456 lockgroup_info->lock_rw_held_cum = lck_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_held_cum;
1457 lockgroup_info->lock_rw_wait_max = lck_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_wait_max;
1458 lockgroup_info->lock_rw_wait_cum = lck_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_wait_cum;
1459
1460 (void) strncpy(lockgroup_info->lockgroup_name,lck_grp->lck_grp_name, LOCKGROUP_MAX_NAME);
1461
1462 lck_grp = (lck_grp_t *)(queue_next((queue_entry_t)(lck_grp)));
1463 lockgroup_info++;
1464 }
1465
1466 *lockgroup_infoCntp = lck_grp_cnt;
1467 lck_mtx_unlock(&lck_grp_lock);
1468
1469 if (lockgroup_info_size != lockgroup_info_vmsize)
1470 bzero((char *)lockgroup_info, lockgroup_info_vmsize - lockgroup_info_size);
1471
1472 kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)lockgroup_info_addr,
1473 (vm_map_size_t)lockgroup_info_size, TRUE, &copy);
1474 assert(kr == KERN_SUCCESS);
1475
1476 *lockgroup_infop = (lockgroup_info_t *) copy;
1477
1478 return(KERN_SUCCESS);
1479 }
1480
1481 /*
1482 * Atomic primitives, prototyped in kern/simple_lock.h
1483 * Noret versions are more efficient on some architectures
1484 */
1485
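/*
 * A brief sketch, assuming a hypothetical example_refcnt: these primitives
 * return the new value, which is how lck_grp_reference() and
 * lck_grp_deallocate() above implement a simple reference count.
 */
#if 0	/* usage sketch only, not compiled */
static uint32_t example_refcnt = 1;

static void
example_retain(void)
{
	(void) hw_atomic_add(&example_refcnt, 1);
}

static void
example_release(void)
{
	if (hw_atomic_sub(&example_refcnt, 1) == 0) {
		/* last reference dropped; free the backing object here */
	}
}
#endif
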
1486 uint32_t
1487 hw_atomic_add(volatile uint32_t *dest, uint32_t delt)
1488 {
1489 ALIGN_TEST(dest,uint32_t);
1490 return __c11_atomic_fetch_add(ATOMIC_CAST(uint32_t,dest), delt, memory_order_relaxed) + delt;
1491 }
1492
1493 uint32_t
1494 hw_atomic_sub(volatile uint32_t *dest, uint32_t delt)
1495 {
1496 ALIGN_TEST(dest,uint32_t);
1497 return __c11_atomic_fetch_sub(ATOMIC_CAST(uint32_t,dest), delt, memory_order_relaxed) - delt;
1498 }
1499
1500 uint32_t
1501 hw_atomic_or(volatile uint32_t *dest, uint32_t mask)
1502 {
1503 ALIGN_TEST(dest,uint32_t);
1504 return __c11_atomic_fetch_or(ATOMIC_CAST(uint32_t,dest), mask, memory_order_relaxed) | mask;
1505 }
1506
1507 void
1508 hw_atomic_or_noret(volatile uint32_t *dest, uint32_t mask)
1509 {
1510 ALIGN_TEST(dest,uint32_t);
1511 __c11_atomic_fetch_or(ATOMIC_CAST(uint32_t,dest), mask, memory_order_relaxed);
1512 }
1513
1514 uint32_t
1515 hw_atomic_and(volatile uint32_t *dest, uint32_t mask)
1516 {
1517 ALIGN_TEST(dest,uint32_t);
1518 return __c11_atomic_fetch_and(ATOMIC_CAST(uint32_t,dest), mask, memory_order_relaxed) & mask;
1519 }
1520
1521 void
1522 hw_atomic_and_noret(volatile uint32_t *dest, uint32_t mask)
1523 {
1524 ALIGN_TEST(dest,uint32_t);
1525 __c11_atomic_fetch_and(ATOMIC_CAST(uint32_t,dest), mask, memory_order_relaxed);
1526 }
1527
1528 uint32_t
1529 hw_compare_and_store(uint32_t oldval, uint32_t newval, volatile uint32_t *dest)
1530 {
1531 ALIGN_TEST(dest,uint32_t);
1532 return __c11_atomic_compare_exchange_strong(ATOMIC_CAST(uint32_t,dest), &oldval, newval,
1533 memory_order_acq_rel_smp, memory_order_relaxed);
1534 }
1535