1 /*
2 * Copyright (c) 2007-2018 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System Copyright (c) 1991,1990,1989,1988,1987 Carnegie
33 * Mellon University All Rights Reserved.
34 *
35 * Permission to use, copy, modify and distribute this software and its
36 * documentation is hereby granted, provided that both the copyright notice
37 * and this permission notice appear in all copies of the software,
38 * derivative works or modified versions, and any portions thereof, and that
39 * both notices appear in supporting documentation.
40 *
41 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" CONDITION.
42 * CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR ANY DAMAGES
43 * WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
44 *
45 * Carnegie Mellon requests users of this software to return to
46 *
47 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
48 * School of Computer Science Carnegie Mellon University Pittsburgh PA
49 * 15213-3890
50 *
51 * any improvements or extensions that they make and grant Carnegie Mellon the
52 * rights to redistribute these changes.
53 */
54 /*
55 * File: kern/lock.c
56 * Author: Avadis Tevanian, Jr., Michael Wayne Young
57 * Date: 1985
58 *
59 * Locking primitives implementation
60 */
61
62 #define LOCK_PRIVATE 1
63
64 #include <mach_ldebug.h>
65
66 #include <kern/zalloc.h>
67 #include <kern/lock_stat.h>
68 #include <kern/locks.h>
69 #include <kern/misc_protos.h>
70 #include <kern/thread.h>
71 #include <kern/processor.h>
72 #include <kern/sched_prim.h>
73 #include <kern/debug.h>
74 #include <kern/kcdata.h>
75 #include <string.h>
76 #include <arm/cpu_internal.h>
77 #include <os/hash.h>
78 #include <arm/cpu_data.h>
79
80 #include <arm/cpu_data_internal.h>
81 #include <arm/proc_reg.h>
82 #include <arm/smp.h>
83 #include <machine/atomic.h>
84 #include <machine/machine_cpu.h>
85
86 #include <sys/kdebug.h>
87
88 #if CONFIG_DTRACE
89 #define DTRACE_RW_SHARED 0x0 //reader
90 #define DTRACE_RW_EXCL 0x1 //writer
91 #define DTRACE_NO_FLAG 0x0 //not applicable
92 #endif /* CONFIG_DTRACE */
93
94 #define LCK_RW_LCK_EXCLUSIVE_CODE 0x100
95 #define LCK_RW_LCK_EXCLUSIVE1_CODE 0x101
96 #define LCK_RW_LCK_SHARED_CODE 0x102
97 #define LCK_RW_LCK_SH_TO_EX_CODE 0x103
98 #define LCK_RW_LCK_SH_TO_EX1_CODE 0x104
99 #define LCK_RW_LCK_EX_TO_SH_CODE 0x105
100
101
102 #define ANY_LOCK_DEBUG (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)
103
104 // Panic in tests that check lock usage correctness
105 // These are undesirable when already in a panic or while a debugger is running.
106 #define LOCK_CORRECTNESS_PANIC() (kernel_debugger_entry_count == 0)
107
108 #define ADAPTIVE_SPIN_ENABLE 0x1
109
110 int lck_mtx_adaptive_spin_mode = ADAPTIVE_SPIN_ENABLE;
111
112 #define SPINWAIT_OWNER_CHECK_COUNT 4
113
114 typedef enum {
115 SPINWAIT_ACQUIRED, /* Got the lock. */
116 SPINWAIT_INTERLOCK, /* Got the interlock, no owner, but caller must finish acquiring the lock. */
117 SPINWAIT_DID_SPIN_HIGH_THR, /* Got the interlock, spun, but failed to get the lock. */
118 SPINWAIT_DID_SPIN_OWNER_NOT_CORE, /* Got the interlock, spun, but failed to get the lock. */
119 SPINWAIT_DID_SPIN_NO_WINDOW_CONTENTION, /* Got the interlock, spun, but failed to get the lock. */
120 SPINWAIT_DID_SPIN_SLIDING_THR,/* Got the interlock, spun, but failed to get the lock. */
121 SPINWAIT_DID_NOT_SPIN, /* Got the interlock, did not spin. */
122 } spinwait_result_t;
123
124 #if CONFIG_DTRACE
125 extern uint64_t dtrace_spin_threshold;
126 #endif
127
128 /* Forwards */
129
130 extern unsigned int not_in_kdp;
131
132 /*
133 * We often want to know the addresses of the callers
134 * of the various lock routines. However, this information
135 * is only used for debugging and statistics.
136 */
137 typedef void *pc_t;
138 #define INVALID_PC ((void *) VM_MAX_KERNEL_ADDRESS)
139 #define INVALID_THREAD ((void *) VM_MAX_KERNEL_ADDRESS)
140
141 #ifdef lint
142 /*
143 * Eliminate lint complaints about unused local pc variables.
144 */
145 #define OBTAIN_PC(pc, l) ++pc
146 #else /* lint */
147 #define OBTAIN_PC(pc, l)
148 #endif /* lint */
149
150
151 /*
152 * Portable lock package implementation of usimple_locks.
153 */
154
155 /*
156 * Owner thread pointer when lock held in spin mode
157 */
158 #define LCK_MTX_SPIN_TAG 0xfffffff0
159
160
161 #define interlock_lock(lock) hw_lock_bit ((hw_lock_bit_t*)(&(lock)->lck_mtx_data), LCK_ILOCK_BIT, LCK_GRP_NULL)
162 #define interlock_try(lock) hw_lock_bit_try((hw_lock_bit_t*)(&(lock)->lck_mtx_data), LCK_ILOCK_BIT, LCK_GRP_NULL)
163 #define interlock_unlock(lock) hw_unlock_bit ((hw_lock_bit_t*)(&(lock)->lck_mtx_data), LCK_ILOCK_BIT)
164 #define lck_rw_ilk_lock(lock) hw_lock_bit ((hw_lock_bit_t*)(&(lock)->lck_rw_tag), LCK_RW_INTERLOCK_BIT, LCK_GRP_NULL)
165 #define lck_rw_ilk_unlock(lock) hw_unlock_bit((hw_lock_bit_t*)(&(lock)->lck_rw_tag), LCK_RW_INTERLOCK_BIT)
166
167 #define load_memory_barrier() os_atomic_thread_fence(acquire)
168
169 // Enforce program order of loads and stores.
170 #define ordered_load(target) \
171 os_atomic_load(target, compiler_acq_rel)
172 #define ordered_store(target, value) \
173 os_atomic_store(target, value, compiler_acq_rel)
174
175 #define ordered_load_mtx(lock) ordered_load(&(lock)->lck_mtx_data)
176 #define ordered_store_mtx(lock, value) ordered_store(&(lock)->lck_mtx_data, (value))
177 #define ordered_load_rw(lock) ordered_load(&(lock)->lck_rw_data)
178 #define ordered_store_rw(lock, value) ordered_store(&(lock)->lck_rw_data, (value))
179 #define ordered_load_rw_owner(lock) ordered_load(&(lock)->lck_rw_owner)
180 #define ordered_store_rw_owner(lock, value) ordered_store(&(lock)->lck_rw_owner, (value))
181 #define ordered_load_hw(lock) ordered_load(&(lock)->lock_data)
182 #define ordered_store_hw(lock, value) ordered_store(&(lock)->lock_data, (value))
183 #define ordered_load_bit(lock) ordered_load((lock))
184 #define ordered_store_bit(lock, value) ordered_store((lock), (value))
185
186
187 // Prevent the compiler from reordering memory operations around this
188 #define compiler_memory_fence() __asm__ volatile ("" ::: "memory")
189
190 #define LOCK_PANIC_TIMEOUT 0xc00000
191 #define NOINLINE __attribute__((noinline))
192
193
194 #if __arm__
195 #define interrupts_disabled(mask) (mask & PSR_INTMASK)
196 #else
197 #define interrupts_disabled(mask) (mask & DAIF_IRQF)
198 #endif
199
200
201 #if __arm__
202 #define enable_fiq() __asm__ volatile ("cpsie f" ::: "memory");
203 #define enable_interrupts() __asm__ volatile ("cpsie if" ::: "memory");
204 #endif
205
206 ZONE_VIEW_DEFINE(ZV_LCK_SPIN, "lck_spin",
207 KHEAP_ID_DEFAULT, sizeof(lck_spin_t));
208
209 ZONE_VIEW_DEFINE(ZV_LCK_MTX, "lck_mtx",
210 KHEAP_ID_DEFAULT, sizeof(lck_mtx_t));
211
212 ZONE_VIEW_DEFINE(ZV_LCK_MTX_EXT, "lck_mtx_ext",
213 KHEAP_ID_DEFAULT, sizeof(lck_mtx_ext_t));
214
215 ZONE_VIEW_DEFINE(ZV_LCK_RW, "lck_rw",
216 KHEAP_ID_DEFAULT, sizeof(lck_rw_t));
217
218 /*
219 * Forward declarations
220 */
221
222 static void lck_rw_lock_shared_gen(lck_rw_t *lck);
223 static void lck_rw_lock_exclusive_gen(lck_rw_t *lck);
224 static boolean_t lck_rw_lock_shared_to_exclusive_success(lck_rw_t *lck);
225 static boolean_t lck_rw_lock_shared_to_exclusive_failure(lck_rw_t *lck, uint32_t prior_lock_state);
226 static void lck_rw_lock_exclusive_to_shared_gen(lck_rw_t *lck, uint32_t prior_lock_state);
227 static lck_rw_type_t lck_rw_done_gen(lck_rw_t *lck, uint32_t prior_lock_state);
228 static boolean_t lck_rw_grab(lck_rw_t *lock, int mode, boolean_t wait);
229
230 /*
231 * atomic exchange API is a low level abstraction of the operations
232 * to atomically read, modify, and write a pointer. This abstraction works
233 * for both Intel and ARMv8.1 compare and exchange atomic instructions as
234 * well as the ARM exclusive instructions.
235 *
236 * atomic_exchange_begin() - begin exchange and retrieve current value
237 * atomic_exchange_complete() - conclude an exchange
238 * atomic_exchange_abort() - cancel an exchange started with atomic_exchange_begin()
239 */
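/*
 * Illustrative usage sketch (not a kernel routine in this file): the
 * typical begin/complete/abort retry loop, as used by lck_rw_grab() and
 * lck_rw_done() below.  set_flag_atomically() is a hypothetical helper
 * shown only to demonstrate the pattern.
 *
 *	static boolean_t
 *	set_flag_atomically(uint32_t *target, uint32_t flag)
 *	{
 *		uint32_t data, prev;
 *
 *		for (;;) {
 *			data = atomic_exchange_begin32(target, &prev, memory_order_acquire_smp);
 *			if (data & flag) {
 *				atomic_exchange_abort();	// release the exclusive monitor
 *				return FALSE;			// flag already set
 *			}
 *			data |= flag;
 *			if (atomic_exchange_complete32(target, prev, data, memory_order_acquire_smp)) {
 *				return TRUE;			// exchange concluded
 *			}
 *			cpu_pause();				// lost the reservation, retry
 *		}
 *	}
 */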
240 __unused static uint32_t
241 load_exclusive32(uint32_t *target, enum memory_order ord)
242 {
243 uint32_t value;
244
245 #if __arm__
246 if (_os_atomic_mo_has_release(ord)) {
247 // Pre-load release barrier
248 atomic_thread_fence(memory_order_release);
249 }
250 value = __builtin_arm_ldrex(target);
251 #else
252 if (_os_atomic_mo_has_acquire(ord)) {
253 value = __builtin_arm_ldaex(target); // ldaxr
254 } else {
255 value = __builtin_arm_ldrex(target); // ldxr
256 }
257 #endif // __arm__
258 return value;
259 }
260
261 __unused static boolean_t
262 store_exclusive32(uint32_t *target, uint32_t value, enum memory_order ord)
263 {
264 boolean_t err;
265
266 #if __arm__
267 err = __builtin_arm_strex(value, target);
268 if (_os_atomic_mo_has_acquire(ord)) {
269 // Post-store acquire barrier
270 atomic_thread_fence(memory_order_acquire);
271 }
272 #else
273 if (_os_atomic_mo_has_release(ord)) {
274 err = __builtin_arm_stlex(value, target); // stlxr
275 } else {
276 err = __builtin_arm_strex(value, target); // stxr
277 }
278 #endif // __arm__
279 return !err;
280 }
281
282 static uint32_t
283 atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order ord)
284 {
285 uint32_t val;
286
287 #if __ARM_ATOMICS_8_1
288 ord = memory_order_relaxed;
289 #endif
290 val = load_exclusive32(target, ord);
291 *previous = val;
292 return val;
293 }
294
295 static boolean_t
296 atomic_exchange_complete32(uint32_t *target, uint32_t previous, uint32_t newval, enum memory_order ord)
297 {
298 #if __ARM_ATOMICS_8_1
299 return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &previous, newval, ord, memory_order_relaxed);
300 #else
301 (void)previous; // Previous not needed, monitor is held
302 return store_exclusive32(target, newval, ord);
303 #endif
304 }
305
306 static void
307 atomic_exchange_abort(void)
308 {
309 os_atomic_clear_exclusive();
310 }
311
312 static boolean_t
313 atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
314 {
315 uint32_t value, prev;
316
317 for (;;) {
318 value = atomic_exchange_begin32(target, &prev, ord);
319 if (value & test_mask) {
320 if (wait) {
321 wait_for_event(); // Wait with monitor held
322 } else {
323 atomic_exchange_abort(); // Clear exclusive monitor
324 }
325 return FALSE;
326 }
327 value |= set_mask;
328 if (atomic_exchange_complete32(target, prev, value, ord)) {
329 return TRUE;
330 }
331 }
332 }
333
334 inline boolean_t
335 hw_atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
336 {
337 return atomic_test_and_set32(target, test_mask, set_mask, ord, wait);
338 }
339
340 /*
341 * To help _disable_preemption() inline everywhere with LTO,
342 * we keep these nice non inlineable functions as the panic()
343 * codegen setup is quite large and for weird reasons causes a frame.
344 */
345 __abortlike
346 static void
347 _disable_preemption_overflow(void)
348 {
349 panic("Preemption count overflow");
350 }
351
352 void
353 _disable_preemption(void)
354 {
355 thread_t thread = current_thread();
356 unsigned int count = thread->machine.preemption_count;
357
358 if (__improbable(++count == 0)) {
359 _disable_preemption_overflow();
360 }
361
362 os_atomic_store(&thread->machine.preemption_count, count, compiler_acq_rel);
363 }
364
365 /*
366 * This function checks whether an AST_URGENT has been pended.
367 *
368 * It is called once the preemption has been reenabled, which means the thread
369 * may have been preempted right before this was called, and when this function
370 * actually performs the check, we've changed CPU.
371 *
372 * This race is however benign: the point of AST_URGENT is to trigger a context
373 * switch, so if one happened, there's nothing left to check for, and AST_URGENT
374 * was cleared in the process.
375 *
376 * It follows that this check cannot have false negatives, which allows us
377 * to avoid fiddling with interrupt state for the vast majority of cases
378 * when the check will actually be negative.
379 */
380 static NOINLINE void
381 kernel_preempt_check(thread_t thread)
382 {
383 cpu_data_t *cpu_data_ptr;
384 long state;
385
386 #if __arm__
387 #define INTERRUPT_MASK PSR_IRQF
388 #else // __arm__
389 #define INTERRUPT_MASK DAIF_IRQF
390 #endif // __arm__
391
392 /*
393 * This check is racy and could load from another CPU's pending_ast mask,
394 * but as described above, this can't have false negatives.
395 */
396 cpu_data_ptr = os_atomic_load(&thread->machine.CpuDatap, compiler_acq_rel);
397 if (__probable((cpu_data_ptr->cpu_pending_ast & AST_URGENT) == 0)) {
398 return;
399 }
400
401 /* If interrupts are masked, we can't take an AST here */
402 state = get_interrupts();
403 if ((state & INTERRUPT_MASK) == 0) {
404 disable_interrupts_noread(); // Disable interrupts
405
406 /*
407 * Reload cpu_data_ptr: a context switch would cause it to change.
408 * Now that interrupts are disabled, this will debounce false positives.
409 */
410 cpu_data_ptr = os_atomic_load(&thread->machine.CpuDatap, compiler_acq_rel);
411 if (thread->machine.CpuDatap->cpu_pending_ast & AST_URGENT) {
412 #if __arm__
413 #if __ARM_USER_PROTECT__
414 uintptr_t up = arm_user_protect_begin(thread);
415 #endif // __ARM_USER_PROTECT__
416 enable_fiq();
417 #endif // __arm__
418 ast_taken_kernel(); // Handle urgent AST
419 #if __arm__
420 #if __ARM_USER_PROTECT__
421 arm_user_protect_end(thread, up, TRUE);
422 #endif // __ARM_USER_PROTECT__
423 enable_interrupts();
424 return; // Return early on arm only due to FIQ enabling
425 #endif // __arm__
426 }
427 restore_interrupts(state); // Enable interrupts
428 }
429 }
430
431 /*
432 * To help _enable_preemption() inline everywhere with LTO,
433 * we keep these nice non inlineable functions as the panic()
434 * codegen setup is quite large and for weird reasons causes a frame.
435 */
436 __abortlike
437 static void
438 _enable_preemption_underflow(void)
439 {
440 panic("Preemption count underflow");
441 }
442
443 void
444 _enable_preemption(void)
445 {
446 thread_t thread = current_thread();
447 unsigned int count = thread->machine.preemption_count;
448
449 if (__improbable(count == 0)) {
450 _enable_preemption_underflow();
451 }
452 count -= 1;
453
454 os_atomic_store(&thread->machine.preemption_count, count, compiler_acq_rel);
455 if (count == 0) {
456 kernel_preempt_check(thread);
457 }
458
459 os_compiler_barrier();
460 }
461
462 int
463 get_preemption_level(void)
464 {
465 return current_thread()->machine.preemption_count;
466 }
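/*
 * Illustrative sketch (assumed caller pattern, not a routine in this
 * file): _disable_preemption()/_enable_preemption() must be balanced,
 * and the urgent-AST check in kernel_preempt_check() only runs once the
 * count returns to zero.
 *
 *	_disable_preemption();
 *	assert(get_preemption_level() > 0);
 *	// ... touch per-CPU state that must not migrate ...
 *	_enable_preemption();		// may take a pending urgent AST here
 */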
467
468 /*
469 * Routine: lck_spin_alloc_init
470 */
471 lck_spin_t *
472 lck_spin_alloc_init(
473 lck_grp_t * grp,
474 lck_attr_t * attr)
475 {
476 lck_spin_t *lck;
477
478 lck = zalloc(ZV_LCK_SPIN);
479 lck_spin_init(lck, grp, attr);
480 return lck;
481 }
482
483 /*
484 * Routine: lck_spin_free
485 */
486 void
487 lck_spin_free(
488 lck_spin_t * lck,
489 lck_grp_t * grp)
490 {
491 lck_spin_destroy(lck, grp);
492 zfree(ZV_LCK_SPIN, lck);
493 }
494
495 /*
496 * Routine: lck_spin_init
497 */
498 void
499 lck_spin_init(
500 lck_spin_t * lck,
501 lck_grp_t * grp,
502 __unused lck_attr_t * attr)
503 {
504 lck->type = LCK_SPIN_TYPE;
505 hw_lock_init(&lck->hwlock);
506 if (grp) {
507 lck_grp_reference(grp);
508 lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
509 }
510 }
511
512 /*
513 * arm_usimple_lock is a lck_spin_t without a group or attributes
514 */
515 MARK_AS_HIBERNATE_TEXT void inline
516 arm_usimple_lock_init(simple_lock_t lck, __unused unsigned short initial_value)
517 {
518 lck->type = LCK_SPIN_TYPE;
519 hw_lock_init(&lck->hwlock);
520 }
521
522
523 /*
524 * Routine: lck_spin_lock
525 */
526 void
527 lck_spin_lock(lck_spin_t *lock)
528 {
529 #if DEVELOPMENT || DEBUG
530 if (lock->type != LCK_SPIN_TYPE) {
531 panic("Invalid spinlock %p", lock);
532 }
533 #endif // DEVELOPMENT || DEBUG
534 hw_lock_lock(&lock->hwlock, LCK_GRP_NULL);
535 }
536
537 void
538 lck_spin_lock_grp(lck_spin_t *lock, lck_grp_t *grp)
539 {
540 #pragma unused(grp)
541 #if DEVELOPMENT || DEBUG
542 if (lock->type != LCK_SPIN_TYPE) {
543 panic("Invalid spinlock %p", lock);
544 }
545 #endif // DEVELOPMENT || DEBUG
546 hw_lock_lock(&lock->hwlock, grp);
547 }
548
549 /*
550 * Routine: lck_spin_lock_nopreempt
551 */
552 void
553 lck_spin_lock_nopreempt(lck_spin_t *lock)
554 {
555 #if DEVELOPMENT || DEBUG
556 if (lock->type != LCK_SPIN_TYPE) {
557 panic("Invalid spinlock %p", lock);
558 }
559 #endif // DEVELOPMENT || DEBUG
560 hw_lock_lock_nopreempt(&lock->hwlock, LCK_GRP_NULL);
561 }
562
563 void
564 lck_spin_lock_nopreempt_grp(lck_spin_t *lock, lck_grp_t *grp)
565 {
566 #pragma unused(grp)
567 #if DEVELOPMENT || DEBUG
568 if (lock->type != LCK_SPIN_TYPE) {
569 panic("Invalid spinlock %p", lock);
570 }
571 #endif // DEVELOPMENT || DEBUG
572 hw_lock_lock_nopreempt(&lock->hwlock, grp);
573 }
574
575 /*
576 * Routine: lck_spin_try_lock
577 */
578 int
579 lck_spin_try_lock(lck_spin_t *lock)
580 {
581 return hw_lock_try(&lock->hwlock, LCK_GRP_NULL);
582 }
583
584 int
585 lck_spin_try_lock_grp(lck_spin_t *lock, lck_grp_t *grp)
586 {
587 #pragma unused(grp)
588 return hw_lock_try(&lock->hwlock, grp);
589 }
590
591 /*
592 * Routine: lck_spin_try_lock_nopreempt
593 */
594 int
595 lck_spin_try_lock_nopreempt(lck_spin_t *lock)
596 {
597 return hw_lock_try_nopreempt(&lock->hwlock, LCK_GRP_NULL);
598 }
599
600 int
601 lck_spin_try_lock_nopreempt_grp(lck_spin_t *lock, lck_grp_t *grp)
602 {
603 #pragma unused(grp)
604 return hw_lock_try_nopreempt(&lock->hwlock, grp);
605 }
606
607 /*
608 * Routine: lck_spin_unlock
609 */
610 void
611 lck_spin_unlock(lck_spin_t *lock)
612 {
613 #if DEVELOPMENT || DEBUG
614 if ((LCK_MTX_STATE_TO_THREAD(lock->lck_spin_data) != current_thread()) && LOCK_CORRECTNESS_PANIC()) {
615 panic("Spinlock not owned by thread %p = %lx", lock, lock->lck_spin_data);
616 }
617 if (lock->type != LCK_SPIN_TYPE) {
618 panic("Invalid spinlock type %p", lock);
619 }
620 #endif // DEVELOPMENT || DEBUG
621 hw_lock_unlock(&lock->hwlock);
622 }
623
624 /*
625 * Routine: lck_spin_unlock_nopreempt
626 */
627 void
628 lck_spin_unlock_nopreempt(lck_spin_t *lock)
629 {
630 #if DEVELOPMENT || DEBUG
631 if ((LCK_MTX_STATE_TO_THREAD(lock->lck_spin_data) != current_thread()) && LOCK_CORRECTNESS_PANIC()) {
632 panic("Spinlock not owned by thread %p = %lx", lock, lock->lck_spin_data);
633 }
634 if (lock->type != LCK_SPIN_TYPE) {
635 panic("Invalid spinlock type %p", lock);
636 }
637 #endif // DEVELOPMENT || DEBUG
638 hw_lock_unlock_nopreempt(&lock->hwlock);
639 }
640
641 /*
642 * Routine: lck_spin_destroy
643 */
644 void
645 lck_spin_destroy(
646 lck_spin_t * lck,
647 lck_grp_t * grp)
648 {
649 if (lck->lck_spin_data == LCK_SPIN_TAG_DESTROYED) {
650 return;
651 }
652 lck->lck_spin_data = LCK_SPIN_TAG_DESTROYED;
653 if (grp) {
654 lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
655 lck_grp_deallocate(grp);
656 }
657 }
658
659 /*
660 * Routine: kdp_lck_spin_is_acquired
661 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
662 */
663 boolean_t
664 kdp_lck_spin_is_acquired(lck_spin_t *lck)
665 {
666 if (not_in_kdp) {
667 panic("spinlock acquired check done outside of kernel debugger");
668 }
669 return ((lck->lck_spin_data & ~LCK_SPIN_TAG_DESTROYED) != 0) ? TRUE:FALSE;
670 }
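/*
 * Illustrative spin lock lifecycle sketch (assumes a caller-provided
 * lck_grp_t named my_grp; not part of this file):
 *
 *	lck_spin_t *sl = lck_spin_alloc_init(my_grp, LCK_ATTR_NULL);
 *
 *	lck_spin_lock(sl);		// returns with preemption disabled
 *	// ... short, non-blocking critical section ...
 *	lck_spin_unlock(sl);
 *
 *	lck_spin_free(sl, my_grp);	// destroys and frees back to ZV_LCK_SPIN
 */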
671
672 /*
673 * Initialize a usimple_lock.
674 *
675 * No change in preemption state.
676 */
677 void
678 usimple_lock_init(
679 usimple_lock_t l,
680 unsigned short tag)
681 {
682 simple_lock_init((simple_lock_t) l, tag);
683 }
684
685
686 /*
687 * Acquire a usimple_lock.
688 *
689 * Returns with preemption disabled. Note
690 * that the hw_lock routines are responsible for
691 * maintaining preemption state.
692 */
693 void
694 (usimple_lock)(
695 usimple_lock_t l
696 LCK_GRP_ARG(lck_grp_t *grp))
697 {
698 simple_lock((simple_lock_t) l, LCK_GRP_PROBEARG(grp));
699 }
700
701
702 extern void sync(void);
703
704 /*
705 * Release a usimple_lock.
706 *
707 * Returns with preemption enabled. Note
708 * that the hw_lock routines are responsible for
709 * maintaining preemption state.
710 */
711 void
712 (usimple_unlock)(
713 usimple_lock_t l)
714 {
715 simple_unlock((simple_lock_t)l);
716 }
717
718
719 /*
720 * Conditionally acquire a usimple_lock.
721 *
722 * On success, returns with preemption disabled.
723 * On failure, returns with preemption in the same state
724 * as when first invoked. Note that the hw_lock routines
725 * are responsible for maintaining preemption state.
726 *
727 * XXX No stats are gathered on a miss; I preserved this
728 * behavior from the original assembly-language code, but
729 * doesn't it make sense to log misses? XXX
730 */
731 unsigned
732 int
733 (usimple_lock_try)(
734 usimple_lock_t l
735 LCK_GRP_ARG(lck_grp_t *grp))
736 {
737 return simple_lock_try((simple_lock_t) l, grp);
738 }
739
740 /*
741 * The C portion of the shared/exclusive locks package.
742 */
743
744 /*
745 * compute the deadline to spin against when
746 * waiting for a change of state on a lck_rw_t
747 */
748 static inline uint64_t
749 lck_rw_deadline_for_spin(lck_rw_t *lck)
750 {
751 lck_rw_word_t word;
752
753 word.data = ordered_load_rw(lck);
754 if (word.can_sleep) {
755 if (word.r_waiting || word.w_waiting || (word.shared_count > machine_info.max_cpus)) {
756 /*
757 * there are already threads waiting on this lock... this
758 * implies that they have spun beyond their deadlines waiting for
759 * the desired state to show up so we will not bother spinning at this time...
760 * or
761 * the current number of threads sharing this lock exceeds our capacity to run them
762 * concurrently and since all states we're going to spin for require the rw_shared_count
763 * to be at 0, we'll not bother spinning since the latency for this to happen is
764 * unpredictable...
765 */
766 return mach_absolute_time();
767 }
768 return mach_absolute_time() + MutexSpin;
769 } else {
770 return mach_absolute_time() + (100000LL * 1000000000LL);
771 }
772 }
773
774 static boolean_t
775 lck_rw_drain_status(lck_rw_t *lock, uint32_t status_mask, boolean_t wait __unused)
776 {
777 uint64_t deadline = 0;
778 uint32_t data;
779
780 if (wait) {
781 deadline = lck_rw_deadline_for_spin(lock);
782 }
783
784 for (;;) {
785 data = load_exclusive32(&lock->lck_rw_data, memory_order_acquire_smp);
786 if ((data & status_mask) == 0) {
787 break;
788 }
789 if (wait) {
790 wait_for_event();
791 } else {
792 os_atomic_clear_exclusive();
793 }
794 if (!wait || (mach_absolute_time() >= deadline)) {
795 return FALSE;
796 }
797 }
798 os_atomic_clear_exclusive();
799 return TRUE;
800 }
801
802 /*
803 * Spin while interlock is held.
804 */
805 static inline void
806 lck_rw_interlock_spin(lck_rw_t *lock)
807 {
808 uint32_t data;
809
810 for (;;) {
811 data = load_exclusive32(&lock->lck_rw_data, memory_order_relaxed);
812 if (data & LCK_RW_INTERLOCK) {
813 wait_for_event();
814 } else {
815 os_atomic_clear_exclusive();
816 return;
817 }
818 }
819 }
820
821 /*
822 * We disable interrupts while holding the RW interlock to prevent an
823 * interrupt from exacerbating hold time.
824 * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
825 */
826 static inline boolean_t
827 lck_interlock_lock(lck_rw_t *lck)
828 {
829 boolean_t istate;
830
831 istate = ml_set_interrupts_enabled(FALSE);
832 lck_rw_ilk_lock(lck);
833 return istate;
834 }
835
836 static inline void
837 lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
838 {
839 lck_rw_ilk_unlock(lck);
840 ml_set_interrupts_enabled(istate);
841 }
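/*
 * Illustrative sketch of the interlock pattern used by the slow paths
 * below (not a routine itself):
 *
 *	boolean_t istate = lck_interlock_lock(lock);	// IRQs off, interlock held
 *	// ... examine/update the lck_rw_t word, queue a wait if needed ...
 *	lck_interlock_unlock(lock, istate);		// drop interlock, restore IRQs
 */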
842
843
844 #define LCK_RW_GRAB_WANT 0
845 #define LCK_RW_GRAB_SHARED 1
846
847 static boolean_t
848 lck_rw_grab(lck_rw_t *lock, int mode, boolean_t wait)
849 {
850 uint64_t deadline = 0;
851 uint32_t data, prev;
852 boolean_t do_exch;
853
854 if (wait) {
855 deadline = lck_rw_deadline_for_spin(lock);
856 }
857
858 for (;;) {
859 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
860 if (data & LCK_RW_INTERLOCK) {
861 atomic_exchange_abort();
862 lck_rw_interlock_spin(lock);
863 continue;
864 }
865 do_exch = FALSE;
866 if (mode == LCK_RW_GRAB_WANT) {
867 if ((data & LCK_RW_WANT_EXCL) == 0) {
868 data |= LCK_RW_WANT_EXCL;
869 do_exch = TRUE;
870 }
871 } else { // LCK_RW_GRAB_SHARED
872 if (((data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) == 0) ||
873 (((data & LCK_RW_SHARED_MASK)) && ((data & LCK_RW_PRIV_EXCL) == 0))) {
874 data += LCK_RW_SHARED_READER;
875 do_exch = TRUE;
876 }
877 }
878 if (do_exch) {
879 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
880 return TRUE;
881 }
882 } else {
883 if (wait) { // Wait with monitor held
884 wait_for_event();
885 } else {
886 atomic_exchange_abort();
887 }
888 if (!wait || (mach_absolute_time() >= deadline)) {
889 return FALSE;
890 }
891 }
892 }
893 }
894
895
896 /*
897 * Routine: lck_rw_alloc_init
898 */
899 lck_rw_t *
900 lck_rw_alloc_init(
901 lck_grp_t *grp,
902 lck_attr_t *attr)
903 {
904 lck_rw_t *lck;
905
906 lck = zalloc_flags(ZV_LCK_RW, Z_WAITOK | Z_ZERO);
907 lck_rw_init(lck, grp, attr);
908 return lck;
909 }
910
911 /*
912 * Routine: lck_rw_free
913 */
914 void
915 lck_rw_free(
916 lck_rw_t *lck,
917 lck_grp_t *grp)
918 {
919 lck_rw_destroy(lck, grp);
920 zfree(ZV_LCK_RW, lck);
921 }
922
923 /*
924 * Routine: lck_rw_init
925 */
926 void
927 lck_rw_init(
928 lck_rw_t *lck,
929 lck_grp_t *grp,
930 lck_attr_t *attr)
931 {
932 if (attr == LCK_ATTR_NULL) {
933 attr = &LockDefaultLckAttr;
934 }
935 memset(lck, 0, sizeof(lck_rw_t));
936 lck->lck_rw_can_sleep = TRUE;
937 if ((attr->lck_attr_val & LCK_ATTR_RW_SHARED_PRIORITY) == 0) {
938 lck->lck_rw_priv_excl = TRUE;
939 }
940
941 lck_grp_reference(grp);
942 lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
943 }
944
945
946 /*
947 * Routine: lck_rw_destroy
948 */
949 void
950 lck_rw_destroy(
951 lck_rw_t *lck,
952 lck_grp_t *grp)
953 {
954 if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED) {
955 return;
956 }
957 #if MACH_LDEBUG
958 lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
959 #endif
960 lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
961 lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
962 lck_grp_deallocate(grp);
963 return;
964 }
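/*
 * Illustrative RW lock lifecycle sketch (assumes a caller-provided
 * lck_grp_t named my_grp; not part of this file):
 *
 *	lck_rw_t *rw = lck_rw_alloc_init(my_grp, LCK_ATTR_NULL);
 *
 *	lck_rw_lock_shared(rw);			// many readers may hold this
 *	// ... read-only access ...
 *	lck_rw_unlock_shared(rw);
 *
 *	lck_rw_lock_exclusive(rw);		// single writer
 *	// ... mutation ...
 *	lck_rw_unlock_exclusive(rw);
 *
 *	lck_rw_free(rw, my_grp);		// destroys and frees back to ZV_LCK_RW
 */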
965
966 /*
967 * Routine: lck_rw_lock
968 */
969 void
970 lck_rw_lock(
971 lck_rw_t *lck,
972 lck_rw_type_t lck_rw_type)
973 {
974 if (lck_rw_type == LCK_RW_TYPE_SHARED) {
975 lck_rw_lock_shared(lck);
976 } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
977 lck_rw_lock_exclusive(lck);
978 } else {
979 panic("lck_rw_lock(): Invalid RW lock type: %x", lck_rw_type);
980 }
981 }
982
983 #define LCK_RW_LOCK_EXCLUSIVE_TAS(lck) (atomic_test_and_set32(&(lck)->lck_rw_data, \
984 (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK), \
985 LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE))
986
987 /*
988 * Routine: lck_rw_lock_exclusive_check_contended
989 */
990 bool
991 lck_rw_lock_exclusive_check_contended(lck_rw_t *lock)
992 {
993 thread_t thread = current_thread();
994 bool contended = false;
995
996 if (lock->lck_rw_can_sleep) {
997 thread->rwlock_count++;
998 } else if (get_preemption_level() == 0) {
999 panic("Taking non-sleepable RW lock with preemption enabled");
1000 }
1001 if (LCK_RW_LOCK_EXCLUSIVE_TAS(lock)) {
1002 #if CONFIG_DTRACE
1003 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1004 #endif /* CONFIG_DTRACE */
1005 } else {
1006 contended = true;
1007 lck_rw_lock_exclusive_gen(lock);
1008 }
1009 #if MACH_ASSERT
1010 thread_t owner = ordered_load_rw_owner(lock);
1011 assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1012 #endif
1013 ordered_store_rw_owner(lock, thread);
1014 return contended;
1015 }
1016
1017 /*
1018 * Routine: lck_rw_lock_exclusive
1019 */
1020 void
1021 lck_rw_lock_exclusive(lck_rw_t *lock)
1022 {
1023 thread_t thread = current_thread();
1024
1025 if (lock->lck_rw_can_sleep) {
1026 thread->rwlock_count++;
1027 } else if (get_preemption_level() == 0) {
1028 panic("Taking non-sleepable RW lock with preemption enabled");
1029 }
1030 if (LCK_RW_LOCK_EXCLUSIVE_TAS(lock)) {
1031 #if CONFIG_DTRACE
1032 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1033 #endif /* CONFIG_DTRACE */
1034 } else {
1035 lck_rw_lock_exclusive_gen(lock);
1036 }
1037 #if MACH_ASSERT
1038 thread_t owner = ordered_load_rw_owner(lock);
1039 assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1040 #endif
1041 ordered_store_rw_owner(lock, thread);
1042 }
1043
1044 /*
1045 * Routine: lck_rw_lock_shared
1046 */
1047 void
1048 lck_rw_lock_shared(lck_rw_t *lock)
1049 {
1050 uint32_t data, prev;
1051
1052 if (lock->lck_rw_can_sleep) {
1053 current_thread()->rwlock_count++;
1054 } else if (get_preemption_level() == 0) {
1055 panic("Taking non-sleepable RW lock with preemption enabled");
1056 }
1057 for (;;) {
1058 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1059 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK)) {
1060 atomic_exchange_abort();
1061 lck_rw_lock_shared_gen(lock);
1062 break;
1063 }
1064 data += LCK_RW_SHARED_READER;
1065 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1066 break;
1067 }
1068 cpu_pause();
1069 }
1070 #if MACH_ASSERT
1071 thread_t owner = ordered_load_rw_owner(lock);
1072 assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1073 #endif
1074 #if CONFIG_DTRACE
1075 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1076 #endif /* CONFIG_DTRACE */
1077 return;
1078 }
1079
1080 /*
1081 * Routine: lck_rw_lock_shared_to_exclusive
1082 *
1083 * FALSE is returned upon failure; in that case the shared lock has been dropped.
1084 */
1085 boolean_t
1086 lck_rw_lock_shared_to_exclusive(lck_rw_t *lock)
1087 {
1088 uint32_t data, prev;
1089
1090 for (;;) {
1091 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1092 if (data & LCK_RW_INTERLOCK) {
1093 atomic_exchange_abort();
1094 lck_rw_interlock_spin(lock);
1095 continue;
1096 }
1097 if (data & LCK_RW_WANT_UPGRADE) {
1098 data -= LCK_RW_SHARED_READER;
1099 if ((data & LCK_RW_SHARED_MASK) == 0) { /* we were the last reader */
1100 data &= ~(LCK_RW_W_WAITING); /* so clear the wait indicator */
1101 }
1102 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1103 return lck_rw_lock_shared_to_exclusive_failure(lock, prev);
1104 }
1105 } else {
1106 data |= LCK_RW_WANT_UPGRADE; /* ask for WANT_UPGRADE */
1107 data -= LCK_RW_SHARED_READER; /* and shed our read count */
1108 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1109 break;
1110 }
1111 }
1112 cpu_pause();
1113 }
1114 /* we now own the WANT_UPGRADE */
1115 if (data & LCK_RW_SHARED_MASK) { /* check to see if all of the readers are drained */
1116 lck_rw_lock_shared_to_exclusive_success(lock); /* if not, we need to go wait */
1117 }
1118 #if MACH_ASSERT
1119 thread_t owner = ordered_load_rw_owner(lock);
1120 assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1121 #endif
1122 ordered_store_rw_owner(lock, current_thread());
1123 #if CONFIG_DTRACE
1124 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 0);
1125 #endif /* CONFIG_DTRACE */
1126 return TRUE;
1127 }
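/*
 * Illustrative upgrade sketch (assumed caller pattern, not a routine in
 * this file): on failure the shared hold has already been dropped, so
 * the caller must re-acquire the lock and typically re-validate state.
 *
 *	lck_rw_lock_shared(rw);
 *	if (!lck_rw_lock_shared_to_exclusive(rw)) {
 *		// shared hold was dropped for us; start over
 *		lck_rw_lock_exclusive(rw);
 *		// ... re-validate whatever was observed under the shared hold ...
 *	}
 *	// ... exclusive work ...
 *	lck_rw_unlock_exclusive(rw);
 */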
1128
1129
1130 /*
1131 * Routine: lck_rw_lock_shared_to_exclusive_failure
1132 * Function:
1133 * Fast path code has already dropped our read
1134 * count and determined that someone else owns 'lck_rw_want_upgrade'
1135 * if 'lck_rw_shared_count' == 0, it has also already dropped 'lck_w_waiting';
1136 * all we need to do here is determine if a wakeup is needed
1137 */
1138 static boolean_t
1139 lck_rw_lock_shared_to_exclusive_failure(
1140 lck_rw_t *lck,
1141 uint32_t prior_lock_state)
1142 {
1143 thread_t thread = current_thread();
1144 uint32_t rwlock_count;
1145
1146 /* Check if dropping the lock means that we need to unpromote */
1147 if (lck->lck_rw_can_sleep) {
1148 rwlock_count = thread->rwlock_count--;
1149 } else {
1150 rwlock_count = UINT32_MAX;
1151 }
1152 #if MACH_LDEBUG
1153 if (rwlock_count == 0) {
1154 panic("rw lock count underflow for thread %p", thread);
1155 }
1156 #endif
1157 if ((prior_lock_state & LCK_RW_W_WAITING) &&
1158 ((prior_lock_state & LCK_RW_SHARED_MASK) == LCK_RW_SHARED_READER)) {
1159 /*
1160 * Someone else has requested upgrade.
1161 * Since we've released the read lock, wake
1162 * him up if he's blocked waiting
1163 */
1164 thread_wakeup(LCK_RW_WRITER_EVENT(lck));
1165 }
1166
1167 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1168 /* sched_flags checked without lock, but will be rechecked while clearing */
1169 lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
1170 }
1171
1172 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
1173 VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
1174
1175 return FALSE;
1176 }
1177
1178 /*
1179 * Routine: lck_rw_lock_shared_to_exclusive_success
1180 * Function:
1181 * fast path code has already dropped our read
1182 * count and successfully acquired 'lck_rw_want_upgrade'
1183 * we just need to wait for the rest of the readers to drain
1184 * and then we can return as the exclusive holder of this lock
1185 */
1186 static boolean_t
1187 lck_rw_lock_shared_to_exclusive_success(
1188 lck_rw_t *lock)
1189 {
1190 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
1191 int slept = 0;
1192 lck_rw_word_t word;
1193 wait_result_t res;
1194 boolean_t istate;
1195 boolean_t not_shared;
1196
1197 #if CONFIG_DTRACE
1198 uint64_t wait_interval = 0;
1199 int readers_at_sleep = 0;
1200 boolean_t dtrace_ls_initialized = FALSE;
1201 boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
1202 #endif
1203
1204 while (!lck_rw_drain_status(lock, LCK_RW_SHARED_MASK, FALSE)) {
1205 word.data = ordered_load_rw(lock);
1206 #if CONFIG_DTRACE
1207 if (dtrace_ls_initialized == FALSE) {
1208 dtrace_ls_initialized = TRUE;
1209 dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
1210 dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
1211 dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
1212 if (dtrace_ls_enabled) {
1213 /*
1214 * Either sleeping or spinning is happening,
1215 * start a timing of our delay interval now.
1216 */
1217 readers_at_sleep = word.shared_count;
1218 wait_interval = mach_absolute_time();
1219 }
1220 }
1221 #endif
1222
1223 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
1224 trace_lck, word.shared_count, 0, 0, 0);
1225
1226 not_shared = lck_rw_drain_status(lock, LCK_RW_SHARED_MASK, TRUE);
1227
1228 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
1229 trace_lck, lock->lck_rw_shared_count, 0, 0, 0);
1230
1231 if (not_shared) {
1232 break;
1233 }
1234
1235 /*
1236 * if we get here, the spin deadline in lck_rw_drain_status()
1237 * has expired w/o the rw_shared_count having drained to 0
1238 * check to see if we're allowed to do a thread_block
1239 */
1240 if (word.can_sleep) {
1241 istate = lck_interlock_lock(lock);
1242
1243 word.data = ordered_load_rw(lock);
1244 if (word.shared_count != 0) {
1245 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
1246 trace_lck, word.shared_count, 0, 0, 0);
1247
1248 word.w_waiting = 1;
1249 ordered_store_rw(lock, word.data);
1250
1251 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade);
1252 res = assert_wait(LCK_RW_WRITER_EVENT(lock),
1253 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1254 lck_interlock_unlock(lock, istate);
1255
1256 if (res == THREAD_WAITING) {
1257 res = thread_block(THREAD_CONTINUE_NULL);
1258 slept++;
1259 }
1260 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
1261 trace_lck, res, slept, 0, 0);
1262 } else {
1263 lck_interlock_unlock(lock, istate);
1264 break;
1265 }
1266 }
1267 }
1268 #if CONFIG_DTRACE
1269 /*
1270 * We infer whether we took the sleep/spin path above by checking readers_at_sleep.
1271 */
1272 if (dtrace_ls_enabled == TRUE) {
1273 if (slept == 0) {
1274 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lock, mach_absolute_time() - wait_interval, 0);
1275 } else {
1276 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lock,
1277 mach_absolute_time() - wait_interval, 1,
1278 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1279 }
1280 }
1281 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 1);
1282 #endif
1283 return TRUE;
1284 }
1285
1286
1287 /*
1288 * Routine: lck_rw_lock_exclusive_to_shared
1289 */
1290
1291 void
1292 lck_rw_lock_exclusive_to_shared(lck_rw_t *lock)
1293 {
1294 uint32_t data, prev;
1295
1296 assertf(lock->lck_rw_owner == current_thread(), "state=0x%x, owner=%p", lock->lck_rw_data, lock->lck_rw_owner);
1297 ordered_store_rw_owner(lock, THREAD_NULL);
1298 for (;;) {
1299 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_release_smp);
1300 if (data & LCK_RW_INTERLOCK) {
1301 atomic_exchange_abort();
1302 lck_rw_interlock_spin(lock); /* wait for interlock to clear */
1303 continue;
1304 }
1305 data += LCK_RW_SHARED_READER;
1306 if (data & LCK_RW_WANT_UPGRADE) {
1307 data &= ~(LCK_RW_WANT_UPGRADE);
1308 } else {
1309 data &= ~(LCK_RW_WANT_EXCL);
1310 }
1311 if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL))) {
1312 data &= ~(LCK_RW_W_WAITING);
1313 }
1314 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_release_smp)) {
1315 break;
1316 }
1317 cpu_pause();
1318 }
1319 return lck_rw_lock_exclusive_to_shared_gen(lock, prev);
1320 }
1321
1322 /*
1323 * Routine: lck_rw_lock_exclusive_to_shared_gen
1324 * Function:
1325 * Fast path has already dropped
1326 * our exclusive state and bumped lck_rw_shared_count
1327 * all we need to do here is determine if anyone
1328 * needs to be awakened.
1329 */
1330 static void
1331 lck_rw_lock_exclusive_to_shared_gen(
1332 lck_rw_t *lck,
1333 uint32_t prior_lock_state)
1334 {
1335 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1336 lck_rw_word_t fake_lck;
1337
1338 /*
1339 * prior_lock_state is a snapshot of the 1st word of the
1340 * lock in question... we fake up a lck_rw_word_t from it
1341 * and carefully do not access anything beyond what's defined
1342 * in the first word of a lck_rw_t
1343 */
1344 fake_lck.data = prior_lock_state;
1345
1346 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
1347 trace_lck, fake_lck.want_excl, fake_lck.want_upgrade, 0, 0);
1348
1349 /*
1350 * don't wake up anyone waiting to take the lock exclusively
1351 * since we hold a read count... when the read count drops to 0,
1352 * the writers will be woken.
1353 *
1354 * wake up any waiting readers if we don't have any writers waiting,
1355 * or the lock is NOT marked as rw_priv_excl (writers have privilege)
1356 */
1357 if (!(fake_lck.priv_excl && fake_lck.w_waiting) && fake_lck.r_waiting) {
1358 thread_wakeup(LCK_RW_READER_EVENT(lck));
1359 }
1360
1361 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
1362 trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
1363
1364 #if CONFIG_DTRACE
1365 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
1366 #endif
1367 }
1368
1369
1370 /*
1371 * Routine: lck_rw_try_lock
1372 */
1373 boolean_t
1374 lck_rw_try_lock(
1375 lck_rw_t *lck,
1376 lck_rw_type_t lck_rw_type)
1377 {
1378 if (lck_rw_type == LCK_RW_TYPE_SHARED) {
1379 return lck_rw_try_lock_shared(lck);
1380 } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
1381 return lck_rw_try_lock_exclusive(lck);
1382 } else {
1383 panic("lck_rw_try_lock(): Invalid rw lock type: %x", lck_rw_type);
1384 }
1385 return FALSE;
1386 }
1387
1388 /*
1389 * Routine: lck_rw_try_lock_shared
1390 */
1391
1392 boolean_t
1393 lck_rw_try_lock_shared(lck_rw_t *lock)
1394 {
1395 uint32_t data, prev;
1396
1397 for (;;) {
1398 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1399 if (data & LCK_RW_INTERLOCK) {
1400 atomic_exchange_abort();
1401 lck_rw_interlock_spin(lock);
1402 continue;
1403 }
1404 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1405 atomic_exchange_abort();
1406 return FALSE; /* lock is busy */
1407 }
1408 data += LCK_RW_SHARED_READER; /* Increment reader refcount */
1409 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1410 break;
1411 }
1412 cpu_pause();
1413 }
1414 #if MACH_ASSERT
1415 thread_t owner = ordered_load_rw_owner(lock);
1416 assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1417 #endif
1418
1419 if (lock->lck_rw_can_sleep) {
1420 current_thread()->rwlock_count++;
1421 } else if (get_preemption_level() == 0) {
1422 panic("Taking non-sleepable RW lock with preemption enabled");
1423 }
1424
1425 #if CONFIG_DTRACE
1426 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1427 #endif /* CONFIG_DTRACE */
1428 return TRUE;
1429 }
1430
1431
1432 /*
1433 * Routine: lck_rw_try_lock_exclusive
1434 */
1435
1436 boolean_t
1437 lck_rw_try_lock_exclusive(lck_rw_t *lock)
1438 {
1439 uint32_t data, prev;
1440 thread_t thread;
1441
1442 for (;;) {
1443 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1444 if (data & LCK_RW_INTERLOCK) {
1445 atomic_exchange_abort();
1446 lck_rw_interlock_spin(lock);
1447 continue;
1448 }
1449 if (data & (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1450 atomic_exchange_abort();
1451 return FALSE;
1452 }
1453 data |= LCK_RW_WANT_EXCL;
1454 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1455 break;
1456 }
1457 cpu_pause();
1458 }
1459 thread = current_thread();
1460 if (lock->lck_rw_can_sleep) {
1461 thread->rwlock_count++;
1462 } else if (get_preemption_level() == 0) {
1463 panic("Taking non-sleepable RW lock with preemption enabled");
1464 }
1465 #if MACH_ASSERT
1466 thread_t owner = ordered_load_rw_owner(lock);
1467 assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1468 #endif
1469 ordered_store_rw_owner(lock, thread);
1470 #if CONFIG_DTRACE
1471 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1472 #endif /* CONFIG_DTRACE */
1473 return TRUE;
1474 }
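/*
 * Illustrative try-lock sketch (assumed caller pattern, not a routine in
 * this file): try-locks fail immediately instead of spinning or
 * blocking, so the caller needs a fallback path.
 *
 *	if (lck_rw_try_lock_exclusive(rw)) {
 *		// ... exclusive work ...
 *		lck_rw_unlock_exclusive(rw);
 *	} else {
 *		// lock was busy; defer, retry later, or take the blocking path
 *	}
 */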
1475
1476
1477 /*
1478 * Routine: lck_rw_unlock
1479 */
1480 void
1481 lck_rw_unlock(
1482 lck_rw_t *lck,
1483 lck_rw_type_t lck_rw_type)
1484 {
1485 if (lck_rw_type == LCK_RW_TYPE_SHARED) {
1486 lck_rw_unlock_shared(lck);
1487 } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
1488 lck_rw_unlock_exclusive(lck);
1489 } else {
1490 panic("lck_rw_unlock(): Invalid RW lock type: %d", lck_rw_type);
1491 }
1492 }
1493
1494
1495 /*
1496 * Routine: lck_rw_unlock_shared
1497 */
1498 void
1499 lck_rw_unlock_shared(
1500 lck_rw_t *lck)
1501 {
1502 lck_rw_type_t ret;
1503
1504 assertf(lck->lck_rw_owner == THREAD_NULL, "state=0x%x, owner=%p", lck->lck_rw_data, lck->lck_rw_owner);
1505 assertf(lck->lck_rw_shared_count > 0, "shared_count=0x%x", lck->lck_rw_shared_count);
1506 ret = lck_rw_done(lck);
1507
1508 if (ret != LCK_RW_TYPE_SHARED) {
1509 panic("lck_rw_unlock_shared(): lock %p held in mode: %d", lck, ret);
1510 }
1511 }
1512
1513
1514 /*
1515 * Routine: lck_rw_unlock_exclusive
1516 */
1517 void
1518 lck_rw_unlock_exclusive(
1519 lck_rw_t *lck)
1520 {
1521 lck_rw_type_t ret;
1522
1523 assertf(lck->lck_rw_owner == current_thread(), "state=0x%x, owner=%p", lck->lck_rw_data, lck->lck_rw_owner);
1524 ret = lck_rw_done(lck);
1525
1526 if (ret != LCK_RW_TYPE_EXCLUSIVE) {
1527 panic("lck_rw_unlock_exclusive(): lock %p held in mode: %d", lck, ret);
1528 }
1529 }
1530
1531
1532 /*
1533 * Routine: lck_rw_lock_exclusive_gen
1534 */
1535 static void
1536 lck_rw_lock_exclusive_gen(
1537 lck_rw_t *lock)
1538 {
1539 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
1540 lck_rw_word_t word;
1541 int slept = 0;
1542 boolean_t gotlock = 0;
1543 boolean_t not_shared_or_upgrade = 0;
1544 wait_result_t res = 0;
1545 boolean_t istate;
1546
1547 #if CONFIG_DTRACE
1548 boolean_t dtrace_ls_initialized = FALSE;
1549 boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled = FALSE;
1550 uint64_t wait_interval = 0;
1551 int readers_at_sleep = 0;
1552 #endif
1553
1554 /*
1555 * Try to acquire the lck_rw_want_excl bit.
1556 */
1557 while (!lck_rw_grab(lock, LCK_RW_GRAB_WANT, FALSE)) {
1558 #if CONFIG_DTRACE
1559 if (dtrace_ls_initialized == FALSE) {
1560 dtrace_ls_initialized = TRUE;
1561 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1562 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1563 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1564 if (dtrace_ls_enabled) {
1565 /*
1566 * Either sleeping or spinning is happening,
1567 * start a timing of our delay interval now.
1568 */
1569 readers_at_sleep = lock->lck_rw_shared_count;
1570 wait_interval = mach_absolute_time();
1571 }
1572 }
1573 #endif
1574
1575 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1576
1577 gotlock = lck_rw_grab(lock, LCK_RW_GRAB_WANT, TRUE);
1578
1579 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, gotlock, 0);
1580
1581 if (gotlock) {
1582 break;
1583 }
1584 /*
1585 * if we get here, the deadline has expired w/o us
1586 * being able to grab the lock exclusively
1587 * check to see if we're allowed to do a thread_block
1588 */
1589 word.data = ordered_load_rw(lock);
1590 if (word.can_sleep) {
1591 istate = lck_interlock_lock(lock);
1592 word.data = ordered_load_rw(lock);
1593
1594 if (word.want_excl) {
1595 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1596
1597 word.w_waiting = 1;
1598 ordered_store_rw(lock, word.data);
1599
1600 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
1601 res = assert_wait(LCK_RW_WRITER_EVENT(lock),
1602 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1603 lck_interlock_unlock(lock, istate);
1604
1605 if (res == THREAD_WAITING) {
1606 res = thread_block(THREAD_CONTINUE_NULL);
1607 slept++;
1608 }
1609 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1610 } else {
1611 word.want_excl = 1;
1612 ordered_store_rw(lock, word.data);
1613 lck_interlock_unlock(lock, istate);
1614 break;
1615 }
1616 }
1617 }
1618 /*
1619 * Wait for readers (and upgrades) to finish...
1620 */
1621 while (!lck_rw_drain_status(lock, LCK_RW_SHARED_MASK | LCK_RW_WANT_UPGRADE, FALSE)) {
1622 #if CONFIG_DTRACE
1623 /*
1624 * Either sleeping or spinning is happening, start
1625 * a timing of our delay interval now. If we set it
1626 * to -1 we don't have accurate data so we cannot later
1627 * decide to record a dtrace spin or sleep event.
1628 */
1629 if (dtrace_ls_initialized == FALSE) {
1630 dtrace_ls_initialized = TRUE;
1631 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1632 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1633 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1634 if (dtrace_ls_enabled) {
1635 /*
1636 * Either sleeping or spinning is happening,
1637 * start a timing of our delay interval now.
1638 */
1639 readers_at_sleep = lock->lck_rw_shared_count;
1640 wait_interval = mach_absolute_time();
1641 }
1642 }
1643 #endif
1644
1645 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1646
1647 not_shared_or_upgrade = lck_rw_drain_status(lock, LCK_RW_SHARED_MASK | LCK_RW_WANT_UPGRADE, TRUE);
1648
1649 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, not_shared_or_upgrade, 0);
1650
1651 if (not_shared_or_upgrade) {
1652 break;
1653 }
1654 /*
1655 * if we get here, the deadline has expired w/o us
1656 * being able to grab the lock exclusively
1657 * check to see if we're allowed to do a thread_block
1658 */
1659 word.data = ordered_load_rw(lock);
1660 if (word.can_sleep) {
1661 istate = lck_interlock_lock(lock);
1662 word.data = ordered_load_rw(lock);
1663
1664 if (word.shared_count != 0 || word.want_upgrade) {
1665 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1666
1667 word.w_waiting = 1;
1668 ordered_store_rw(lock, word.data);
1669
1670 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
1671 res = assert_wait(LCK_RW_WRITER_EVENT(lock),
1672 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1673 lck_interlock_unlock(lock, istate);
1674
1675 if (res == THREAD_WAITING) {
1676 res = thread_block(THREAD_CONTINUE_NULL);
1677 slept++;
1678 }
1679 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1680 } else {
1681 lck_interlock_unlock(lock, istate);
1682 /*
1683 * must own the lock now, since we checked for
1684 * readers or upgrade owner behind the interlock
1685 * no need for a call to 'lck_rw_drain_status'
1686 */
1687 break;
1688 }
1689 }
1690 }
1691
1692 #if CONFIG_DTRACE
1693 /*
1694 * Decide what latencies we suffered that are Dtrace events.
1695 * If we have set wait_interval, then we either spun or slept.
1696 * At least we get out from under the interlock before we record
1697 * which is the best we can do here to minimize the impact
1698 * of the tracing.
1699 * If we have set wait_interval to -1, then dtrace was not enabled when we
1700 * started sleeping/spinning so we don't record this event.
1701 */
1702 if (dtrace_ls_enabled == TRUE) {
1703 if (slept == 0) {
1704 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_SPIN, lock,
1705 mach_absolute_time() - wait_interval, 1);
1706 } else {
1707 /*
1708 * For the blocking case, we also record if when we blocked
1709 * it was held for read or write, and how many readers.
1710 * Notice that above we recorded this before we dropped
1711 * the interlock so the count is accurate.
1712 */
1713 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_BLOCK, lock,
1714 mach_absolute_time() - wait_interval, 1,
1715 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1716 }
1717 }
1718 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, 1);
1719 #endif /* CONFIG_DTRACE */
1720 }
1721
1722 /*
1723 * Routine: lck_rw_done
1724 */
1725
1726 lck_rw_type_t
1727 lck_rw_done(lck_rw_t *lock)
1728 {
1729 uint32_t data, prev;
1730 boolean_t once = FALSE;
1731
1732 for (;;) {
1733 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_release_smp);
1734 if (data & LCK_RW_INTERLOCK) { /* wait for interlock to clear */
1735 atomic_exchange_abort();
1736 lck_rw_interlock_spin(lock);
1737 continue;
1738 }
1739 if (data & LCK_RW_SHARED_MASK) { /* lock is held shared */
1740 assertf(lock->lck_rw_owner == THREAD_NULL, "state=0x%x, owner=%p", lock->lck_rw_data, lock->lck_rw_owner);
1741 data -= LCK_RW_SHARED_READER;
1742 if ((data & LCK_RW_SHARED_MASK) == 0) { /* if reader count has now gone to 0, check for waiters */
1743 goto check_waiters;
1744 }
1745 } else { /* if reader count == 0, must be exclusive lock */
1746 if (data & LCK_RW_WANT_UPGRADE) {
1747 data &= ~(LCK_RW_WANT_UPGRADE);
1748 } else {
1749 if (data & LCK_RW_WANT_EXCL) {
1750 data &= ~(LCK_RW_WANT_EXCL);
1751 } else { /* lock is not 'owned', panic */
1752 panic("Releasing non-exclusive RW lock without a reader refcount!");
1753 }
1754 }
1755 if (!once) {
1756 // Only check for holder and clear it once
1757 assertf(lock->lck_rw_owner == current_thread(), "state=0x%x, owner=%p", lock->lck_rw_data, lock->lck_rw_owner);
1758 ordered_store_rw_owner(lock, THREAD_NULL);
1759 once = TRUE;
1760 }
1761 check_waiters:
1762 /*
1763 * test the original values to match what
1764 * lck_rw_done_gen is going to do to determine
1765 * which wakeups need to happen...
1766 *
1767 * if !(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting)
1768 */
1769 if (prev & LCK_RW_W_WAITING) {
1770 data &= ~(LCK_RW_W_WAITING);
1771 if ((prev & LCK_RW_PRIV_EXCL) == 0) {
1772 data &= ~(LCK_RW_R_WAITING);
1773 }
1774 } else {
1775 data &= ~(LCK_RW_R_WAITING);
1776 }
1777 }
1778 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_release_smp)) {
1779 break;
1780 }
1781 cpu_pause();
1782 }
1783 return lck_rw_done_gen(lock, prev);
1784 }
1785
1786 /*
1787 * Routine: lck_rw_done_gen
1788 *
1789 * called from lck_rw_done() above...
1790 * prior_lock_state is the value in the 1st
1791 * word of the lock at the time of a successful
1792 * atomic compare and exchange with the new value...
1793 * it represents the state of the lock before we
1794 * decremented the rw_shared_count or cleared either
1795 * rw_want_upgrade or rw_want_excl and
1796 * the lck_x_waiting bits... since the calling
1797 * routine has already changed the state atomically,
1798 * we just need to decide if we should
1799 * wake up anyone and what value to return... we do
1800 * this by examining the state of the lock before
1801 * we changed it
1802 */
1803 static lck_rw_type_t
1804 lck_rw_done_gen(
1805 lck_rw_t *lck,
1806 uint32_t prior_lock_state)
1807 {
1808 lck_rw_word_t fake_lck;
1809 lck_rw_type_t lock_type;
1810 thread_t thread;
1811 uint32_t rwlock_count;
1812
1813 /*
1814 * prior_lock_state is a snapshot of the 1st word of the
1815 * lock in question... we fake up a lck_rw_word_t from it
1816 * and carefully do not access anything beyond what's defined
1817 * in the first word of a lck_rw_t
1818 */
1819 fake_lck.data = prior_lock_state;
1820
1821 if (fake_lck.shared_count <= 1) {
1822 if (fake_lck.w_waiting) {
1823 thread_wakeup(LCK_RW_WRITER_EVENT(lck));
1824 }
1825
1826 if (!(fake_lck.priv_excl && fake_lck.w_waiting) && fake_lck.r_waiting) {
1827 thread_wakeup(LCK_RW_READER_EVENT(lck));
1828 }
1829 }
1830 if (fake_lck.shared_count) {
1831 lock_type = LCK_RW_TYPE_SHARED;
1832 } else {
1833 lock_type = LCK_RW_TYPE_EXCLUSIVE;
1834 }
1835
1836 /* Check if dropping the lock means that we need to unpromote */
1837 thread = current_thread();
1838 if (fake_lck.can_sleep) {
1839 rwlock_count = thread->rwlock_count--;
1840 } else {
1841 rwlock_count = UINT32_MAX;
1842 }
1843 #if MACH_LDEBUG
1844 if (rwlock_count == 0) {
1845 panic("rw lock count underflow for thread %p", thread);
1846 }
1847 #endif
1848 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1849 /* sched_flags checked without lock, but will be rechecked while clearing */
1850 lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
1851 }
1852 #if CONFIG_DTRACE
1853 LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
1854 #endif
1855 return lock_type;
1856 }
1857
1858 /*
1859 * Routine: lck_rw_lock_shared_gen
1860 * Function:
1861 * Fast path code has determined that this lock
1862 * is held exclusively... this is where we spin/block
1863 * until we can acquire the lock in the shared mode
1864 */
1865 static void
1866 lck_rw_lock_shared_gen(
1867 lck_rw_t *lck)
1868 {
1869 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1870 lck_rw_word_t word;
1871 boolean_t gotlock = 0;
1872 int slept = 0;
1873 wait_result_t res = 0;
1874 boolean_t istate;
1875
1876 #if CONFIG_DTRACE
1877 uint64_t wait_interval = 0;
1878 int readers_at_sleep = 0;
1879 boolean_t dtrace_ls_initialized = FALSE;
1880 boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
1881 #endif /* CONFIG_DTRACE */
1882
1883 while (!lck_rw_grab(lck, LCK_RW_GRAB_SHARED, FALSE)) {
1884 #if CONFIG_DTRACE
1885 if (dtrace_ls_initialized == FALSE) {
1886 dtrace_ls_initialized = TRUE;
1887 dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
1888 dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
1889 dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
1890 if (dtrace_ls_enabled) {
1891 /*
1892 * Either sleeping or spinning is happening,
1893 * start timing our delay interval now.
1894 */
1895 readers_at_sleep = lck->lck_rw_shared_count;
1896 wait_interval = mach_absolute_time();
1897 }
1898 }
1899 #endif
1900
1901 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
1902 trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, 0, 0);
1903
1904 gotlock = lck_rw_grab(lck, LCK_RW_GRAB_SHARED, TRUE);
1905
1906 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
1907 trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, gotlock, 0);
1908
1909 if (gotlock) {
1910 break;
1911 }
1912 /*
1913 * if we get here, the deadline has expired w/o us
1914 * being able to grab the lock for read;
1915 * check to see whether we're allowed to do a thread_block
1916 */
1917 if (lck->lck_rw_can_sleep) {
1918 istate = lck_interlock_lock(lck);
1919
1920 word.data = ordered_load_rw(lck);
1921 if ((word.want_excl || word.want_upgrade) &&
1922 ((word.shared_count == 0) || word.priv_excl)) {
1923 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
1924 trace_lck, word.want_excl, word.want_upgrade, 0, 0);
1925
1926 word.r_waiting = 1;
1927 ordered_store_rw(lck, word.data);
1928
1929 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead);
1930 res = assert_wait(LCK_RW_READER_EVENT(lck),
1931 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1932 lck_interlock_unlock(lck, istate);
1933
1934 if (res == THREAD_WAITING) {
1935 res = thread_block(THREAD_CONTINUE_NULL);
1936 slept++;
1937 }
1938 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
1939 trace_lck, res, slept, 0, 0);
1940 } else {
1941 word.shared_count++;
1942 ordered_store_rw(lck, word.data);
1943 lck_interlock_unlock(lck, istate);
1944 break;
1945 }
1946 }
1947 }
1948
1949 #if CONFIG_DTRACE
1950 if (dtrace_ls_enabled == TRUE) {
1951 if (slept == 0) {
1952 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1953 } else {
1954 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
1955 mach_absolute_time() - wait_interval, 0,
1956 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1957 }
1958 }
1959 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
1960 #endif /* CONFIG_DTRACE */
1961 }
1962
1963 /*
1964 * Verifying thread ownership for exclusive locks is required because of
1965 * PPL usage
1966 */
1967 void
1968 lck_rw_assert(
1969 lck_rw_t *lck,
1970 unsigned int type)
1971 {
1972 switch (type) {
1973 case LCK_RW_ASSERT_SHARED:
1974 if ((lck->lck_rw_shared_count != 0) &&
1975 (lck->lck_rw_owner == THREAD_NULL)) {
1976 return;
1977 }
1978 break;
1979 case LCK_RW_ASSERT_EXCLUSIVE:
1980 if ((lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
1981 (lck->lck_rw_shared_count == 0) &&
1982 (lck->lck_rw_owner == current_thread())) {
1983 return;
1984 }
1985 break;
1986 case LCK_RW_ASSERT_HELD:
1987 if (lck->lck_rw_shared_count != 0) {
1988 return; // Held shared
1989 }
1990 if ((lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
1991 (lck->lck_rw_owner == current_thread())) {
1992 return; // Held exclusive
1993 }
1994 break;
1995 case LCK_RW_ASSERT_NOTHELD:
1996 if ((lck->lck_rw_shared_count == 0) &&
1997 !(lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
1998 (lck->lck_rw_owner == THREAD_NULL)) {
1999 return;
2000 }
2001 break;
2002 default:
2003 break;
2004 }
2005 panic("rw lock (%p)%s held (mode=%u)", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type);
2006 }
2007
2008
2009 /*
2010 * Routine: kdp_lck_rw_lock_is_acquired_exclusive
2011 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
2012 */
2013 boolean_t
2014 kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck)
2015 {
2016 if (not_in_kdp) {
2017 panic("panic: rw lock exclusive check done outside of kernel debugger");
2018 }
2019 return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_excl) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
2020 }
2021
2022 /*
2023 * The C portion of the mutex package. These routines are only invoked
2024 * if the optimized assembler routines can't do the work.
2025 */
2026
2027 /*
2028 * Forward declaration
2029 */
2030
2031 void
2032 lck_mtx_ext_init(
2033 lck_mtx_ext_t * lck,
2034 lck_grp_t * grp,
2035 lck_attr_t * attr);
2036
2037 /*
2038 * Routine: lck_mtx_alloc_init
2039 */
2040 lck_mtx_t *
2041 lck_mtx_alloc_init(
2042 lck_grp_t * grp,
2043 lck_attr_t * attr)
2044 {
2045 lck_mtx_t *lck;
2046
2047 lck = zalloc(ZV_LCK_MTX);
2048 lck_mtx_init(lck, grp, attr);
2049 return lck;
2050 }
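/*
 * Typical usage (illustrative sketch; `my_grp` stands for a lock group the
 * caller has already created):
 *
 *	lck_mtx_t *m = lck_mtx_alloc_init(my_grp, LCK_ATTR_NULL);
 *	lck_mtx_lock(m);
 *	...critical section...
 *	lck_mtx_unlock(m);
 *	lck_mtx_free(m, my_grp);
 */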
2051
2052 /*
2053 * Routine: lck_mtx_free
2054 */
2055 void
2056 lck_mtx_free(
2057 lck_mtx_t * lck,
2058 lck_grp_t * grp)
2059 {
2060 lck_mtx_destroy(lck, grp);
2061 zfree(ZV_LCK_MTX, lck);
2062 }
2063
2064 /*
2065 * Routine: lck_mtx_init
2066 */
2067 void
2068 lck_mtx_init(
2069 lck_mtx_t * lck,
2070 lck_grp_t * grp,
2071 lck_attr_t * attr)
2072 {
2073 #ifdef BER_XXX
2074 lck_mtx_ext_t *lck_ext;
2075 #endif
2076 lck_attr_t *lck_attr;
2077
2078 if (attr != LCK_ATTR_NULL) {
2079 lck_attr = attr;
2080 } else {
2081 lck_attr = &LockDefaultLckAttr;
2082 }
2083
2084 #ifdef BER_XXX
2085 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2086 lck_ext = zalloc(ZV_LCK_MTX_EXT);
2087 lck_mtx_ext_init(lck_ext, grp, lck_attr);
2088 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2089 lck->lck_mtx_ptr = lck_ext;
2090 lck->lck_mtx_type = LCK_MTX_TYPE;
2091 } else
2092 #endif
2093 {
2094 lck->lck_mtx_ptr = NULL; // Clear any padding in the union fields below
2095 lck->lck_mtx_waiters = 0;
2096 lck->lck_mtx_type = LCK_MTX_TYPE;
2097 ordered_store_mtx(lck, 0);
2098 }
2099 lck_grp_reference(grp);
2100 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2101 }
2102
2103 /*
2104 * Routine: lck_mtx_init_ext
2105 */
2106 void
2107 lck_mtx_init_ext(
2108 lck_mtx_t * lck,
2109 lck_mtx_ext_t * lck_ext,
2110 lck_grp_t * grp,
2111 lck_attr_t * attr)
2112 {
2113 lck_attr_t *lck_attr;
2114
2115 if (attr != LCK_ATTR_NULL) {
2116 lck_attr = attr;
2117 } else {
2118 lck_attr = &LockDefaultLckAttr;
2119 }
2120
2121 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2122 lck_mtx_ext_init(lck_ext, grp, lck_attr);
2123 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2124 lck->lck_mtx_ptr = lck_ext;
2125 lck->lck_mtx_type = LCK_MTX_TYPE;
2126 } else {
2127 lck->lck_mtx_waiters = 0;
2128 lck->lck_mtx_type = LCK_MTX_TYPE;
2129 ordered_store_mtx(lck, 0);
2130 }
2131 lck_grp_reference(grp);
2132 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2133 }
2134
2135 /*
2136 * Routine: lck_mtx_ext_init
2137 */
2138 void
2139 lck_mtx_ext_init(
2140 lck_mtx_ext_t * lck,
2141 lck_grp_t * grp,
2142 lck_attr_t * attr)
2143 {
2144 bzero((void *) lck, sizeof(lck_mtx_ext_t));
2145
2146 lck->lck_mtx.lck_mtx_type = LCK_MTX_TYPE;
2147
2148 if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2149 lck->lck_mtx_deb.type = MUTEX_TAG;
2150 lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
2151 }
2152 lck->lck_mtx_grp = grp;
2153
2154 if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT) {
2155 lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
2156 }
2157 }
2158
2159 /* The slow versions */
2160 static void lck_mtx_lock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked);
2161 static boolean_t lck_mtx_try_lock_contended(lck_mtx_t *lock, thread_t thread);
2162 static void lck_mtx_unlock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked);
2163
2164 /* The adaptive spin function */
2165 static spinwait_result_t lck_mtx_lock_contended_spinwait_arm(lck_mtx_t *lock, thread_t thread, boolean_t interlocked);
2166
2167 /*
2168 * Routine: lck_mtx_verify
2169 *
2170 * Verify if a mutex is valid
2171 */
2172 static inline void
2173 lck_mtx_verify(lck_mtx_t *lock)
2174 {
2175 if (lock->lck_mtx_type != LCK_MTX_TYPE) {
2176 panic("Invalid mutex %p", lock);
2177 }
2178 #if DEVELOPMENT || DEBUG
2179 if (lock->lck_mtx_tag == LCK_MTX_TAG_DESTROYED) {
2180 panic("Mutex destroyed %p", lock);
2181 }
2182 #endif /* DEVELOPMENT || DEBUG */
2183 }
2184
2185 /*
2186 * Routine: lck_mtx_check_preemption
2187 *
2188 * Verify preemption is enabled when attempting to acquire a mutex.
2189 */
2190
2191 static inline void
2192 lck_mtx_check_preemption(lck_mtx_t *lock)
2193 {
2194 #if DEVELOPMENT || DEBUG
2195 if (current_cpu_datap()->cpu_hibernate) {
2196 return;
2197 }
2198
2199 int pl = get_preemption_level();
2200
2201 if (pl != 0) {
2202 panic("Attempt to take mutex with preemption disabled. Lock=%p, level=%d", lock, pl);
2203 }
2204 #else
2205 (void)lock;
2206 #endif
2207 }
2208
2209 /*
2210 * Routine: lck_mtx_lock
2211 */
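/*
 * Fast path note: an unowned, uncontended mutex has lck_mtx_data == 0, so a
 * single acquire cmpxchg installing LCK_MTX_THREAD_TO_STATE(thread) takes the
 * lock; any owner, waiter or interlock bit forces the contended path.
 */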
2212 void
2213 lck_mtx_lock(lck_mtx_t *lock)
2214 {
2215 thread_t thread;
2216
2217 lck_mtx_verify(lock);
2218 lck_mtx_check_preemption(lock);
2219 thread = current_thread();
2220 if (os_atomic_cmpxchg(&lock->lck_mtx_data,
2221 0, LCK_MTX_THREAD_TO_STATE(thread), acquire)) {
2222 #if CONFIG_DTRACE
2223 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_ACQUIRE, lock, 0);
2224 #endif /* CONFIG_DTRACE */
2225 return;
2226 }
2227 lck_mtx_lock_contended(lock, thread, FALSE);
2228 }
2229
2230 /*
2231 * This is the slow version of mutex locking.
2232 */
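/*
 * Note on turnstiles: lck_mtx_lock_wait() may hand back a turnstile in `ts`
 * while blocking; it is completed either inside lck_mtx_lock_acquire() or
 * explicitly below, and turnstile_cleanup() runs only after the interlock
 * has been released.
 */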
2233 static void NOINLINE
2234 lck_mtx_lock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked)
2235 {
2236 thread_t holding_thread;
2237 uintptr_t state;
2238 int waiters = 0;
2239 spinwait_result_t sw_res;
2240 struct turnstile *ts = NULL;
2241
2242 /* Loop waiting until I see that the mutex is unowned */
2243 for (;;) {
2244 sw_res = lck_mtx_lock_contended_spinwait_arm(lock, thread, interlocked);
2245 interlocked = FALSE;
2246
2247 switch (sw_res) {
2248 case SPINWAIT_ACQUIRED:
2249 if (ts != NULL) {
2250 interlock_lock(lock);
2251 turnstile_complete((uintptr_t)lock, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
2252 interlock_unlock(lock);
2253 }
2254 goto done;
2255 case SPINWAIT_INTERLOCK:
2256 goto set_owner;
2257 default:
2258 break;
2259 }
2260
2261 state = ordered_load_mtx(lock);
2262 holding_thread = LCK_MTX_STATE_TO_THREAD(state);
2263 if (holding_thread == NULL) {
2264 break;
2265 }
2266 ordered_store_mtx(lock, (state | LCK_ILOCK | ARM_LCK_WAITERS)); // Set waiters bit and wait
2267 lck_mtx_lock_wait(lock, holding_thread, &ts);
2268 /* returns interlock unlocked */
2269 }
2270
2271 set_owner:
2272 /* Hooray, I'm the new owner! */
2273 state = ordered_load_mtx(lock);
2274
2275 if (state & ARM_LCK_WAITERS) {
2276 /* Skip lck_mtx_lock_acquire if there are no waiters. */
2277 waiters = lck_mtx_lock_acquire(lock, ts);
2278 /*
2279 * lck_mtx_lock_acquire will call
2280 * turnstile_complete
2281 */
2282 } else {
2283 if (ts != NULL) {
2284 turnstile_complete((uintptr_t)lock, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
2285 }
2286 }
2287
2288 state = LCK_MTX_THREAD_TO_STATE(thread);
2289 if (waiters != 0) {
2290 state |= ARM_LCK_WAITERS;
2291 }
2292 state |= LCK_ILOCK; // Preserve interlock
2293 ordered_store_mtx(lock, state); // Set ownership
2294 interlock_unlock(lock); // Release interlock, enable preemption
2295
2296 done:
2297 load_memory_barrier();
2298
2299 assert(thread->turnstile != NULL);
2300
2301 if (ts != NULL) {
2302 turnstile_cleanup();
2303 }
2304
2305 #if CONFIG_DTRACE
2306 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_ACQUIRE, lock, 0);
2307 #endif /* CONFIG_DTRACE */
2308 }
2309
2310 /*
2311 * Routine: lck_mtx_lock_contended_spinwait_arm
2312 *
2313 * Invoked when trying to acquire a mutex while there is contention and
2314 * the holder is running on another processor. We spin for up to a maximum
2315 * time waiting for the lock to be released.
2316 */
2317 static spinwait_result_t
2318 lck_mtx_lock_contended_spinwait_arm(lck_mtx_t *lock, thread_t thread, boolean_t interlocked)
2319 {
2320 int has_interlock = (int)interlocked;
2321 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
2322 thread_t owner, prev_owner;
2323 uint64_t window_deadline, sliding_deadline, high_deadline;
2324 uint64_t start_time, cur_time, avg_hold_time, bias, delta;
2325 int loopcount = 0;
2326 uint i, prev_owner_cpu;
2327 int total_hold_time_samples, window_hold_time_samples, unfairness;
2328 bool owner_on_core, adjust;
2329 uintptr_t state, new_state, waiters;
2330 spinwait_result_t retval = SPINWAIT_DID_SPIN_HIGH_THR;
2331
2332 if (__improbable(!(lck_mtx_adaptive_spin_mode & ADAPTIVE_SPIN_ENABLE))) {
2333 if (!has_interlock) {
2334 interlock_lock(lock);
2335 }
2336
2337 return SPINWAIT_DID_NOT_SPIN;
2338 }
2339
/* Snapshot the lock state so the trace point below reports a valid owner */
state = ordered_load_mtx(lock);

2340 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
2341 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(LCK_MTX_STATE_TO_THREAD(state)), lock->lck_mtx_waiters, 0, 0);
2342
2343 start_time = mach_absolute_time();
2344 /*
2345 * window_deadline represents the "learning" phase.
2346 * The thread collects statistics about the lock during
2347 * this window and then decides whether to spin more
2348 * or block according to the concurrency behavior
2349 * observed.
2350 *
2351 * Every thread can spin at least low_MutexSpin.
2352 */
2353 window_deadline = start_time + low_MutexSpin;
2354 /*
2355 * Sliding_deadline is the adjusted spin deadline
2356 * computed after the "learning" phase.
2357 */
2358 sliding_deadline = window_deadline;
2359 /*
2360 * High_deadline is a hard deadline. No thread
2361 * can spin more than this deadline.
2362 */
2363 if (high_MutexSpin >= 0) {
2364 high_deadline = start_time + high_MutexSpin;
2365 } else {
2366 high_deadline = start_time + low_MutexSpin * real_ncpus;
2367 }
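/*
 * In short: window_deadline bounds the current sampling window,
 * sliding_deadline is the adaptive stop time recomputed from the observed
 * hold times, and high_deadline is the absolute cap on spinning.
 */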
2368
2369 /*
2370 * We do not yet know which cpu the owner is running on.
2371 * Initialize prev_owner_cpu with the next cpu.
2372 */
2373 prev_owner_cpu = (cpu_number() + 1) % real_ncpus;
2374 total_hold_time_samples = 0;
2375 window_hold_time_samples = 0;
2376 avg_hold_time = 0;
2377 adjust = TRUE;
2378 bias = (os_hash_kernel_pointer(lock) + cpu_number()) % real_ncpus;
2379
2380 /* Snoop the lock state */
2381 state = ordered_load_mtx(lock);
2382 owner = LCK_MTX_STATE_TO_THREAD(state);
2383 prev_owner = owner;
2384
2385 if (has_interlock) {
2386 if (owner == NULL) {
2387 retval = SPINWAIT_INTERLOCK;
2388 goto done_spinning;
2389 } else {
2390 /*
2391 * We are holding the interlock, so
2392 * we can safely dereference owner.
2393 */
2394 if (!machine_thread_on_core(owner) || (owner->state & TH_IDLE)) {
2395 retval = SPINWAIT_DID_NOT_SPIN;
2396 goto done_spinning;
2397 }
2398 }
2399 interlock_unlock(lock);
2400 has_interlock = 0;
2401 }
2402
2403 /*
2404 * Spin while:
2405 * - mutex is locked, and
2406 * - it's locked as a spin lock, and
2407 * - owner is running on another processor, and
2408 * - we haven't spun for long enough.
2409 */
2410 do {
2411 /*
2412 * Try to acquire the lock.
2413 */
2414 owner = LCK_MTX_STATE_TO_THREAD(state);
2415 if (owner == NULL) {
2416 waiters = state & ARM_LCK_WAITERS;
2417 if (waiters) {
2418 /*
2419 * preserve the waiter bit
2420 * and try to acquire the interlock.
2421 * Note: we will successfully acquire
2422 * the interlock only if we can also
2423 * acquire the lock.
2424 */
2425 new_state = ARM_LCK_WAITERS | LCK_ILOCK;
2426 has_interlock = 1;
2427 retval = SPINWAIT_INTERLOCK;
2428 disable_preemption();
2429 } else {
2430 new_state = LCK_MTX_THREAD_TO_STATE(thread);
2431 retval = SPINWAIT_ACQUIRED;
2432 }
2433
2434 /*
2435 * The cmpxchg will succeed only if the lock
2436 * is not owned (doesn't have an owner set) and it is
2437 * not interlocked; the expected value carries only the
2438 * waiter bit, so waiters alone will not make it fail.
2439 */
2440 if (os_atomic_cmpxchgv(&lock->lck_mtx_data,
2441 waiters, new_state, &state, acquire)) {
2442 goto done_spinning;
2443 } else {
2444 if (waiters) {
2445 has_interlock = 0;
2446 enable_preemption();
2447 }
2448 }
2449 }
2450
2451 cur_time = mach_absolute_time();
2452
2453 /*
2454 * Never spin past high_deadline.
2455 */
2456 if (cur_time >= high_deadline) {
2457 retval = SPINWAIT_DID_SPIN_HIGH_THR;
2458 break;
2459 }
2460
2461 /*
2462 * Check if the owner is on core. If not, block.
2463 */
2464 owner = LCK_MTX_STATE_TO_THREAD(state);
2465 if (owner) {
2466 i = prev_owner_cpu;
2467 owner_on_core = FALSE;
2468
2469 disable_preemption();
2470 state = ordered_load_mtx(lock);
2471 owner = LCK_MTX_STATE_TO_THREAD(state);
2472
2473 /*
2474 * For scalability we want to check if the owner is on core
2475 * without locking the mutex interlock.
2476 * If we do not lock the mutex interlock, the owner that we see might be
2477 * invalid, so we cannot dereference it. Therefore we cannot check
2478 * any field of the thread to tell us if it is on core.
2479 * Instead, check whether any other cpu's running thread matches the owner.
2480 */
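/*
 * Note that the scan below only compares cpu_active_thread pointers against
 * `owner`; the owner pointer itself is never dereferenced, so a stale value
 * is harmless here.
 */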
2481 if (owner) {
2482 do {
2483 cpu_data_t *cpu_data_ptr = CpuDataEntries[i].cpu_data_vaddr;
2484 if ((cpu_data_ptr != NULL) && (cpu_data_ptr->cpu_active_thread == owner)) {
2485 owner_on_core = TRUE;
2486 break;
2487 }
2488 if (++i >= real_ncpus) {
2489 i = 0;
2490 }
2491 } while (i != prev_owner_cpu);
2492 enable_preemption();
2493
2494 if (owner_on_core) {
2495 prev_owner_cpu = i;
2496 } else {
2497 prev_owner = owner;
2498 state = ordered_load_mtx(lock);
2499 owner = LCK_MTX_STATE_TO_THREAD(state);
2500 if (owner == prev_owner) {
2501 /*
2502 * Owner is not on core.
2503 * Stop spinning.
2504 */
2505 if (loopcount == 0) {
2506 retval = SPINWAIT_DID_NOT_SPIN;
2507 } else {
2508 retval = SPINWAIT_DID_SPIN_OWNER_NOT_CORE;
2509 }
2510 break;
2511 }
2512 /*
2513 * Fall through if the owner changed while we were scanning.
2514 * The new owner could potentially be on core, so loop
2515 * again.
2516 */
2517 }
2518 } else {
2519 enable_preemption();
2520 }
2521 }
2522
2523 /*
2524 * Save how many times we see the owner changing.
2525 * We can roughly estimate the mutex hold
2526 * time and the fairness with that.
2527 */
2528 if (owner != prev_owner) {
2529 prev_owner = owner;
2530 total_hold_time_samples++;
2531 window_hold_time_samples++;
2532 }
2533
2534 /*
2535 * Learning window expired.
2536 * Try to adjust the sliding_deadline.
2537 */
2538 if (cur_time >= window_deadline) {
2539 /*
2540 * If there was no contention during the window,
2541 * stop spinning.
2542 */
2543 if (window_hold_time_samples < 1) {
2544 retval = SPINWAIT_DID_SPIN_NO_WINDOW_CONTENTION;
2545 break;
2546 }
2547
2548 if (adjust) {
2549 /*
2550 * For a fair lock, we'd wait for at most (NCPU-1) periods,
2551 * but the lock is unfair, so let's try to estimate by how much.
2552 */
2553 unfairness = total_hold_time_samples / real_ncpus;
2554
2555 if (unfairness == 0) {
2556 /*
2557 * We observed the owner changing `total_hold_time_samples` times which
2558 * let us estimate the average hold time of this mutex for the duration
2559 * of the spin time.
2560 * avg_hold_time = (cur_time - start_time) / total_hold_time_samples;
2561 *
2562 * In this case spin at max avg_hold_time * (real_ncpus - 1)
2563 */
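/*
 * Illustrative numbers: 2 owner changes observed over 10us of spinning on a
 * 4-cpu system give avg_hold_time ~= 5us, so sliding_deadline becomes
 * start_time + 15us, i.e. roughly 5us of additional spinning.
 */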
2564 delta = cur_time - start_time;
2565 sliding_deadline = start_time + (delta * (real_ncpus - 1)) / total_hold_time_samples;
2566 } else {
2567 /*
2568 * In this case at least one of the other cpus was able to get the lock twice
2569 * while I was spinning.
2570 * We could spin longer but it won't necessarily help if the system is unfair.
2571 * Try to randomize the wait to reduce contention.
2572 *
2573 * We compute how much time we could potentially spin
2574 * and distribute it over the cpus.
2575 *
2576 * bias is an integer between 0 and real_ncpus - 1.
2577 * distributed_increment = ((high_deadline - cur_time) / real_ncpus) * bias
2578 */
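/*
 * Illustrative numbers: with real_ncpus == 4 and bias == 2, the deadline
 * becomes cur_time + (high_deadline - cur_time) / 2.
 */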
2579 delta = high_deadline - cur_time;
2580 sliding_deadline = cur_time + ((delta * bias) / real_ncpus);
2581 adjust = FALSE;
2582 }
2583 }
2584
2585 window_deadline += low_MutexSpin;
2586 window_hold_time_samples = 0;
2587 }
2588
2589 /*
2590 * Stop spinning if we are past
2591 * the adjusted deadline.
2592 */
2593 if (cur_time >= sliding_deadline) {
2594 retval = SPINWAIT_DID_SPIN_SLIDING_THR;
2595 break;
2596 }
2597
2598 /*
2599 * We want to arm the monitor for wfe,
2600 * so load the lock exclusively.
2601 *
2602 * NOTE:
2603 * we rely on the fact that wfe will
2604 * eventually return even if the cache line
2605 * is not modified. This way we will keep
2606 * looping and checking whether the deadlines have expired.
2607 */
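/*
 * The exclusive load arms the local monitor; wait_for_event() then issues
 * the wfe, and atomic_exchange_abort() (presumably clearing the exclusive
 * reservation) is used when we skip the wait.
 */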
2608 state = os_atomic_load_exclusive(&lock->lck_mtx_data, relaxed);
2609 owner = LCK_MTX_STATE_TO_THREAD(state);
2610 if (owner != NULL) {
2611 wait_for_event();
2612 state = ordered_load_mtx(lock);
2613 } else {
2614 atomic_exchange_abort();
2615 }
2616
2617 loopcount++;
2618 } while (TRUE);
2619
2620 done_spinning:
2621 #if CONFIG_DTRACE
2622 /*
2623 * Note that we record a different probe id depending on whether
2624 * this is a direct or indirect mutex. This allows us to
2625 * penalize only lock groups that have debug/stats enabled
2626 * with dtrace processing if desired.
2627 */
2628 if (__probable(lock->lck_mtx_tag != LCK_MTX_TAG_INDIRECT)) {
2629 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, lock,
2630 mach_absolute_time() - start_time);
2631 } else {
2632 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, lock,
2633 mach_absolute_time() - start_time);
2634 }
2635 /* The lockstat acquire event is recorded by the caller. */
2636 #endif
2637
2638 state = ordered_load_mtx(lock);
2639
2640 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
2641 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(LCK_MTX_STATE_TO_THREAD(state)), lock->lck_mtx_waiters, retval, 0);
2642 if ((!has_interlock) && (retval != SPINWAIT_ACQUIRED)) {
2643 /* We must own either the lock or the interlock on return. */
2644 interlock_lock(lock);
2645 }
2646
2647 return retval;
2648 }
2649
2650
2651 /*
2652 * Common code for mutex locking as spinlock
2653 */
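/*
 * While held this way the owner field contains LCK_MTX_SPIN_TAG rather than
 * a thread pointer and the interlock bit stays set, so the owning thread is
 * not recorded (see lck_mtx_assert).
 */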
2654 static inline void
2655 lck_mtx_lock_spin_internal(lck_mtx_t *lock, boolean_t allow_held_as_mutex)
2656 {
2657 uintptr_t state;
2658
2659 interlock_lock(lock);
2660 state = ordered_load_mtx(lock);
2661 if (LCK_MTX_STATE_TO_THREAD(state)) {
2662 if (allow_held_as_mutex) {
2663 lck_mtx_lock_contended(lock, current_thread(), TRUE);
2664 } else {
2665 // "Always" variants can never block. If the lock is held and blocking is not allowed
2666 // then someone is mixing always and non-always calls on the same lock, which is
2667 // forbidden.
2668 panic("Attempting to block on a lock taken as spin-always %p", lock);
2669 }
2670 return;
2671 }
2672 state &= ARM_LCK_WAITERS; // Preserve waiters bit
2673 state |= (LCK_MTX_SPIN_TAG | LCK_ILOCK); // Add spin tag and maintain interlock
2674 ordered_store_mtx(lock, state);
2675 load_memory_barrier();
2676
2677 #if CONFIG_DTRACE
2678 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN_ACQUIRE, lock, 0);
2679 #endif /* CONFIG_DTRACE */
2680 }
2681
2682 /*
2683 * Routine: lck_mtx_lock_spin
2684 */
2685 void
2686 lck_mtx_lock_spin(lck_mtx_t *lock)
2687 {
2688 lck_mtx_check_preemption(lock);
2689 lck_mtx_lock_spin_internal(lock, TRUE);
2690 }
2691
2692 /*
2693 * Routine: lck_mtx_lock_spin_always
2694 */
2695 void
2696 lck_mtx_lock_spin_always(lck_mtx_t *lock)
2697 {
2698 lck_mtx_lock_spin_internal(lock, FALSE);
2699 }
2700
2701 /*
2702 * Routine: lck_mtx_try_lock
2703 */
2704 boolean_t
2705 lck_mtx_try_lock(lck_mtx_t *lock)
2706 {
2707 thread_t thread = current_thread();
2708
2709 lck_mtx_verify(lock);
2710 if (os_atomic_cmpxchg(&lock->lck_mtx_data,
2711 0, LCK_MTX_THREAD_TO_STATE(thread), acquire)) {
2712 #if CONFIG_DTRACE
2713 LOCKSTAT_RECORD(LS_LCK_MTX_TRY_LOCK_ACQUIRE, lock, 0);
2714 #endif /* CONFIG_DTRACE */
2715 return TRUE;
2716 }
2717 return lck_mtx_try_lock_contended(lock, thread);
2718 }
2719
2720 static boolean_t NOINLINE
2721 lck_mtx_try_lock_contended(lck_mtx_t *lock, thread_t thread)
2722 {
2723 thread_t holding_thread;
2724 uintptr_t state;
2725 int waiters;
2726
2727 interlock_lock(lock);
2728 state = ordered_load_mtx(lock);
2729 holding_thread = LCK_MTX_STATE_TO_THREAD(state);
2730 if (holding_thread) {
2731 interlock_unlock(lock);
2732 return FALSE;
2733 }
2734 waiters = lck_mtx_lock_acquire(lock, NULL);
2735 state = LCK_MTX_THREAD_TO_STATE(thread);
2736 if (waiters != 0) {
2737 state |= ARM_LCK_WAITERS;
2738 }
2739 state |= LCK_ILOCK; // Preserve interlock
2740 ordered_store_mtx(lock, state); // Set ownership
2741 interlock_unlock(lock); // Release interlock, enable preemption
2742 load_memory_barrier();
2743
2744 turnstile_cleanup();
2745
2746 return TRUE;
2747 }
2748
2749 static inline boolean_t
2750 lck_mtx_try_lock_spin_internal(lck_mtx_t *lock, boolean_t allow_held_as_mutex)
2751 {
2752 uintptr_t state;
2753
2754 if (!interlock_try(lock)) {
2755 return FALSE;
2756 }
2757 state = ordered_load_mtx(lock);
2758 if (LCK_MTX_STATE_TO_THREAD(state)) {
2759 // Lock is held as mutex
2760 if (allow_held_as_mutex) {
2761 interlock_unlock(lock);
2762 } else {
2763 // "Always" variants can never block. If the lock is held as a normal mutex
2764 // then someone is mixing always and non-always calls on the same lock, which is
2765 // forbidden.
2766 panic("Spin-mutex held as full mutex %p", lock);
2767 }
2768 return FALSE;
2769 }
2770 state &= ARM_LCK_WAITERS; // Preserve waiters bit
2771 state |= (LCK_MTX_SPIN_TAG | LCK_ILOCK); // Add spin tag and maintain interlock
2772 ordered_store_mtx(lock, state);
2773 load_memory_barrier();
2774
2775 #if CONFIG_DTRACE
2776 LOCKSTAT_RECORD(LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE, lock, 0);
2777 #endif /* CONFIG_DTRACE */
2778 return TRUE;
2779 }
2780
2781 /*
2782 * Routine: lck_mtx_try_lock_spin
2783 */
2784 boolean_t
2785 lck_mtx_try_lock_spin(lck_mtx_t *lock)
2786 {
2787 return lck_mtx_try_lock_spin_internal(lock, TRUE);
2788 }
2789
2790 /*
2791 * Routine: lck_mtx_try_lock_spin_always
2792 */
2793 boolean_t
2794 lck_mtx_try_lock_spin_always(lck_mtx_t *lock)
2795 {
2796 return lck_mtx_try_lock_spin_internal(lock, FALSE);
2797 }
2798
2799
2800
2801 /*
2802 * Routine: lck_mtx_unlock
2803 */
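/*
 * Fast path: a release cmpxchg from LCK_MTX_THREAD_TO_STATE(thread) back to
 * 0 succeeds only when no waiter or interlock bits are set; otherwise we
 * fall into the contended path.
 */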
2804 void
2805 lck_mtx_unlock(lck_mtx_t *lock)
2806 {
2807 thread_t thread = current_thread();
2808 uintptr_t state;
2809 boolean_t ilk_held = FALSE;
2810
2811 lck_mtx_verify(lock);
2812
2813 state = ordered_load_mtx(lock);
2814 if (state & LCK_ILOCK) {
2815 if (LCK_MTX_STATE_TO_THREAD(state) == (thread_t)LCK_MTX_SPIN_TAG) {
2816 ilk_held = TRUE; // Interlock is held by (presumably) this thread
2817 }
2818 goto slow_case;
2819 }
2820 // Locked as a mutex
2821 if (os_atomic_cmpxchg(&lock->lck_mtx_data,
2822 LCK_MTX_THREAD_TO_STATE(thread), 0, release)) {
2823 #if CONFIG_DTRACE
2824 LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE, lock, 0);
2825 #endif /* CONFIG_DTRACE */
2826 return;
2827 }
2828 slow_case:
2829 lck_mtx_unlock_contended(lock, thread, ilk_held);
2830 }
2831
2832 static void NOINLINE
2833 lck_mtx_unlock_contended(lck_mtx_t *lock, thread_t thread, boolean_t ilk_held)
2834 {
2835 uintptr_t state;
2836 boolean_t cleanup = FALSE;
2837
2838 if (ilk_held) {
2839 state = ordered_load_mtx(lock);
2840 } else {
2841 interlock_lock(lock);
2842 state = ordered_load_mtx(lock);
2843 if (thread != LCK_MTX_STATE_TO_THREAD(state)) {
2844 panic("lck_mtx_unlock(): Attempt to release lock not owned by thread (%p)", lock);
2845 }
2846 if (state & ARM_LCK_WAITERS) {
2847 if (lck_mtx_unlock_wakeup(lock, thread)) {
2848 state = ARM_LCK_WAITERS;
2849 } else {
2850 state = 0;
2851 }
2852 cleanup = TRUE;
2853 goto unlock;
2854 }
2855 }
2856 state &= ARM_LCK_WAITERS; /* Clear state, retain waiters bit */
2857 unlock:
2858 state |= LCK_ILOCK;
2859 ordered_store_mtx(lock, state);
2860 interlock_unlock(lock);
2861 if (cleanup) {
2862 /*
2863 * Do not do any turnstile operations outside of this block.
2864 * lock/unlock is called at an early stage of boot with a single thread,
2865 * when turnstiles are not yet initialized.
2866 * Even without contention we can come through the slow path
2867 * if the mutex is acquired as a spin lock.
2868 */
2869 turnstile_cleanup();
2870 }
2871
2872 #if CONFIG_DTRACE
2873 LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE, lock, 0);
2874 #endif /* CONFIG_DTRACE */
2875 }
2876
2877 /*
2878 * Routine: lck_mtx_assert
2879 */
2880 void
2881 lck_mtx_assert(lck_mtx_t *lock, unsigned int type)
2882 {
2883 thread_t thread, holder;
2884 uintptr_t state;
2885
2886 state = ordered_load_mtx(lock);
2887 holder = LCK_MTX_STATE_TO_THREAD(state);
2888 if (holder == (thread_t)LCK_MTX_SPIN_TAG) {
2889 // Lock is held in spin mode, owner is unknown.
2890 return; // Punt
2891 }
2892 thread = current_thread();
2893 if (type == LCK_MTX_ASSERT_OWNED) {
2894 if (thread != holder) {
2895 panic("lck_mtx_assert(): mutex (%p) owned", lock);
2896 }
2897 } else if (type == LCK_MTX_ASSERT_NOTOWNED) {
2898 if (thread == holder) {
2899 panic("lck_mtx_assert(): mutex (%p) not owned", lock);
2900 }
2901 } else {
2902 panic("lck_mtx_assert(): invalid arg (%u)", type);
2903 }
2904 }
2905
2906 /*
2907 * Routine: lck_mtx_ilk_unlock
2908 */
2909 boolean_t
2910 lck_mtx_ilk_unlock(lck_mtx_t *lock)
2911 {
2912 interlock_unlock(lock);
2913 return TRUE;
2914 }
2915
2916 /*
2917 * Routine: lck_mtx_convert_spin
2918 *
2919 * Convert a mutex held for spin into a held full mutex
2920 */
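/*
 * The interlock remains held across the conversion: the spin tag is cleared,
 * ownership is set to the calling thread, and only then is the interlock
 * released.
 */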
2921 void
2922 lck_mtx_convert_spin(lck_mtx_t *lock)
2923 {
2924 thread_t thread = current_thread();
2925 uintptr_t state;
2926 int waiters;
2927
2928 state = ordered_load_mtx(lock);
2929 if (LCK_MTX_STATE_TO_THREAD(state) == thread) {
2930 return; // Already owned as mutex, return
2931 }
2932 if ((state & LCK_ILOCK) == 0 || (LCK_MTX_STATE_TO_THREAD(state) != (thread_t)LCK_MTX_SPIN_TAG)) {
2933 panic("lck_mtx_convert_spin: Not held as spinlock (%p)", lock);
2934 }
2935 state &= ~(LCK_MTX_THREAD_MASK); // Clear the spin tag
2936 ordered_store_mtx(lock, state);
2937 waiters = lck_mtx_lock_acquire(lock, NULL); // Acquire to manage priority boosts
2938 state = LCK_MTX_THREAD_TO_STATE(thread);
2939 if (waiters != 0) {
2940 state |= ARM_LCK_WAITERS;
2941 }
2942 state |= LCK_ILOCK;
2943 ordered_store_mtx(lock, state); // Set ownership
2944 interlock_unlock(lock); // Release interlock, enable preemption
2945 turnstile_cleanup();
2946 }
2947
2948
2949 /*
2950 * Routine: lck_mtx_destroy
2951 */
2952 void
2953 lck_mtx_destroy(
2954 lck_mtx_t * lck,
2955 lck_grp_t * grp)
2956 {
2957 if (lck->lck_mtx_type != LCK_MTX_TYPE) {
2958 panic("Destroying invalid mutex %p", lck);
2959 }
2960 if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED) {
2961 panic("Destroying previously destroyed lock %p", lck);
2962 }
2963 lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
2964 lck->lck_mtx_tag = LCK_MTX_TAG_DESTROYED;
2965 lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
2966 lck_grp_deallocate(grp);
2967 return;
2968 }
2969
2970 /*
2971 * Routine: lck_spin_assert
2972 */
2973 void
2974 lck_spin_assert(lck_spin_t *lock, unsigned int type)
2975 {
2976 thread_t thread, holder;
2977 uintptr_t state;
2978
2979 if (lock->type != LCK_SPIN_TYPE) {
2980 panic("Invalid spinlock %p", lock);
2981 }
2982
2983 state = lock->lck_spin_data;
2984 holder = (thread_t)(state & ~LCK_ILOCK);
2985 thread = current_thread();
2986 if (type == LCK_ASSERT_OWNED) {
2987 if (holder == 0) {
2988 panic("Lock not owned %p = %lx", lock, state);
2989 }
2990 if (holder != thread) {
2991 panic("Lock not owned by current thread %p = %lx", lock, state);
2992 }
2993 if ((state & LCK_ILOCK) == 0) {
2994 panic("Lock bit not set %p = %lx", lock, state);
2995 }
2996 } else if (type == LCK_ASSERT_NOTOWNED) {
2997 if (holder != 0) {
2998 if (holder == thread) {
2999 panic("Lock owned by current thread %p = %lx", lock, state);
3000 }
3001 }
3002 } else {
3003 panic("lck_spin_assert(): invalid arg (%u)", type);
3004 }
3005 }
3006
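/*
 * Yield a held shared lock if a writer is waiting (or if force_yield is set):
 * drop the lock, pause briefly, re-take it shared, and return TRUE to
 * indicate that we yielded.
 */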
3007 boolean_t
3008 lck_rw_lock_yield_shared(lck_rw_t *lck, boolean_t force_yield)
3009 {
3010 lck_rw_word_t word;
3011
3012 lck_rw_assert(lck, LCK_RW_ASSERT_SHARED);
3013
3014 word.data = ordered_load_rw(lck);
3015 if (word.want_excl || word.want_upgrade || force_yield) {
3016 lck_rw_unlock_shared(lck);
3017 mutex_pause(2);
3018 lck_rw_lock_shared(lck);
3019 return TRUE;
3020 }
3021
3022 return FALSE;
3023 }
3024
3025 /*
3026 * Routine: kdp_lck_mtx_lock_spin_is_acquired
3027 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
3028 */
3029 boolean_t
3030 kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
3031 {
3032 uintptr_t state;
3033
3034 if (not_in_kdp) {
3035 panic("panic: spinlock acquired check done outside of kernel debugger");
3036 }
3037 state = ordered_load_mtx(lck);
3038 if (state == LCK_MTX_TAG_DESTROYED) {
3039 return FALSE;
3040 }
3041 if (LCK_MTX_STATE_TO_THREAD(state) || (state & LCK_ILOCK)) {
3042 return TRUE;
3043 }
3044 return FALSE;
3045 }
3046
3047 void
3048 kdp_lck_mtx_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
3049 {
3050 lck_mtx_t * mutex = LCK_EVENT_TO_MUTEX(event);
3051 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
3052 uintptr_t state = ordered_load_mtx(mutex);
3053 thread_t holder = LCK_MTX_STATE_TO_THREAD(state);
3054 if ((uintptr_t)holder == (uintptr_t)LCK_MTX_SPIN_TAG) {
3055 waitinfo->owner = STACKSHOT_WAITOWNER_MTXSPIN;
3056 } else {
3057 assertf(state != (uintptr_t)LCK_MTX_TAG_DESTROYED, "state=0x%llx", (uint64_t)state);
3058 assertf(state != (uintptr_t)LCK_MTX_TAG_INDIRECT, "state=0x%llx", (uint64_t)state);
3059 waitinfo->owner = thread_tid(holder);
3060 }
3061 }
3062
3063 void
3064 kdp_rwlck_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
3065 {
3066 lck_rw_t *rwlck = NULL;
3067 switch (waitinfo->wait_type) {
3068 case kThreadWaitKernelRWLockRead:
3069 rwlck = READ_EVENT_TO_RWLOCK(event);
3070 break;
3071 case kThreadWaitKernelRWLockWrite:
3072 case kThreadWaitKernelRWLockUpgrade:
3073 rwlck = WRITE_EVENT_TO_RWLOCK(event);
3074 break;
3075 default:
3076 panic("%s was called with an invalid blocking type", __FUNCTION__);
3077 break;
3078 }
3079 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(rwlck);
3080 waitinfo->owner = thread_tid(rwlck->lck_rw_owner);
3081 }